From 8c67b7a914cca1a5a8e1446f1b684df928ebf963 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:07 +0100 Subject: [PATCH 001/167] ice: remove eswitch changing queues algorithm Changing queues used by eswitch will be done through PF netdev. There is no need to reserve queues if the number of used queues is known. Reviewed-by: Wojciech Drewek Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 6 ---- drivers/net/ethernet/intel/ice/ice_eswitch.c | 34 -------------------- drivers/net/ethernet/intel/ice/ice_eswitch.h | 4 --- drivers/net/ethernet/intel/ice/ice_sriov.c | 3 -- 4 files changed, 47 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 365c03d1c4622..9bb435b4338ff 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -527,12 +527,6 @@ struct ice_eswitch { struct ice_esw_br_offloads *br_offloads; struct xarray reprs; bool is_running; - /* struct to allow cp queues management optimization */ - struct { - int to_reach; - int value; - bool is_reaching; - } qs; }; struct ice_agg_node { diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 9069725c71b4a..2e999f801c0a1 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -455,8 +455,6 @@ static int ice_eswitch_enable_switchdev(struct ice_pf *pf) return -ENODEV; ctrl_vsi = pf->eswitch.control_vsi; - /* cp VSI is createad with 1 queue as default */ - pf->eswitch.qs.value = 1; pf->eswitch.uplink_vsi = uplink_vsi; if (ice_eswitch_setup_env(pf)) @@ -489,7 +487,6 @@ static void ice_eswitch_disable_switchdev(struct ice_pf *pf) ice_vsi_release(ctrl_vsi); pf->eswitch.is_running = false; - pf->eswitch.qs.is_reaching = false; } /** @@ -620,18 +617,6 @@ ice_eswitch_cp_change_queues(struct ice_eswitch *eswitch, int change) struct ice_vsi *cp = eswitch->control_vsi; int queues = 0; - if (eswitch->qs.is_reaching) { - if (eswitch->qs.to_reach >= eswitch->qs.value + change) { - queues = eswitch->qs.to_reach; - eswitch->qs.is_reaching = false; - } else { - queues = 0; - } - } else if ((change > 0 && cp->alloc_txq <= eswitch->qs.value) || - change < 0) { - queues = cp->alloc_txq + change; - } - if (queues) { cp->req_txq = queues; cp->req_rxq = queues; @@ -643,7 +628,6 @@ ice_eswitch_cp_change_queues(struct ice_eswitch *eswitch, int change) ice_vsi_open(cp); } - eswitch->qs.value += change; ice_eswitch_remap_rings_to_vectors(eswitch); } @@ -661,8 +645,6 @@ ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf) err = ice_eswitch_enable_switchdev(pf); if (err) return err; - /* Control plane VSI is created with 1 queue as default */ - pf->eswitch.qs.to_reach -= 1; change = 0; } @@ -756,19 +738,3 @@ int ice_eswitch_rebuild(struct ice_pf *pf) return 0; } - -/** - * ice_eswitch_reserve_cp_queues - reserve control plane VSI queues - * @pf: pointer to PF structure - * @change: how many more (or less) queues is needed - * - * Remember to call ice_eswitch_attach/detach() the "change" times. - */ -void ice_eswitch_reserve_cp_queues(struct ice_pf *pf, int change) -{ - if (pf->eswitch.qs.value + change < 0) - return; - - pf->eswitch.qs.to_reach = pf->eswitch.qs.value + change; - pf->eswitch.qs.is_reaching = true; -} diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index 1a288a03a79a1..59d51c0d14e5a 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -26,7 +26,6 @@ void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off); netdev_tx_t ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev); -void ice_eswitch_reserve_cp_queues(struct ice_pf *pf, int change); #else /* CONFIG_ICE_SWITCHDEV */ static inline void ice_eswitch_detach(struct ice_pf *pf, struct ice_vf *vf) { } @@ -77,8 +76,5 @@ ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) { return NETDEV_TX_BUSY; } - -static inline void -ice_eswitch_reserve_cp_queues(struct ice_pf *pf, int change) { } #endif /* CONFIG_ICE_SWITCHDEV */ #endif /* _ICE_ESWITCH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index a958fcf3e6bef..65e1986af7772 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -170,8 +170,6 @@ void ice_free_vfs(struct ice_pf *pf) else dev_warn(dev, "VFs are assigned - not disabling SR-IOV\n"); - ice_eswitch_reserve_cp_queues(pf, -ice_get_num_vfs(pf)); - mutex_lock(&vfs->table_lock); ice_for_each_vf(pf, bkt, vf) { @@ -897,7 +895,6 @@ static int ice_ena_vfs(struct ice_pf *pf, u16 num_vfs) goto err_unroll_sriov; } - ice_eswitch_reserve_cp_queues(pf, num_vfs); ret = ice_start_vfs(pf); if (ret) { dev_err(dev, "Failed to start %d VFs, err %d\n", num_vfs, ret); From defd52455aee4a0922e45155d6a348f02a99b775 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:08 +0100 Subject: [PATCH 002/167] ice: do Tx through PF netdev in slow-path Tx can be done using PF netdev. Checks before Tx are unnecessary. Checking if switchdev mode is set seems too defensive (there is no PR netdev in legacy mode). If corresponding VF is disabled or during reset, PR netdev also should be down. Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 26 +++++--------------- drivers/net/ethernet/intel/ice/ice_repr.c | 12 --------- drivers/net/ethernet/intel/ice/ice_repr.h | 2 -- 3 files changed, 6 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 2e999f801c0a1..8ad271534d807 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -236,7 +236,7 @@ ice_eswitch_release_repr(struct ice_pf *pf, struct ice_repr *repr) */ static int ice_eswitch_setup_repr(struct ice_pf *pf, struct ice_repr *repr) { - struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; + struct ice_vsi *uplink_vsi = pf->eswitch.uplink_vsi; struct ice_vsi *vsi = repr->src_vsi; struct metadata_dst *dst; @@ -255,12 +255,11 @@ static int ice_eswitch_setup_repr(struct ice_pf *pf, struct ice_repr *repr) netif_napi_add(repr->netdev, &repr->q_vector->napi, ice_napi_poll); - netif_keep_dst(repr->netdev); + netif_keep_dst(uplink_vsi->netdev); dst = repr->dst; dst->u.port_info.port_id = vsi->vsi_num; - dst->u.port_info.lower_dev = repr->netdev; - ice_repr_set_traffic_vsi(repr, ctrl_vsi); + dst->u.port_info.lower_dev = uplink_vsi->netdev; return 0; @@ -318,27 +317,14 @@ void ice_eswitch_update_repr(unsigned long repr_id, struct ice_vsi *vsi) netdev_tx_t ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) { - struct ice_netdev_priv *np; - struct ice_repr *repr; - struct ice_vsi *vsi; - - np = netdev_priv(netdev); - vsi = np->vsi; - - if (!vsi || !ice_is_switchdev_running(vsi->back)) - return NETDEV_TX_BUSY; - - if (ice_is_reset_in_progress(vsi->back->state) || - test_bit(ICE_VF_DIS, vsi->back->state)) - return NETDEV_TX_BUSY; + struct ice_repr *repr = ice_netdev_to_repr(netdev); - repr = ice_netdev_to_repr(netdev); skb_dst_drop(skb); dst_hold((struct dst_entry *)repr->dst); skb_dst_set(skb, (struct dst_entry *)repr->dst); - skb->queue_mapping = repr->q_id; + skb->dev = repr->dst->u.port_info.lower_dev; - return ice_start_xmit(skb, netdev); + return dev_queue_xmit(skb); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index 5f30fb131f74e..6191bee6c536e 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -439,15 +439,3 @@ void ice_repr_stop_tx_queues(struct ice_repr *repr) netif_carrier_off(repr->netdev); netif_tx_stop_all_queues(repr->netdev); } - -/** - * ice_repr_set_traffic_vsi - set traffic VSI for port representor - * @repr: repr on with VSI will be set - * @vsi: pointer to VSI that will be used by port representor to pass traffic - */ -void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi) -{ - struct ice_netdev_priv *np = netdev_priv(repr->netdev); - - np->vsi = vsi; -} diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h index f9aede3157161..b107e058d5380 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.h +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -28,8 +28,6 @@ void ice_repr_rem_vf(struct ice_repr *repr); void ice_repr_start_tx_queues(struct ice_repr *repr); void ice_repr_stop_tx_queues(struct ice_repr *repr); -void ice_repr_set_traffic_vsi(struct ice_repr *repr, struct ice_vsi *vsi); - struct ice_repr *ice_netdev_to_repr(struct net_device *netdev); bool ice_is_port_repr_netdev(const struct net_device *netdev); From 50d62022f45580e2fc9b62fca486e6d0ea287c40 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:09 +0100 Subject: [PATCH 003/167] ice: default Tx rule instead of to queue Steer all packets that miss other rules to PF VSI. Previously in switchdev mode, PF VSI received missed packets, but only ones marked as Rx. Now it is receiving all missed packets. To queue rule per PR isn't needed, because we use PF VSI instead of control VSI now, and it's already correctly configured. Add flag to correctly set LAN_EN bit in default Tx rule. It shouldn't allow packet to go outside when there is a match. Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 107 +++---------------- drivers/net/ethernet/intel/ice/ice_repr.h | 4 - drivers/net/ethernet/intel/ice/ice_switch.c | 4 + drivers/net/ethernet/intel/ice/ice_switch.h | 5 +- 4 files changed, 23 insertions(+), 97 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 8ad271534d807..50b3de7008375 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -10,85 +10,6 @@ #include "ice_devlink.h" #include "ice_tc_lib.h" -/** - * ice_eswitch_del_sp_rules - delete adv rules added on PRs - * @pf: pointer to the PF struct - * - * Delete all advanced rules that were used to forward packets with the - * device's VSI index to the corresponding eswitch ctrl VSI queue. - */ -static void ice_eswitch_del_sp_rules(struct ice_pf *pf) -{ - struct ice_repr *repr; - unsigned long id; - - xa_for_each(&pf->eswitch.reprs, id, repr) { - if (repr->sp_rule.rid) - ice_rem_adv_rule_by_id(&pf->hw, &repr->sp_rule); - } -} - -/** - * ice_eswitch_add_sp_rule - add adv rule with device's VSI index - * @pf: pointer to PF struct - * @repr: pointer to the repr struct - * - * This function adds advanced rule that forwards packets with - * device's VSI index to the corresponding eswitch ctrl VSI queue. - */ -static int ice_eswitch_add_sp_rule(struct ice_pf *pf, struct ice_repr *repr) -{ - struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; - struct ice_adv_rule_info rule_info = { 0 }; - struct ice_adv_lkup_elem *list; - struct ice_hw *hw = &pf->hw; - const u16 lkups_cnt = 1; - int err; - - list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); - if (!list) - return -ENOMEM; - - ice_rule_add_src_vsi_metadata(list); - - rule_info.sw_act.flag = ICE_FLTR_TX; - rule_info.sw_act.vsi_handle = ctrl_vsi->idx; - rule_info.sw_act.fltr_act = ICE_FWD_TO_Q; - rule_info.sw_act.fwd_id.q_id = hw->func_caps.common_cap.rxq_first_id + - ctrl_vsi->rxq_map[repr->q_id]; - rule_info.flags_info.act |= ICE_SINGLE_ACT_LB_ENABLE; - rule_info.flags_info.act_valid = true; - rule_info.tun_type = ICE_SW_TUN_AND_NON_TUN; - rule_info.src_vsi = repr->src_vsi->idx; - - err = ice_add_adv_rule(hw, list, lkups_cnt, &rule_info, - &repr->sp_rule); - if (err) - dev_err(ice_pf_to_dev(pf), "Unable to add slow-path rule for eswitch for PR %d", - repr->id); - - kfree(list); - return err; -} - -static int -ice_eswitch_add_sp_rules(struct ice_pf *pf) -{ - struct ice_repr *repr; - unsigned long id; - int err; - - xa_for_each(&pf->eswitch.reprs, id, repr) { - err = ice_eswitch_add_sp_rule(pf, repr); - if (err) { - ice_eswitch_del_sp_rules(pf); - return err; - } - } - - return 0; -} - /** * ice_eswitch_setup_env - configure eswitch HW filters * @pf: pointer to PF struct @@ -102,7 +23,6 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; struct net_device *netdev = uplink_vsi->netdev; struct ice_vsi_vlan_ops *vlan_ops; - bool rule_added = false; ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); @@ -112,17 +32,19 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) netif_addr_unlock_bh(netdev); if (ice_vsi_add_vlan_zero(uplink_vsi)) + goto err_vlan_zero; + + if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, + ICE_FLTR_RX)) goto err_def_rx; - if (!ice_is_dflt_vsi_in_use(uplink_vsi->port_info)) { - if (ice_set_dflt_vsi(uplink_vsi)) - goto err_def_rx; - rule_added = true; - } + if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, + ICE_FLTR_TX)) + goto err_def_tx; vlan_ops = ice_get_compat_vsi_vlan_ops(uplink_vsi); if (vlan_ops->dis_rx_filtering(uplink_vsi)) - goto err_dis_rx; + goto err_vlan_filtering; if (ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_set_allow_override)) goto err_override_uplink; @@ -141,10 +63,15 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); err_override_uplink: vlan_ops->ena_rx_filtering(uplink_vsi); -err_dis_rx: - if (rule_added) - ice_clear_dflt_vsi(uplink_vsi); +err_vlan_filtering: + ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false, + ICE_FLTR_TX); +err_def_tx: + ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, false, + ICE_FLTR_RX); err_def_rx: + ice_vsi_del_vlan_zero(uplink_vsi); +err_vlan_zero: ice_fltr_add_mac_and_broadcast(uplink_vsi, uplink_vsi->port_info->mac.perm_addr, ICE_FWD_TO_VSI); @@ -585,7 +512,6 @@ void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) static void ice_eswitch_stop_reprs(struct ice_pf *pf) { - ice_eswitch_del_sp_rules(pf); ice_eswitch_stop_all_tx_queues(pf); ice_eswitch_napi_disable(&pf->eswitch.reprs); } @@ -594,7 +520,6 @@ static void ice_eswitch_start_reprs(struct ice_pf *pf) { ice_eswitch_napi_enable(&pf->eswitch.reprs); ice_eswitch_start_all_tx_queues(pf); - ice_eswitch_add_sp_rules(pf); } static void diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h index b107e058d5380..34823f58cd231 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.h +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -16,10 +16,6 @@ struct ice_repr { int q_id; u32 id; u8 parent_mac[ETH_ALEN]; -#ifdef CONFIG_ICE_SWITCHDEV - /* info about slow path rule */ - struct ice_rule_query_data sp_rule; -#endif }; struct ice_repr *ice_repr_add_vf(struct ice_vf *vf); diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index f84bab80ca423..ce0b344dda996 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -2444,6 +2444,9 @@ static void ice_fill_sw_info(struct ice_hw *hw, struct ice_fltr_info *fi) fi->lan_en = true; } } + + if (fi->flag & ICE_FLTR_TX_ONLY) + fi->lan_en = false; } /** @@ -3819,6 +3822,7 @@ ice_cfg_dflt_vsi(struct ice_port_info *pi, u16 vsi_handle, bool set, } else if (f_info.flag & ICE_FLTR_TX) { f_info.src_id = ICE_SRC_ID_VSI; f_info.src = hw_vsi_id; + f_info.flag |= ICE_FLTR_TX_ONLY; } f_list_entry.fltr_info = f_info; diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h index db7e501b7e0a4..1b7dae9ce8d6f 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.h +++ b/drivers/net/ethernet/intel/ice/ice_switch.h @@ -8,8 +8,9 @@ #define ICE_SW_CFG_MAX_BUF_LEN 2048 #define ICE_DFLT_VSI_INVAL 0xff -#define ICE_FLTR_RX BIT(0) -#define ICE_FLTR_TX BIT(1) +#define ICE_FLTR_RX BIT(0) +#define ICE_FLTR_TX BIT(1) +#define ICE_FLTR_TX_ONLY BIT(2) #define ICE_VSI_INVAL_ID 0xffff #define ICE_INVAL_Q_HANDLE 0xFFFF From 9cba6e1767bf8286563ae8907d6e595c365d3e92 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:10 +0100 Subject: [PATCH 004/167] ice: control default Tx rule in lag Tx rule in switchdev was changed to use PF instead of additional control plane VSI. Because of that during lag we should control it. Control means to add and remove the default Tx rule during lag active/inactive switching. It can be done the same way as default Rx rule. Reviewed-by: Wojciech Drewek Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lag.c | 44 +++++++++++++++++++----- drivers/net/ethernet/intel/ice/ice_lag.h | 3 +- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 467372d541d21..7b802884440ce 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -202,11 +202,12 @@ static struct ice_lag *ice_lag_find_primary(struct ice_lag *lag) * @act: rule action * @recipe_id: recipe id for the new rule * @rule_idx: pointer to rule index + * @direction: ICE_FLTR_RX or ICE_FLTR_TX * @add: boolean on whether we are adding filters */ static int ice_lag_cfg_fltr(struct ice_lag *lag, u32 act, u16 recipe_id, u16 *rule_idx, - bool add) + u8 direction, bool add) { struct ice_sw_rule_lkup_rx_tx *s_rule; u16 s_rule_sz, vsi_num; @@ -231,9 +232,16 @@ ice_lag_cfg_fltr(struct ice_lag *lag, u32 act, u16 recipe_id, u16 *rule_idx, act |= FIELD_PREP(ICE_SINGLE_ACT_VSI_ID_M, vsi_num); - s_rule->hdr.type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); s_rule->recipe_id = cpu_to_le16(recipe_id); - s_rule->src = cpu_to_le16(hw->port_info->lport); + if (direction == ICE_FLTR_RX) { + s_rule->hdr.type = + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); + s_rule->src = cpu_to_le16(hw->port_info->lport); + } else { + s_rule->hdr.type = + cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_TX); + s_rule->src = cpu_to_le16(vsi_num); + } s_rule->act = cpu_to_le32(act); s_rule->hdr_len = cpu_to_le16(DUMMY_ETH_HDR_LEN); opc = ice_aqc_opc_add_sw_rules; @@ -266,9 +274,27 @@ ice_lag_cfg_dflt_fltr(struct ice_lag *lag, bool add) { u32 act = ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT | ICE_SINGLE_ACT_LAN_ENABLE; + int err; + + err = ice_lag_cfg_fltr(lag, act, lag->pf_recipe, &lag->pf_rx_rule_id, + ICE_FLTR_RX, add); + if (err) + goto err_rx; - return ice_lag_cfg_fltr(lag, act, lag->pf_recipe, - &lag->pf_rule_id, add); + act = ICE_SINGLE_ACT_VSI_FORWARDING | ICE_SINGLE_ACT_VALID_BIT | + ICE_SINGLE_ACT_LB_ENABLE; + err = ice_lag_cfg_fltr(lag, act, lag->pf_recipe, &lag->pf_tx_rule_id, + ICE_FLTR_TX, add); + if (err) + goto err_tx; + + return 0; + +err_tx: + ice_lag_cfg_fltr(lag, act, lag->pf_recipe, &lag->pf_rx_rule_id, + ICE_FLTR_RX, !add); +err_rx: + return err; } /** @@ -284,7 +310,7 @@ ice_lag_cfg_drop_fltr(struct ice_lag *lag, bool add) ICE_SINGLE_ACT_DROP; return ice_lag_cfg_fltr(lag, act, lag->lport_recipe, - &lag->lport_rule_idx, add); + &lag->lport_rule_idx, ICE_FLTR_RX, add); } /** @@ -310,7 +336,7 @@ ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) dev = ice_pf_to_dev(lag->pf); /* interface not active - remove old default VSI rule */ - if (bonding_info->slave.state && lag->pf_rule_id) { + if (bonding_info->slave.state && lag->pf_rx_rule_id) { if (ice_lag_cfg_dflt_fltr(lag, false)) dev_err(dev, "Error removing old default VSI filter\n"); if (ice_lag_cfg_drop_fltr(lag, true)) @@ -319,7 +345,7 @@ ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) } /* interface becoming active - add new default VSI rule */ - if (!bonding_info->slave.state && !lag->pf_rule_id) { + if (!bonding_info->slave.state && !lag->pf_rx_rule_id) { if (ice_lag_cfg_dflt_fltr(lag, true)) dev_err(dev, "Error adding new default VSI filter\n"); if (lag->lport_rule_idx && ice_lag_cfg_drop_fltr(lag, false)) @@ -2149,7 +2175,7 @@ void ice_lag_rebuild(struct ice_pf *pf) ice_lag_cfg_cp_fltr(lag, true); - if (lag->pf_rule_id) + if (lag->pf_rx_rule_id) if (ice_lag_cfg_dflt_fltr(lag, true)) dev_err(ice_pf_to_dev(pf), "Error adding default VSI rule in rebuild\n"); diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h index 183b38792ef22..bab2c83142a1a 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.h +++ b/drivers/net/ethernet/intel/ice/ice_lag.h @@ -43,7 +43,8 @@ struct ice_lag { u8 primary:1; /* this is primary */ u16 pf_recipe; u16 lport_recipe; - u16 pf_rule_id; + u16 pf_rx_rule_id; + u16 pf_tx_rule_id; u16 cp_rule_idx; u16 lport_rule_idx; u8 role; From 33bf1e86231dbd62f06f0ca3cdf5995eb7d077d5 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:11 +0100 Subject: [PATCH 005/167] ice: remove switchdev control plane VSI For slow-path Rx and Tx PF VSI is used. There is no need to have control plane VSI. Remove all code related to it. Eswitch rebuild can't fail without rebuilding control plane VSI. Return void from ice_eswitch_rebuild(). Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 - drivers/net/ethernet/intel/ice/ice_base.c | 36 +--- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 4 +- drivers/net/ethernet/intel/ice/ice_eswitch.c | 163 +----------------- drivers/net/ethernet/intel/ice/ice_eswitch.h | 2 +- drivers/net/ethernet/intel/ice/ice_lag.c | 9 +- drivers/net/ethernet/intel/ice/ice_lib.c | 49 +----- drivers/net/ethernet/intel/ice/ice_main.c | 10 +- drivers/net/ethernet/intel/ice/ice_repr.c | 12 -- drivers/net/ethernet/intel/ice/ice_repr.h | 2 - drivers/net/ethernet/intel/ice/ice_type.h | 1 - .../net/ethernet/intel/ice/ice_vsi_vlan_ops.c | 1 - 12 files changed, 13 insertions(+), 277 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 9bb435b4338ff..c4127d5f2be32 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -522,7 +522,6 @@ enum ice_misc_thread_tasks { }; struct ice_eswitch { - struct ice_vsi *control_vsi; struct ice_vsi *uplink_vsi; struct ice_esw_br_offloads *br_offloads; struct xarray reprs; diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index d2fd315556a39..8ffae7fe894f6 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -263,30 +263,6 @@ static u16 ice_calc_txq_handle(struct ice_vsi *vsi, struct ice_tx_ring *ring, u8 return ring->q_index - vsi->tc_cfg.tc_info[tc].qoffset; } -/** - * ice_eswitch_calc_txq_handle - * @ring: pointer to ring which unique index is needed - * - * To correctly work with many netdevs ring->q_index of Tx rings on switchdev - * VSI can repeat. Hardware ring setup requires unique q_index. Calculate it - * here by finding index in vsi->tx_rings of this ring. - * - * Return ICE_INVAL_Q_INDEX when index wasn't found. Should never happen, - * because VSI is get from ring->vsi, so it has to be present in this VSI. - */ -static u16 ice_eswitch_calc_txq_handle(struct ice_tx_ring *ring) -{ - const struct ice_vsi *vsi = ring->vsi; - int i; - - ice_for_each_txq(vsi, i) { - if (vsi->tx_rings[i] == ring) - return i; - } - - return ICE_INVAL_Q_INDEX; -} - /** * ice_cfg_xps_tx_ring - Configure XPS for a Tx ring * @ring: The Tx ring to configure @@ -353,9 +329,6 @@ ice_setup_tx_ctx(struct ice_tx_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf tlan_ctx->vmvf_num = hw->func_caps.vf_base_id + vsi->vf->vf_id; tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VF; break; - case ICE_VSI_SWITCHDEV_CTRL: - tlan_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_VMQ; - break; default: return; } @@ -919,14 +892,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring, /* Add unique software queue handle of the Tx queue per * TC into the VSI Tx ring */ - if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { - ring->q_handle = ice_eswitch_calc_txq_handle(ring); - - if (ring->q_handle == ICE_INVAL_Q_INDEX) - return -ENODEV; - } else { - ring->q_handle = ice_calc_txq_handle(vsi, ring, tc); - } + ring->q_handle = ice_calc_txq_handle(vsi, ring, tc); if (ch) status = ice_ena_vsi_txq(vsi->port_info, ch->ch_vsi->idx, 0, diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 6e20ee6100224..ceb17c004d794 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -291,7 +291,6 @@ static void ice_dcb_ena_dis_vsi(struct ice_pf *pf, bool ena, bool locked) switch (vsi->type) { case ICE_VSI_CHNL: - case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_PF: if (ena) ice_ena_vsi(vsi, locked); @@ -776,8 +775,7 @@ void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) /* no need to proceed with remaining cfg if it is CHNL * or switchdev VSI */ - if (vsi->type == ICE_VSI_CHNL || - vsi->type == ICE_VSI_SWITCHDEV_CTRL) + if (vsi->type == ICE_VSI_CHNL) continue; ice_vsi_map_rings_to_vectors(vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 50b3de7008375..76fa114f22c90 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -20,7 +20,6 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) { struct ice_vsi *uplink_vsi = pf->eswitch.uplink_vsi; - struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; struct net_device *netdev = uplink_vsi->netdev; struct ice_vsi_vlan_ops *vlan_ops; @@ -49,17 +48,12 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) if (ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_set_allow_override)) goto err_override_uplink; - if (ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_set_allow_override)) - goto err_override_control; - if (ice_vsi_update_local_lb(uplink_vsi, true)) goto err_override_local_lb; return 0; err_override_local_lb: - ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); -err_override_control: ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); err_override_uplink: vlan_ops->ena_rx_filtering(uplink_vsi); @@ -78,61 +72,6 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) return -ENODEV; } -/** - * ice_eswitch_remap_rings_to_vectors - reconfigure rings of eswitch ctrl VSI - * @eswitch: pointer to eswitch struct - * - * In eswitch number of allocated Tx/Rx rings is equal. - * - * This function fills q_vectors structures associated with representor and - * move each ring pairs to port representor netdevs. Each port representor - * will have dedicated 1 Tx/Rx ring pair, so number of rings pair is equal to - * number of VFs. - */ -static void ice_eswitch_remap_rings_to_vectors(struct ice_eswitch *eswitch) -{ - struct ice_vsi *vsi = eswitch->control_vsi; - unsigned long repr_id = 0; - int q_id; - - ice_for_each_txq(vsi, q_id) { - struct ice_q_vector *q_vector; - struct ice_tx_ring *tx_ring; - struct ice_rx_ring *rx_ring; - struct ice_repr *repr; - - repr = xa_find(&eswitch->reprs, &repr_id, U32_MAX, - XA_PRESENT); - if (!repr) - break; - - repr_id += 1; - repr->q_id = q_id; - q_vector = repr->q_vector; - tx_ring = vsi->tx_rings[q_id]; - rx_ring = vsi->rx_rings[q_id]; - - q_vector->vsi = vsi; - q_vector->reg_idx = vsi->q_vectors[0]->reg_idx; - - q_vector->num_ring_tx = 1; - q_vector->tx.tx_ring = tx_ring; - tx_ring->q_vector = q_vector; - tx_ring->next = NULL; - tx_ring->netdev = repr->netdev; - /* In switchdev mode, from OS stack perspective, there is only - * one queue for given netdev, so it needs to be indexed as 0. - */ - tx_ring->q_index = 0; - - q_vector->num_ring_rx = 1; - q_vector->rx.rx_ring = rx_ring; - rx_ring->q_vector = q_vector; - rx_ring->next = NULL; - rx_ring->netdev = repr->netdev; - } -} - /** * ice_eswitch_release_repr - clear PR VSI configuration * @pf: poiner to PF struct @@ -152,8 +91,6 @@ ice_eswitch_release_repr(struct ice_pf *pf, struct ice_repr *repr) repr->dst = NULL; ice_fltr_add_mac_and_broadcast(vsi, repr->parent_mac, ICE_FWD_TO_VSI); - - netif_napi_del(&repr->q_vector->napi); } /** @@ -179,9 +116,6 @@ static int ice_eswitch_setup_repr(struct ice_pf *pf, struct ice_repr *repr) if (ice_vsi_add_vlan_zero(vsi)) goto err_update_security; - netif_napi_add(repr->netdev, &repr->q_vector->napi, - ice_napi_poll); - netif_keep_dst(uplink_vsi->netdev); dst = repr->dst; @@ -287,13 +221,11 @@ ice_eswitch_set_target_vsi(struct sk_buff *skb, static void ice_eswitch_release_env(struct ice_pf *pf) { struct ice_vsi *uplink_vsi = pf->eswitch.uplink_vsi; - struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; struct ice_vsi_vlan_ops *vlan_ops; vlan_ops = ice_get_compat_vsi_vlan_ops(uplink_vsi); ice_vsi_update_local_lb(uplink_vsi, false); - ice_vsi_update_security(ctrl_vsi, ice_vsi_ctx_clear_allow_override); ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); vlan_ops->ena_rx_filtering(uplink_vsi); ice_clear_dflt_vsi(uplink_vsi); @@ -302,56 +234,13 @@ static void ice_eswitch_release_env(struct ice_pf *pf) ICE_FWD_TO_VSI); } -/** - * ice_eswitch_vsi_setup - configure eswitch control VSI - * @pf: pointer to PF structure - * @pi: pointer to port_info structure - */ -static struct ice_vsi * -ice_eswitch_vsi_setup(struct ice_pf *pf, struct ice_port_info *pi) -{ - struct ice_vsi_cfg_params params = {}; - - params.type = ICE_VSI_SWITCHDEV_CTRL; - params.pi = pi; - params.flags = ICE_VSI_FLAG_INIT; - - return ice_vsi_setup(pf, ¶ms); -} - -/** - * ice_eswitch_napi_enable - enable NAPI for all port representors - * @reprs: xarray of reprs - */ -static void ice_eswitch_napi_enable(struct xarray *reprs) -{ - struct ice_repr *repr; - unsigned long id; - - xa_for_each(reprs, id, repr) - napi_enable(&repr->q_vector->napi); -} - -/** - * ice_eswitch_napi_disable - disable NAPI for all port representors - * @reprs: xarray of reprs - */ -static void ice_eswitch_napi_disable(struct xarray *reprs) -{ - struct ice_repr *repr; - unsigned long id; - - xa_for_each(reprs, id, repr) - napi_disable(&repr->q_vector->napi); -} - /** * ice_eswitch_enable_switchdev - configure eswitch in switchdev mode * @pf: pointer to PF structure */ static int ice_eswitch_enable_switchdev(struct ice_pf *pf) { - struct ice_vsi *ctrl_vsi, *uplink_vsi; + struct ice_vsi *uplink_vsi; uplink_vsi = ice_get_main_vsi(pf); if (!uplink_vsi) @@ -363,15 +252,10 @@ static int ice_eswitch_enable_switchdev(struct ice_pf *pf) return -EINVAL; } - pf->eswitch.control_vsi = ice_eswitch_vsi_setup(pf, pf->hw.port_info); - if (!pf->eswitch.control_vsi) - return -ENODEV; - - ctrl_vsi = pf->eswitch.control_vsi; pf->eswitch.uplink_vsi = uplink_vsi; if (ice_eswitch_setup_env(pf)) - goto err_vsi; + return -ENODEV; if (ice_eswitch_br_offloads_init(pf)) goto err_br_offloads; @@ -382,8 +266,6 @@ static int ice_eswitch_enable_switchdev(struct ice_pf *pf) err_br_offloads: ice_eswitch_release_env(pf); -err_vsi: - ice_vsi_release(ctrl_vsi); return -ENODEV; } @@ -393,11 +275,8 @@ static int ice_eswitch_enable_switchdev(struct ice_pf *pf) */ static void ice_eswitch_disable_switchdev(struct ice_pf *pf) { - struct ice_vsi *ctrl_vsi = pf->eswitch.control_vsi; - ice_eswitch_br_offloads_deinit(pf); ice_eswitch_release_env(pf); - ice_vsi_release(ctrl_vsi); pf->eswitch.is_running = false; } @@ -513,40 +392,17 @@ void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf) static void ice_eswitch_stop_reprs(struct ice_pf *pf) { ice_eswitch_stop_all_tx_queues(pf); - ice_eswitch_napi_disable(&pf->eswitch.reprs); } static void ice_eswitch_start_reprs(struct ice_pf *pf) { - ice_eswitch_napi_enable(&pf->eswitch.reprs); ice_eswitch_start_all_tx_queues(pf); } -static void -ice_eswitch_cp_change_queues(struct ice_eswitch *eswitch, int change) -{ - struct ice_vsi *cp = eswitch->control_vsi; - int queues = 0; - - if (queues) { - cp->req_txq = queues; - cp->req_rxq = queues; - ice_vsi_close(cp); - ice_vsi_rebuild(cp, ICE_VSI_FLAG_NO_INIT); - ice_vsi_open(cp); - } else if (!change) { - /* change == 0 means that VSI wasn't open, open it here */ - ice_vsi_open(cp); - } - - ice_eswitch_remap_rings_to_vectors(eswitch); -} - int ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf) { struct ice_repr *repr; - int change = 1; int err; if (pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) @@ -556,7 +412,6 @@ ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf) err = ice_eswitch_enable_switchdev(pf); if (err) return err; - change = 0; } ice_eswitch_stop_reprs(pf); @@ -578,7 +433,6 @@ ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf) vf->repr_id = repr->id; - ice_eswitch_cp_change_queues(&pf->eswitch, change); ice_eswitch_start_reprs(pf); return 0; @@ -608,8 +462,6 @@ void ice_eswitch_detach(struct ice_pf *pf, struct ice_vf *vf) if (xa_empty(&pf->eswitch.reprs)) ice_eswitch_disable_switchdev(pf); - else - ice_eswitch_cp_change_queues(&pf->eswitch, -1); ice_eswitch_release_repr(pf, repr); ice_repr_rem_vf(repr); @@ -631,21 +483,14 @@ void ice_eswitch_detach(struct ice_pf *pf, struct ice_vf *vf) * ice_eswitch_rebuild - rebuild eswitch * @pf: pointer to PF structure */ -int ice_eswitch_rebuild(struct ice_pf *pf) +void ice_eswitch_rebuild(struct ice_pf *pf) { struct ice_repr *repr; unsigned long id; - int err; if (!ice_is_switchdev_running(pf)) - return 0; - - err = ice_vsi_rebuild(pf->eswitch.control_vsi, ICE_VSI_FLAG_INIT); - if (err) - return err; + return; xa_for_each(&pf->eswitch.reprs, id, repr) ice_eswitch_detach(pf, repr->vf); - - return 0; } diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index 59d51c0d14e5a..baad685074714 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -10,7 +10,7 @@ void ice_eswitch_detach(struct ice_pf *pf, struct ice_vf *vf); int ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf); -int ice_eswitch_rebuild(struct ice_pf *pf); +void ice_eswitch_rebuild(struct ice_pf *pf); int ice_eswitch_mode_get(struct devlink *devlink, u16 *mode); int diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 7b802884440ce..7493eb87d2211 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -740,8 +740,7 @@ static void ice_lag_move_vf_nodes(struct ice_lag *lag, u8 oldport, u8 newport) pf = lag->pf; ice_for_each_vsi(pf, i) - if (pf->vsi[i] && (pf->vsi[i]->type == ICE_VSI_VF || - pf->vsi[i]->type == ICE_VSI_SWITCHDEV_CTRL)) + if (pf->vsi[i] && pf->vsi[i]->type == ICE_VSI_VF) ice_lag_move_single_vf_nodes(lag, oldport, newport, i); } @@ -979,8 +978,7 @@ ice_lag_reclaim_vf_nodes(struct ice_lag *lag, struct ice_hw *src_hw) pf = lag->pf; ice_for_each_vsi(pf, i) - if (pf->vsi[i] && (pf->vsi[i]->type == ICE_VSI_VF || - pf->vsi[i]->type == ICE_VSI_SWITCHDEV_CTRL)) + if (pf->vsi[i] && pf->vsi[i]->type == ICE_VSI_VF) ice_for_each_traffic_class(tc) ice_lag_reclaim_vf_tc(lag, src_hw, i, tc); } @@ -2002,8 +2000,7 @@ ice_lag_move_vf_nodes_sync(struct ice_lag *lag, struct ice_hw *dest_hw) pf = lag->pf; ice_for_each_vsi(pf, i) - if (pf->vsi[i] && (pf->vsi[i]->type == ICE_VSI_VF || - pf->vsi[i]->type == ICE_VSI_SWITCHDEV_CTRL)) + if (pf->vsi[i] && pf->vsi[i]->type == ICE_VSI_VF) ice_for_each_traffic_class(tc) ice_lag_move_vf_nodes_tc_sync(lag, dest_hw, i, tc); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index ee3f0d3e3f6db..24b8f394ec4b0 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -27,8 +27,6 @@ const char *ice_vsi_type_str(enum ice_vsi_type vsi_type) return "ICE_VSI_CHNL"; case ICE_VSI_LB: return "ICE_VSI_LB"; - case ICE_VSI_SWITCHDEV_CTRL: - return "ICE_VSI_SWITCHDEV_CTRL"; default: return "unknown"; } @@ -144,7 +142,6 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) { switch (vsi->type) { case ICE_VSI_PF: - case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_CTRL: case ICE_VSI_LB: /* a user could change the values of num_[tr]x_desc using @@ -211,21 +208,6 @@ static void ice_vsi_set_num_qs(struct ice_vsi *vsi) max_t(int, vsi->alloc_rxq, vsi->alloc_txq)); break; - case ICE_VSI_SWITCHDEV_CTRL: - /* The number of queues for ctrl VSI is equal to number of PRs - * Each ring is associated to the corresponding VF_PR netdev. - * Tx and Rx rings are always equal - */ - if (vsi->req_txq && vsi->req_rxq) { - vsi->alloc_txq = vsi->req_txq; - vsi->alloc_rxq = vsi->req_rxq; - } else { - vsi->alloc_txq = 1; - vsi->alloc_rxq = 1; - } - - vsi->num_q_vectors = 1; - break; case ICE_VSI_VF: if (vf->num_req_qs) vf->num_vf_qs = vf->num_req_qs; @@ -522,22 +504,6 @@ static irqreturn_t ice_msix_clean_rings(int __always_unused irq, void *data) return IRQ_HANDLED; } -static irqreturn_t ice_eswitch_msix_clean_rings(int __always_unused irq, void *data) -{ - struct ice_q_vector *q_vector = (struct ice_q_vector *)data; - struct ice_pf *pf = q_vector->vsi->back; - struct ice_repr *repr; - unsigned long id; - - if (!q_vector->tx.tx_ring && !q_vector->rx.rx_ring) - return IRQ_HANDLED; - - xa_for_each(&pf->eswitch.reprs, id, repr) - napi_schedule(&repr->q_vector->napi); - - return IRQ_HANDLED; -} - /** * ice_vsi_alloc_stat_arrays - Allocate statistics arrays * @vsi: VSI pointer @@ -600,10 +566,6 @@ ice_vsi_alloc_def(struct ice_vsi *vsi, struct ice_channel *ch) } switch (vsi->type) { - case ICE_VSI_SWITCHDEV_CTRL: - /* Setup eswitch MSIX irq handler for VSI */ - vsi->irq_handler = ice_eswitch_msix_clean_rings; - break; case ICE_VSI_PF: /* Setup default MSIX irq handler for VSI */ vsi->irq_handler = ice_msix_clean_rings; @@ -933,11 +895,6 @@ static void ice_vsi_set_rss_params(struct ice_vsi *vsi) max_rss_size); vsi->rss_lut_type = ICE_LUT_PF; break; - case ICE_VSI_SWITCHDEV_CTRL: - vsi->rss_table_size = ICE_LUT_VSI_SIZE; - vsi->rss_size = min_t(u16, num_online_cpus(), max_rss_size); - vsi->rss_lut_type = ICE_LUT_VSI; - break; case ICE_VSI_VF: /* VF VSI will get a small RSS table. * For VSI_LUT, LUT size should be set to 64 bytes. @@ -1263,7 +1220,6 @@ static int ice_vsi_init(struct ice_vsi *vsi, u32 vsi_flags) case ICE_VSI_PF: ctxt->flags = ICE_AQ_VSI_TYPE_PF; break; - case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_CHNL: ctxt->flags = ICE_AQ_VSI_TYPE_VMDQ2; break; @@ -2145,7 +2101,6 @@ static void ice_set_agg_vsi(struct ice_vsi *vsi) case ICE_VSI_CHNL: case ICE_VSI_LB: case ICE_VSI_PF: - case ICE_VSI_SWITCHDEV_CTRL: max_agg_nodes = ICE_MAX_PF_AGG_NODES; agg_node_id_start = ICE_PF_AGG_NODE_ID_START; agg_node_iter = &pf->pf_agg_node[0]; @@ -2317,7 +2272,6 @@ ice_vsi_cfg_def(struct ice_vsi *vsi, struct ice_vsi_cfg_params *params) switch (vsi->type) { case ICE_VSI_CTRL: - case ICE_VSI_SWITCHDEV_CTRL: case ICE_VSI_PF: ret = ice_vsi_alloc_q_vectors(vsi); if (ret) @@ -2750,8 +2704,7 @@ void ice_dis_vsi(struct ice_vsi *vsi, bool locked) } else { ice_vsi_close(vsi); } - } else if (vsi->type == ICE_VSI_CTRL || - vsi->type == ICE_VSI_SWITCHDEV_CTRL) { + } else if (vsi->type == ICE_VSI_CTRL) { ice_vsi_close(vsi); } } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 33a164fa325ac..f2b7d6ca8805e 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7055,13 +7055,11 @@ int ice_down(struct ice_vsi *vsi) WARN_ON(!test_bit(ICE_VSI_DOWN, vsi->state)); - if (vsi->netdev && vsi->type == ICE_VSI_PF) { + if (vsi->netdev) { vlan_err = ice_vsi_del_vlan_zero(vsi); ice_ptp_link_change(vsi->back, vsi->back->hw.pf_id, false); netif_carrier_off(vsi->netdev); netif_tx_disable(vsi->netdev); - } else if (vsi->type == ICE_VSI_SWITCHDEV_CTRL) { - ice_eswitch_stop_all_tx_queues(vsi->back); } ice_vsi_dis_irq(vsi); @@ -7544,11 +7542,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) goto err_vsi_rebuild; } - err = ice_eswitch_rebuild(pf); - if (err) { - dev_err(dev, "Switchdev rebuild failed: %d\n", err); - goto err_vsi_rebuild; - } + ice_eswitch_rebuild(pf); if (reset_type == ICE_RESET_PFR) { err = ice_rebuild_channels(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index 6191bee6c536e..a5c24da31b887 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -291,7 +291,6 @@ static void ice_repr_remove_node(struct devlink_port *devlink_port) */ static void ice_repr_rem(struct ice_repr *repr) { - kfree(repr->q_vector); free_netdev(repr->netdev); kfree(repr); } @@ -331,7 +330,6 @@ static void ice_repr_set_tx_topology(struct ice_pf *pf) static struct ice_repr * ice_repr_add(struct ice_pf *pf, struct ice_vsi *src_vsi, const u8 *parent_mac) { - struct ice_q_vector *q_vector; struct ice_netdev_priv *np; struct ice_repr *repr; int err; @@ -350,20 +348,10 @@ ice_repr_add(struct ice_pf *pf, struct ice_vsi *src_vsi, const u8 *parent_mac) np = netdev_priv(repr->netdev); np->repr = repr; - q_vector = kzalloc(sizeof(*q_vector), GFP_KERNEL); - if (!q_vector) { - err = -ENOMEM; - goto err_alloc_q_vector; - } - repr->q_vector = q_vector; - repr->q_id = repr->id; - ether_addr_copy(repr->parent_mac, parent_mac); return repr; -err_alloc_q_vector: - free_netdev(repr->netdev); err_alloc: kfree(repr); return ERR_PTR(err); diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h index 34823f58cd231..eb8dec1f7de43 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.h +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -9,11 +9,9 @@ struct ice_repr { struct ice_vsi *src_vsi; struct ice_vf *vf; - struct ice_q_vector *q_vector; struct net_device *netdev; struct metadata_dst *dst; struct ice_esw_br_port *br_port; - int q_id; u32 id; u8 parent_mac[ETH_ALEN]; }; diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 9ff92dba58236..64b8e7ec0b63e 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -150,7 +150,6 @@ enum ice_vsi_type { ICE_VSI_CTRL = 3, /* equates to ICE_VSI_PF with 1 queue pair */ ICE_VSI_CHNL = 4, ICE_VSI_LB = 6, - ICE_VSI_SWITCHDEV_CTRL = 7, }; struct ice_link_status { diff --git a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c index 4a6c850d83ac9..7aae7fdcfcdb9 100644 --- a/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c +++ b/drivers/net/ethernet/intel/ice/ice_vsi_vlan_ops.c @@ -72,7 +72,6 @@ void ice_vsi_init_vlan_ops(struct ice_vsi *vsi) switch (vsi->type) { case ICE_VSI_PF: - case ICE_VSI_SWITCHDEV_CTRL: ice_pf_vsi_init_vlan_ops(vsi); break; case ICE_VSI_VF: From 6235cb6e5b0de0717a1055740f6fd83a28531193 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:12 +0100 Subject: [PATCH 006/167] ice: change repr::id values Instead of getting repr::id from xa_alloc() value, set it to the src_vsi::num_vsi value. It is unique for each PR. Reviewed-by: Przemek Kitszel Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 5 ++--- drivers/net/ethernet/intel/ice/ice_repr.c | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 76fa114f22c90..5eba8dec9f949 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -319,7 +319,7 @@ ice_eswitch_mode_set(struct devlink *devlink, u16 mode, dev_info(ice_pf_to_dev(pf), "PF %d changed eswitch mode to switchdev", pf->hw.pf_id); - xa_init_flags(&pf->eswitch.reprs, XA_FLAGS_ALLOC); + xa_init(&pf->eswitch.reprs); NL_SET_ERR_MSG_MOD(extack, "Changed eswitch mode to switchdev"); break; } @@ -426,8 +426,7 @@ ice_eswitch_attach(struct ice_pf *pf, struct ice_vf *vf) if (err) goto err_setup_repr; - err = xa_alloc(&pf->eswitch.reprs, &repr->id, repr, - XA_LIMIT(1, INT_MAX), GFP_KERNEL); + err = xa_insert(&pf->eswitch.reprs, repr->id, repr, GFP_KERNEL); if (err) goto err_xa_alloc; diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index a5c24da31b887..b4fb742718115 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -345,6 +345,7 @@ ice_repr_add(struct ice_pf *pf, struct ice_vsi *src_vsi, const u8 *parent_mac) } repr->src_vsi = src_vsi; + repr->id = src_vsi->vsi_num; np = netdev_priv(repr->netdev); np->repr = repr; From 44ba608db50970dbb7eae1dca13ebcbf9485b4e7 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:13 +0100 Subject: [PATCH 007/167] ice: do switchdev slow-path Rx using PF VSI Add an ICE_RX_FLAG_MULTIDEV flag to Rx ring. If it is set try to find correct port representor. Do it based on src_vsi value stored in flex descriptor. Ids of representor pointers stored in xarray are equal to corresponding src_vsi value. Thanks to that we can directly get correct representor if we have src_vsi value. Set multidev flag during ring configuration. If the mode is switchdev, change the ring descriptor to the one that contains src_vsi value. PF netdev should be reconfigured, do it by calling ice_down() and ice_up() if the netdev was up before configuring switchdev. Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 8 +++++ drivers/net/ethernet/intel/ice/ice_eswitch.c | 36 +++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_eswitch.h | 9 +++++ drivers/net/ethernet/intel/ice/ice_txrx.h | 1 + drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 9 ++++- 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 8ffae7fe894f6..662fc395edcca 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -452,6 +452,14 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; + /* PF acts as uplink for switchdev; set flex descriptor with src_vsi + * metadata and flags to allow redirecting to PR netdev + */ + if (ice_is_eswitch_mode_switchdev(vsi->back)) { + ring->flags |= ICE_RX_FLAGS_MULTIDEV; + rxdid = ICE_RXDID_FLEX_NIC_2; + } + /* Enable Flexible Descriptors in the queue context which * allows this driver to select a specific receive descriptor format * increasing context priority to pick up profile ID; default is 0x01; diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 5eba8dec9f949..86a6d58ad3ec9 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -21,8 +21,13 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) { struct ice_vsi *uplink_vsi = pf->eswitch.uplink_vsi; struct net_device *netdev = uplink_vsi->netdev; + bool if_running = netif_running(netdev); struct ice_vsi_vlan_ops *vlan_ops; + if (if_running && !test_and_set_bit(ICE_VSI_DOWN, uplink_vsi->state)) + if (ice_down(uplink_vsi)) + return -ENODEV; + ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); netif_addr_lock_bh(netdev); @@ -51,8 +56,13 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) if (ice_vsi_update_local_lb(uplink_vsi, true)) goto err_override_local_lb; + if (if_running && ice_up(uplink_vsi)) + goto err_up; + return 0; +err_up: + ice_vsi_update_local_lb(uplink_vsi, false); err_override_local_lb: ice_vsi_update_security(uplink_vsi, ice_vsi_ctx_clear_allow_override); err_override_uplink: @@ -69,6 +79,9 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) ice_fltr_add_mac_and_broadcast(uplink_vsi, uplink_vsi->port_info->mac.perm_addr, ICE_FWD_TO_VSI); + if (if_running) + ice_up(uplink_vsi); + return -ENODEV; } @@ -493,3 +506,26 @@ void ice_eswitch_rebuild(struct ice_pf *pf) xa_for_each(&pf->eswitch.reprs, id, repr) ice_eswitch_detach(pf, repr->vf); } + +/** + * ice_eswitch_get_target - get netdev based on src_vsi from descriptor + * @rx_ring: ring used to receive the packet + * @rx_desc: descriptor used to get src_vsi value + * + * Get src_vsi value from descriptor and load correct representor. If it isn't + * found return rx_ring->netdev. + */ +struct net_device *ice_eswitch_get_target(struct ice_rx_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc) +{ + struct ice_eswitch *eswitch = &rx_ring->vsi->back->eswitch; + struct ice_32b_rx_flex_desc_nic_2 *desc; + struct ice_repr *repr; + + desc = (struct ice_32b_rx_flex_desc_nic_2 *)rx_desc; + repr = xa_load(&eswitch->reprs, le16_to_cpu(desc->src_vsi)); + if (!repr) + return rx_ring->netdev; + + return repr->netdev; +} diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.h b/drivers/net/ethernet/intel/ice/ice_eswitch.h index baad685074714..e2e5c0c75e7d4 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.h +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.h @@ -26,6 +26,8 @@ void ice_eswitch_set_target_vsi(struct sk_buff *skb, struct ice_tx_offload_params *off); netdev_tx_t ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev); +struct net_device *ice_eswitch_get_target(struct ice_rx_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc); #else /* CONFIG_ICE_SWITCHDEV */ static inline void ice_eswitch_detach(struct ice_pf *pf, struct ice_vf *vf) { } @@ -76,5 +78,12 @@ ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) { return NETDEV_TX_BUSY; } + +static inline struct net_device * +ice_eswitch_get_target(struct ice_rx_ring *rx_ring, + union ice_32b_rx_flex_desc *rx_desc) +{ + return rx_ring->netdev; +} #endif /* CONFIG_ICE_SWITCHDEV */ #endif /* _ICE_ESWITCH_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index af955b0e5dc5c..feba314a3fe44 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -365,6 +365,7 @@ struct ice_rx_ring { u8 ptp_rx; #define ICE_RX_FLAGS_RING_BUILD_SKB BIT(1) #define ICE_RX_FLAGS_CRC_STRIP_DIS BIT(2) +#define ICE_RX_FLAGS_MULTIDEV BIT(3) u8 flags; /* CL5 - 5th cacheline starts here */ struct xdp_rxq_info xdp_rxq; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index f8f1d2bdc1be9..676c00e1554c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -236,7 +236,14 @@ ice_process_skb_fields(struct ice_rx_ring *rx_ring, ice_rx_hash_to_skb(rx_ring, rx_desc, skb, ptype); /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_ring->netdev); + if (unlikely(rx_ring->flags & ICE_RX_FLAGS_MULTIDEV)) { + struct net_device *netdev = ice_eswitch_get_target(rx_ring, + rx_desc); + + skb->protocol = eth_type_trans(skb, netdev); + } else { + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + } ice_rx_csum(rx_ring, skb, rx_desc, ptype); From 4498159a509328e903a692504a0ba1624754bc3e Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 1 Mar 2024 12:54:14 +0100 Subject: [PATCH 008/167] ice: count representor stats Removing control plane VSI result in no information about slow-path statistic. In current solution statistics need to be counted in driver. Patch is based on similar implementation done by Simon Horman in nfp: commit eadfa4c3be99 ("nfp: add stats and xmit helpers for representors") Add const modifier to netdev parameter in ice_netdev_to_repr(). It isn't (and shouldn't be) modified in the function. Reviewed-by: Marcin Szycik Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 7 +- drivers/net/ethernet/intel/ice/ice_repr.c | 103 +++++++++++++----- drivers/net/ethernet/intel/ice/ice_repr.h | 16 ++- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 2 + 4 files changed, 98 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 86a6d58ad3ec9..af4e9530eb48a 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -192,13 +192,18 @@ netdev_tx_t ice_eswitch_port_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct ice_repr *repr = ice_netdev_to_repr(netdev); + unsigned int len = skb->len; + int ret; skb_dst_drop(skb); dst_hold((struct dst_entry *)repr->dst); skb_dst_set(skb, (struct dst_entry *)repr->dst); skb->dev = repr->dst->u.port_info.lower_dev; - return dev_queue_xmit(skb); + ret = dev_queue_xmit(skb); + ice_repr_inc_tx_stats(repr, len, ret); + + return ret; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index b4fb742718115..2429727d55629 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -41,6 +41,47 @@ ice_repr_get_phys_port_name(struct net_device *netdev, char *buf, size_t len) return 0; } +/** + * ice_repr_inc_tx_stats - increment Tx statistic by one packet + * @repr: repr to increment stats on + * @len: length of the packet + * @xmit_status: value returned by xmit function + */ +void ice_repr_inc_tx_stats(struct ice_repr *repr, unsigned int len, + int xmit_status) +{ + struct ice_repr_pcpu_stats *stats; + + if (unlikely(xmit_status != NET_XMIT_SUCCESS && + xmit_status != NET_XMIT_CN)) { + this_cpu_inc(repr->stats->tx_drops); + return; + } + + stats = this_cpu_ptr(repr->stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += len; + u64_stats_update_end(&stats->syncp); +} + +/** + * ice_repr_inc_rx_stats - increment Rx statistic by one packet + * @netdev: repr netdev to increment stats on + * @len: length of the packet + */ +void ice_repr_inc_rx_stats(struct net_device *netdev, unsigned int len) +{ + struct ice_repr *repr = ice_netdev_to_repr(netdev); + struct ice_repr_pcpu_stats *stats; + + stats = this_cpu_ptr(repr->stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += len; + u64_stats_update_end(&stats->syncp); +} + /** * ice_repr_get_stats64 - get VF stats for VFPR use * @netdev: pointer to port representor netdev @@ -76,7 +117,7 @@ ice_repr_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) * ice_netdev_to_repr - Get port representor for given netdevice * @netdev: pointer to port representor netdev */ -struct ice_repr *ice_netdev_to_repr(struct net_device *netdev) +struct ice_repr *ice_netdev_to_repr(const struct net_device *netdev) { struct ice_netdev_priv *np = netdev_priv(netdev); @@ -139,38 +180,35 @@ static int ice_repr_stop(struct net_device *netdev) * ice_repr_sp_stats64 - get slow path stats for port representor * @dev: network interface device structure * @stats: netlink stats structure - * - * RX/TX stats are being swapped here to be consistent with VF stats. In slow - * path, port representor receives data when the corresponding VF is sending it - * (and vice versa), TX and RX bytes/packets are effectively swapped on port - * representor. */ static int ice_repr_sp_stats64(const struct net_device *dev, struct rtnl_link_stats64 *stats) { - struct ice_netdev_priv *np = netdev_priv(dev); - int vf_id = np->repr->vf->vf_id; - struct ice_tx_ring *tx_ring; - struct ice_rx_ring *rx_ring; - u64 pkts, bytes; - - tx_ring = np->vsi->tx_rings[vf_id]; - ice_fetch_u64_stats_per_ring(&tx_ring->ring_stats->syncp, - tx_ring->ring_stats->stats, - &pkts, &bytes); - stats->rx_packets = pkts; - stats->rx_bytes = bytes; - - rx_ring = np->vsi->rx_rings[vf_id]; - ice_fetch_u64_stats_per_ring(&rx_ring->ring_stats->syncp, - rx_ring->ring_stats->stats, - &pkts, &bytes); - stats->tx_packets = pkts; - stats->tx_bytes = bytes; - stats->tx_dropped = rx_ring->ring_stats->rx_stats.alloc_page_failed + - rx_ring->ring_stats->rx_stats.alloc_buf_failed; - + struct ice_repr *repr = ice_netdev_to_repr(dev); + int i; + + for_each_possible_cpu(i) { + u64 tbytes, tpkts, tdrops, rbytes, rpkts; + struct ice_repr_pcpu_stats *repr_stats; + unsigned int start; + + repr_stats = per_cpu_ptr(repr->stats, i); + do { + start = u64_stats_fetch_begin(&repr_stats->syncp); + tbytes = repr_stats->tx_bytes; + tpkts = repr_stats->tx_packets; + tdrops = repr_stats->tx_drops; + rbytes = repr_stats->rx_bytes; + rpkts = repr_stats->rx_packets; + } while (u64_stats_fetch_retry(&repr_stats->syncp, start)); + + stats->tx_bytes += tbytes; + stats->tx_packets += tpkts; + stats->tx_dropped += tdrops; + stats->rx_bytes += rbytes; + stats->rx_packets += rpkts; + } return 0; } @@ -291,6 +329,7 @@ static void ice_repr_remove_node(struct devlink_port *devlink_port) */ static void ice_repr_rem(struct ice_repr *repr) { + free_percpu(repr->stats); free_netdev(repr->netdev); kfree(repr); } @@ -344,6 +383,12 @@ ice_repr_add(struct ice_pf *pf, struct ice_vsi *src_vsi, const u8 *parent_mac) goto err_alloc; } + repr->stats = netdev_alloc_pcpu_stats(struct ice_repr_pcpu_stats); + if (!repr->stats) { + err = -ENOMEM; + goto err_stats; + } + repr->src_vsi = src_vsi; repr->id = src_vsi->vsi_num; np = netdev_priv(repr->netdev); @@ -353,6 +398,8 @@ ice_repr_add(struct ice_pf *pf, struct ice_vsi *src_vsi, const u8 *parent_mac) return repr; +err_stats: + free_netdev(repr->netdev); err_alloc: kfree(repr); return ERR_PTR(err); diff --git a/drivers/net/ethernet/intel/ice/ice_repr.h b/drivers/net/ethernet/intel/ice/ice_repr.h index eb8dec1f7de43..cff730b15ca0e 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.h +++ b/drivers/net/ethernet/intel/ice/ice_repr.h @@ -6,12 +6,22 @@ #include +struct ice_repr_pcpu_stats { + struct u64_stats_sync syncp; + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; +}; + struct ice_repr { struct ice_vsi *src_vsi; struct ice_vf *vf; struct net_device *netdev; struct metadata_dst *dst; struct ice_esw_br_port *br_port; + struct ice_repr_pcpu_stats __percpu *stats; u32 id; u8 parent_mac[ETH_ALEN]; }; @@ -22,8 +32,12 @@ void ice_repr_rem_vf(struct ice_repr *repr); void ice_repr_start_tx_queues(struct ice_repr *repr); void ice_repr_stop_tx_queues(struct ice_repr *repr); -struct ice_repr *ice_netdev_to_repr(struct net_device *netdev); +struct ice_repr *ice_netdev_to_repr(const struct net_device *netdev); bool ice_is_port_repr_netdev(const struct net_device *netdev); struct ice_repr *ice_repr_get_by_vsi(struct ice_vsi *vsi); + +void ice_repr_inc_tx_stats(struct ice_repr *repr, unsigned int len, + int xmit_status); +void ice_repr_inc_rx_stats(struct net_device *netdev, unsigned int len); #endif diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 676c00e1554c3..df072ce767b1e 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -240,6 +240,8 @@ ice_process_skb_fields(struct ice_rx_ring *rx_ring, struct net_device *netdev = ice_eswitch_get_target(rx_ring, rx_desc); + if (ice_is_port_repr_netdev(netdev)) + ice_repr_inc_rx_stats(netdev, skb->len); skb->protocol = eth_type_trans(skb, netdev); } else { skb->protocol = eth_type_trans(skb, rx_ring->netdev); From ea558de7238bb12c3435c47f0631e9d17bf4a09f Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 16 Mar 2024 12:38:29 +0100 Subject: [PATCH 009/167] i40e: Enforce software interrupt during busy-poll exit As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts during poll exit") followed by commit 23be7075b318 ("ice: fix software generating extra interrupts") I'm seeing the similar issue also with i40e driver. In certain situation when busy-loop is enabled together with adaptive coalescing, the driver occasionally misses that there are outstanding descriptors to clean when exiting busy poll. Try to catch the remaining work by triggering a software interrupt when exiting busy poll. No extra interrupts will be generated when busy polling is not used. The issue was found when running sockperf ping-pong tcp test with adaptive coalescing and busy poll enabled (50 as value busy_pool and busy_read sysctl knobs) and results in huge latency spikes with more than 100000us. The fix is inspired from the ice driver and do the following: 1) During napi poll exit in case of busy-poll (napo_complete_done() returns false) this is recorded to q_vector that we were in busy loop. 2) Extends i40e_buildreg_itr() to be able to add an enforced software interrupt into built value 2) In i40e_update_enable_itr() enforces a software interrupt trigger if we are exiting busy poll to catch any pending clean-ups 3) Reuses unused 3rd ITR (interrupt throttle) index and set it to 20K interrupts per second to limit the number of these sw interrupts. Test results ============ Prior: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2438563; ReceivedMessages=2438562 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2429473; ReceivedMessages=2429473 sockperf: ====> avg-latency=24.571 (std-dev=93.297, mean-ad=4.904, median-ad=1.510, siqr=1.063, cv=3.797, std-error=0.060, 99.0% ci=[24.417, 24.725]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.571 usec sockperf: Total 2429473 observations; each percentile contains 24294.73 observations sockperf: ---> observation = 103294.331 sockperf: ---> percentile 99.999 = 45.633 sockperf: ---> percentile 99.990 = 37.013 sockperf: ---> percentile 99.900 = 35.910 sockperf: ---> percentile 99.000 = 33.390 sockperf: ---> percentile 90.000 = 28.626 sockperf: ---> percentile 75.000 = 27.741 sockperf: ---> percentile 50.000 = 26.743 sockperf: ---> percentile 25.000 = 25.614 sockperf: ---> observation = 12.220 After: [root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 sockperf: == version #3.10-no.git == sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) [ 0] IP = 10.9.9.1 PORT = 11111 # TCP sockperf: Warmup stage (sending a few dummy messages)... sockperf: Starting test... sockperf: Test end (interrupted by timer) sockperf: Test ended sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2400055; ReceivedMessages=2400054 sockperf: ========= Printing statistics for Server No: 0 sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2391186; ReceivedMessages=2391186 sockperf: ====> avg-latency=24.965 (std-dev=5.934, mean-ad=4.642, median-ad=1.485, siqr=1.067, cv=0.238, std-error=0.004, 99.0% ci=[24.955, 24.975]) sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 sockperf: Summary: Latency is 24.965 usec sockperf: Total 2391186 observations; each percentile contains 23911.86 observations sockperf: ---> observation = 195.841 sockperf: ---> percentile 99.999 = 45.026 sockperf: ---> percentile 99.990 = 39.009 sockperf: ---> percentile 99.900 = 35.922 sockperf: ---> percentile 99.000 = 33.482 sockperf: ---> percentile 90.000 = 28.902 sockperf: ---> percentile 75.000 = 27.821 sockperf: ---> percentile 50.000 = 26.860 sockperf: ---> percentile 25.000 = 25.685 sockperf: ---> observation = 12.277 Fixes: 0bcd952feec7 ("ethernet/intel: consolidate NAPI and NAPI exit") Reported-by: Hugo Ferreira Reviewed-by: Michal Schmidt Signed-off-by: Ivan Vecera Reviewed-by: Jesse Brandeburg Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++ .../net/ethernet/intel/i40e/i40e_register.h | 3 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 82 ++++++++++++++----- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + 5 files changed, 72 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index ba24f3fa92c37..2fbabcdb5bb5f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -955,6 +955,7 @@ struct i40e_q_vector { struct rcu_head rcu; /* to avoid race with update stats on free */ char name[I40E_INT_NAME_STR_LEN]; bool arm_wb_state; + bool in_busy_poll; int irq_num; /* IRQ assigned to this q_vector */ } ____cacheline_internodealigned_in_smp; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f86578857e8ae..6576a00810936 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3911,6 +3911,12 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi) q_vector->tx.target_itr >> 1); q_vector->tx.current_itr = q_vector->tx.target_itr; + /* Set ITR for software interrupts triggered after exiting + * busy-loop polling. + */ + wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), + I40E_ITR_20K); + wr32(hw, I40E_PFINT_RATEN(vector - 1), i40e_intrl_usec_to_reg(vsi->int_rate_limit)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 14ab642cafdb2..432afbb642013 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -333,8 +333,11 @@ #define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 #define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 +#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 +#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708f..1a12b732818ee 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2630,7 +2630,22 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget, return failure ? budget : (int)total_rx_packets; } -static inline u32 i40e_buildreg_itr(const int type, u16 itr) +/** + * i40e_buildreg_itr - build a value for writing to I40E_PFINT_DYN_CTLN register + * @itr_idx: interrupt throttling index + * @interval: interrupt throttling interval value in usecs + * @force_swint: force software interrupt + * + * The function builds a value for I40E_PFINT_DYN_CTLN register that + * is used to update interrupt throttling interval for specified ITR index + * and optionally enforces a software interrupt. If the @itr_idx is equal + * to I40E_ITR_NONE then no interval change is applied and only @force_swint + * parameter is taken into account. If the interval change and enforced + * software interrupt are not requested then the built value just enables + * appropriate vector interrupt. + **/ +static u32 i40e_buildreg_itr(enum i40e_dyn_idx itr_idx, u16 interval, + bool force_swint) { u32 val; @@ -2644,23 +2659,33 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) * an event in the PBA anyway so we need to rely on the automask * to hold pending events for us until the interrupt is re-enabled * - * The itr value is reported in microseconds, and the register - * value is recorded in 2 microsecond units. For this reason we - * only need to shift by the interval shift - 1 instead of the - * full value. + * We have to shift the given value as it is reported in microseconds + * and the register value is recorded in 2 microsecond units. */ - itr &= I40E_ITR_MASK; + interval >>= 1; + /* 1. Enable vector interrupt + * 2. Update the interval for the specified ITR index + * (I40E_ITR_NONE in the register is used to indicate that + * no interval update is requested) + */ val = I40E_PFINT_DYN_CTLN_INTENA_MASK | - (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | - (itr << (I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT - 1)); + FIELD_PREP(I40E_PFINT_DYN_CTLN_ITR_INDX_MASK, itr_idx) | + FIELD_PREP(I40E_PFINT_DYN_CTLN_INTERVAL_MASK, interval); + + /* 3. Enforce software interrupt trigger if requested + * (These software interrupts rate is limited by ITR2 that is + * set to 20K interrupts per second) + */ + if (force_swint) + val |= I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | + I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | + FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, + I40E_SW_ITR); return val; } -/* a small macro to shorten up some long lines */ -#define INTREG I40E_PFINT_DYN_CTLN - /* The act of updating the ITR will cause it to immediately trigger. In order * to prevent this from throwing off adaptive update statistics we defer the * update so that it can only happen so often. So after either Tx or Rx are @@ -2679,8 +2704,10 @@ static inline u32 i40e_buildreg_itr(const int type, u16 itr) static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, struct i40e_q_vector *q_vector) { + enum i40e_dyn_idx itr_idx = I40E_ITR_NONE; struct i40e_hw *hw = &vsi->back->hw; - u32 intval; + u16 interval = 0; + u32 itr_val; /* If we don't have MSIX, then we only need to re-enable icr0 */ if (!test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) { @@ -2702,8 +2729,8 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, */ if (q_vector->rx.target_itr < q_vector->rx.current_itr) { /* Rx ITR needs to be reduced, this is highest priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || @@ -2712,25 +2739,36 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, /* Tx ITR needs to be reduced, this is second priority * Tx ITR needs to be increased more than Rx, fourth priority */ - intval = i40e_buildreg_itr(I40E_TX_ITR, - q_vector->tx.target_itr); + itr_idx = I40E_TX_ITR; + interval = q_vector->tx.target_itr; q_vector->tx.current_itr = q_vector->tx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { /* Rx ITR needs to be increased, third priority */ - intval = i40e_buildreg_itr(I40E_RX_ITR, - q_vector->rx.target_itr); + itr_idx = I40E_RX_ITR; + interval = q_vector->rx.target_itr; q_vector->rx.current_itr = q_vector->rx.target_itr; q_vector->itr_countdown = ITR_COUNTDOWN_START; } else { /* No ITR update, lowest priority */ - intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); if (q_vector->itr_countdown) q_vector->itr_countdown--; } - if (!test_bit(__I40E_VSI_DOWN, vsi->state)) - wr32(hw, INTREG(q_vector->reg_idx), intval); + /* Do not update interrupt control register if VSI is down */ + if (test_bit(__I40E_VSI_DOWN, vsi->state)) + return; + + /* Update ITR interval if necessary and enforce software interrupt + * if we are exiting busy poll. + */ + if (q_vector->in_busy_poll) { + itr_val = i40e_buildreg_itr(itr_idx, interval, true); + q_vector->in_busy_poll = false; + } else { + itr_val = i40e_buildreg_itr(itr_idx, interval, false); + } + wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), itr_val); } /** @@ -2845,6 +2883,8 @@ int i40e_napi_poll(struct napi_struct *napi, int budget) */ if (likely(napi_complete_done(napi, work_done))) i40e_update_enable_itr(vsi, q_vector); + else + q_vector->in_busy_poll = true; return min(work_done, budget - 1); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index abf15067eb5de..2cdc7de6301c1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -68,6 +68,7 @@ enum i40e_dyn_idx { /* these are indexes into ITRN registers */ #define I40E_RX_ITR I40E_IDX_ITR0 #define I40E_TX_ITR I40E_IDX_ITR1 +#define I40E_SW_ITR I40E_IDX_ITR2 /* Supported RSS offloads */ #define I40E_DEFAULT_RSS_HENA ( \ From eb58c598ce45b7e787568fe27016260417c3d807 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:44:00 +0100 Subject: [PATCH 010/167] i40e: fix i40e_count_filters() to count only active/new filters The bug usually affects untrusted VFs, because they are limited to 18 MACs, it affects them badly, not letting to create MAC all filters. Not stable to reproduce, it happens when VF user creates MAC filters when other MACVLAN operations are happened in parallel. But consequence is that VF can't receive desired traffic. Fix counter to be bumped only for new or active filters. Fixes: 621650cabee5 ("i40e: Refactoring VF MAC filters counting to make more reliable") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6576a00810936..48b9ddb2b1b38 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1253,8 +1253,11 @@ int i40e_count_filters(struct i40e_vsi *vsi) int bkt; int cnt = 0; - hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) - ++cnt; + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (f->state == I40E_FILTER_NEW || + f->state == I40E_FILTER_ACTIVE) + ++cnt; + } return cnt; } From f37c4eac99c258111d414d31b740437e1925b8e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 13 Mar 2024 10:56:39 +0100 Subject: [PATCH 011/167] i40e: fix vf may be used uninitialized in this function warning To fix the regression introduced by commit 52424f974bc5, which causes servers hang in very hard to reproduce conditions with resets races. Using two sources for the information is the root cause. In this function before the fix bumping v didn't mean bumping vf pointer. But the code used this variables interchangeably, so stale vf could point to different/not intended vf. Remove redundant "v" variable and iterate via single VF pointer across whole function instead to guarantee VF pointer validity. Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") Signed-off-by: Aleksandr Loktionov Reviewed-by: Arkadiusz Kubalewski Reviewed-by: Przemek Kitszel Reviewed-by: Paul Menzel Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 83a34e98bdc79..4c13f78af6756 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1624,8 +1624,8 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) { struct i40e_hw *hw = &pf->hw; struct i40e_vf *vf; - int i, v; u32 reg; + int i; /* If we don't have any VFs, then there is nothing to reset */ if (!pf->num_alloc_vfs) @@ -1636,11 +1636,10 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) return false; /* Begin reset on all VFs at once */ - for (v = 0; v < pf->num_alloc_vfs; v++) { - vf = &pf->vf[v]; + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is being reset no need to trigger reset again */ if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) - i40e_trigger_vf_reset(&pf->vf[v], flr); + i40e_trigger_vf_reset(vf, flr); } /* HW requires some time to make sure it can flush the FIFO for a VF @@ -1649,14 +1648,13 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) * the VFs using a simple iterator that increments once that VF has * finished resetting. */ - for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { + for (i = 0, vf = &pf->vf[0]; i < 10 && vf < &pf->vf[pf->num_alloc_vfs]; ++i) { usleep_range(10000, 20000); /* Check each VF in sequence, beginning with the VF to fail * the previous check. */ - while (v < pf->num_alloc_vfs) { - vf = &pf->vf[v]; + while (vf < &pf->vf[pf->num_alloc_vfs]) { if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) { reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); if (!(reg & I40E_VPGEN_VFRSTAT_VFRD_MASK)) @@ -1666,7 +1664,7 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* If the current VF has finished resetting, move on * to the next VF in sequence. */ - v++; + ++vf; } } @@ -1676,39 +1674,39 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) /* Display a warning if at least one VF didn't manage to reset in * time, but continue on with the operation. */ - if (v < pf->num_alloc_vfs) + if (vf < &pf->vf[pf->num_alloc_vfs]) dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", - pf->vf[v].vf_id); + vf->vf_id); usleep_range(10000, 20000); /* Begin disabling all the rings associated with VFs, but do not wait * between each VF. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_stop_rings_no_wait(pf->vsi[vf->lan_vsi_idx]); } /* Now that we've notified HW to disable all of the VF rings, wait * until they finish. */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* On initial reset, we don't have any queues to disable */ - if (pf->vf[v].lan_vsi_idx == 0) + if (vf->lan_vsi_idx == 0) continue; /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[v].lan_vsi_idx]); + i40e_vsi_wait_queues_disabled(pf->vsi[vf->lan_vsi_idx]); } /* Hw may need up to 50ms to finish disabling the RX queues. We @@ -1717,12 +1715,12 @@ bool i40e_reset_all_vfs(struct i40e_pf *pf, bool flr) mdelay(50); /* Finish the reset on each VF */ - for (v = 0; v < pf->num_alloc_vfs; v++) { + for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { /* If VF is reset in another thread just continue */ if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) continue; - i40e_cleanup_reset_vf(&pf->vf[v]); + i40e_cleanup_reset_vf(vf); } i40e_flush(hw); From 6dbdd4de0362c37e54e8b049781402e5a409e7d0 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Thu, 4 Jan 2024 16:16:52 +0200 Subject: [PATCH 012/167] e1000e: Workaround for sporadic MDI error on Meteor Lake systems On some Meteor Lake systems accessing the PHY via the MDIO interface may result in an MDI error. This issue happens sporadically and in most cases a second access to the PHY via the MDIO interface results in success. As a workaround, introduce a retry counter which is set to 3 on Meteor Lake systems. The driver will only return an error if 3 consecutive PHY access attempts fail. The retry mechanism is disabled in specific flows, where MDI errors are expected. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Suggested-by: Nikolay Mushayev Co-developed-by: Nir Efrati Signed-off-by: Nir Efrati Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/hw.h | 2 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 33 ++++ drivers/net/ethernet/intel/e1000e/phy.c | 182 ++++++++++++-------- drivers/net/ethernet/intel/e1000e/phy.h | 2 + 4 files changed, 150 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h index 1fef6bb5a5fbc..4b6e7536170ab 100644 --- a/drivers/net/ethernet/intel/e1000e/hw.h +++ b/drivers/net/ethernet/intel/e1000e/hw.h @@ -628,6 +628,7 @@ struct e1000_phy_info { u32 id; u32 reset_delay_us; /* in usec */ u32 revision; + u32 retry_count; enum e1000_media_type media_type; @@ -644,6 +645,7 @@ struct e1000_phy_info { bool polarity_correction; bool speed_downgraded; bool autoneg_wait_to_complete; + bool retry_enabled; }; struct e1000_nvm_info { diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 19e450a5bd314..d8e97669f31b0 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -222,11 +222,18 @@ static bool e1000_phy_is_accessible_pchlan(struct e1000_hw *hw) if (hw->mac.type >= e1000_pch_lpt) { /* Only unforce SMBus if ME is not active */ if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) { + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ e1e_rphy_locked(hw, CV_SMB_CTRL, &phy_reg); phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1e_wphy_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; @@ -310,6 +317,11 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) goto out; } + /* There is no guarantee that the PHY is accessible at this time + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* The MAC-PHY interconnect may be in SMBus mode. If the PHY is * inaccessible and resetting the PHY is not blocked, toggle the * LANPHYPC Value bit to force the interconnect to PCIe mode. @@ -380,6 +392,8 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) break; } + e1000e_enable_phy_retry(hw); + hw->phy.ops.release(hw); if (!ret_val) { @@ -449,6 +463,11 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw) phy->id = e1000_phy_unknown; + if (hw->mac.type == e1000_pch_mtp) { + phy->retry_count = 2; + e1000e_enable_phy_retry(hw); + } + ret_val = e1000_init_phy_workarounds_pchlan(hw); if (ret_val) return ret_val; @@ -1146,6 +1165,11 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Force SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) @@ -1153,6 +1177,8 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Force SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; @@ -1313,6 +1339,11 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) /* Toggle LANPHYPC Value bit */ e1000_toggle_lanphypc_pch_lpt(hw); + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + /* Unforce SMBus mode in PHY */ ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); if (ret_val) { @@ -1333,6 +1364,8 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + e1000e_enable_phy_retry(hw); + /* Unforce SMBus mode in MAC */ mac_reg = er32(CTRL_EXT); mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index 5e329156d1bae..93544f1cc2a51 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -107,6 +107,16 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) return e1e_wphy(hw, M88E1000_PHY_GEN_CONTROL, 0); } +void e1000e_disable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = false; +} + +void e1000e_enable_phy_retry(struct e1000_hw *hw) +{ + hw->phy.retry_enabled = true; +} + /** * e1000e_read_phy_reg_mdic - Read MDI control register * @hw: pointer to the HW structure @@ -118,55 +128,73 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw *hw) **/ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = ((offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_READ)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = ((offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_READ)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Read PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Read PHY Reg Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Read offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; + ew32(MDIC, mdic); + + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Read PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Read PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Read offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } + + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) { + *data = (u16)mdic; + return 0; + } + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } } - *data = (u16)mdic; - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); - return 0; + return -E1000_ERR_PHY; } /** @@ -179,56 +207,72 @@ s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) **/ s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) { + u32 i, mdic = 0, retry_counter, retry_max; struct e1000_phy_info *phy = &hw->phy; - u32 i, mdic = 0; + bool success; if (offset > MAX_PHY_REG_ADDRESS) { e_dbg("PHY Address %d is out of range\n", offset); return -E1000_ERR_PARAM; } + retry_max = phy->retry_enabled ? phy->retry_count : 0; + /* Set up Op-code, Phy Address, and register offset in the MDI * Control register. The MAC will take care of interfacing with the * PHY to retrieve the desired data. */ - mdic = (((u32)data) | - (offset << E1000_MDIC_REG_SHIFT) | - (phy->addr << E1000_MDIC_PHY_SHIFT) | - (E1000_MDIC_OP_WRITE)); + for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { + success = true; - ew32(MDIC, mdic); + mdic = (((u32)data) | + (offset << E1000_MDIC_REG_SHIFT) | + (phy->addr << E1000_MDIC_PHY_SHIFT) | + (E1000_MDIC_OP_WRITE)); - /* Poll the ready bit to see if the MDI read completed - * Increasing the time out as testing showed failures with - * the lower time out - */ - for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { - udelay(50); - mdic = er32(MDIC); - if (mdic & E1000_MDIC_READY) - break; - } - if (!(mdic & E1000_MDIC_READY)) { - e_dbg("MDI Write PHY Reg Address %d did not complete\n", offset); - return -E1000_ERR_PHY; - } - if (mdic & E1000_MDIC_ERROR) { - e_dbg("MDI Write PHY Red Address %d Error\n", offset); - return -E1000_ERR_PHY; - } - if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { - e_dbg("MDI Write offset error - requested %d, returned %d\n", - offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); - return -E1000_ERR_PHY; - } + ew32(MDIC, mdic); - /* Allow some time after each MDIC transaction to avoid - * reading duplicate data in the next MDIC transaction. - */ - if (hw->mac.type == e1000_pch2lan) - udelay(100); + /* Poll the ready bit to see if the MDI read completed + * Increasing the time out as testing showed failures with + * the lower time out + */ + for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { + usleep_range(50, 60); + mdic = er32(MDIC); + if (mdic & E1000_MDIC_READY) + break; + } + if (!(mdic & E1000_MDIC_READY)) { + e_dbg("MDI Write PHY Reg Address %d did not complete\n", + offset); + success = false; + } + if (mdic & E1000_MDIC_ERROR) { + e_dbg("MDI Write PHY Reg Address %d Error\n", offset); + success = false; + } + if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { + e_dbg("MDI Write offset error - requested %d, returned %d\n", + offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); + success = false; + } - return 0; + /* Allow some time after each MDIC transaction to avoid + * reading duplicate data in the next MDIC transaction. + */ + if (hw->mac.type == e1000_pch2lan) + usleep_range(100, 150); + + if (success) + return 0; + + if (retry_counter != retry_max) { + e_dbg("Perform retry on PHY transaction...\n"); + mdelay(10); + } + } + + return -E1000_ERR_PHY; } /** diff --git a/drivers/net/ethernet/intel/e1000e/phy.h b/drivers/net/ethernet/intel/e1000e/phy.h index c48777d095235..049bb325b4b14 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.h +++ b/drivers/net/ethernet/intel/e1000e/phy.h @@ -51,6 +51,8 @@ s32 e1000e_read_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data); void e1000_power_up_phy_copper(struct e1000_hw *hw); void e1000_power_down_phy_copper(struct e1000_hw *hw); +void e1000e_disable_phy_retry(struct e1000_hw *hw); +void e1000e_enable_phy_retry(struct e1000_hw *hw); s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data); s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data); s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data); From 861e8086029e003305750b4126ecd6617465f5c7 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Sun, 3 Mar 2024 12:51:32 +0200 Subject: [PATCH 013/167] e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue Forcing SMBUS inside the ULP enabling flow leads to sporadic PHY loss on some systems. It is suspected to be caused by initiating PHY transactions before the interface settles. Separating this configuration from the ULP enabling flow and moving it to the shutdown function allows enough time for the interface to settle and avoids adding a delay. Fixes: 6607c99e7034 ("e1000e: i219 - fix to enable both ULP and EEE in Sx state") Co-developed-by: Dima Ruinskiy Signed-off-by: Dima Ruinskiy Signed-off-by: Vitaly Lifshits Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 19 ------------------- drivers/net/ethernet/intel/e1000e/netdev.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index d8e97669f31b0..f9e94be36e97f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1165,25 +1165,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - /* Force SMBus mode in PHY */ - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); - if (ret_val) - goto release; - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - mac_reg = er32(CTRL_EXT); - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, mac_reg); - /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable * LPLU and disable Gig speed when entering ULP */ diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index cc8c531ec3dff..3692fce201959 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6623,6 +6623,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) struct e1000_hw *hw = &adapter->hw; u32 ctrl, ctrl_ext, rctl, status, wufc; int retval = 0; + u16 smb_ctrl; /* Runtime suspend should only enable wakeup for link changes */ if (runtime) @@ -6696,6 +6697,23 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool runtime) if (retval) return retval; } + + /* Force SMBUS to allow WOL */ + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + e1e_rphy(hw, CV_SMB_CTRL, &smb_ctrl); + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; + e1e_wphy(hw, CV_SMB_CTRL, smb_ctrl); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in MAC */ + ctrl_ext = er32(CTRL_EXT); + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, ctrl_ext); } /* Ensure that the appropriate bits are set in LPI_CTRL From 931ec1e4cb7fd81fe01e85419238a9cfb9d930c9 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 25 Mar 2024 11:12:28 -0700 Subject: [PATCH 014/167] Documentation: Add documentation for eswitch attribute Provide devlink documentation for three eswitch attributes: mode, inline-mode, and encap-mode. Signed-off-by: William Tu Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240325181228.6244-1-witu@nvidia.com Signed-off-by: Jakub Kicinski --- .../devlink/devlink-eswitch-attr.rst | 76 +++++++++++++++++++ Documentation/networking/devlink/index.rst | 1 + Documentation/networking/representors.rst | 1 + 3 files changed, 78 insertions(+) create mode 100644 Documentation/networking/devlink/devlink-eswitch-attr.rst diff --git a/Documentation/networking/devlink/devlink-eswitch-attr.rst b/Documentation/networking/devlink/devlink-eswitch-attr.rst new file mode 100644 index 0000000000000..08bb39ab15286 --- /dev/null +++ b/Documentation/networking/devlink/devlink-eswitch-attr.rst @@ -0,0 +1,76 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================== +Devlink E-Switch Attribute +========================== + +Devlink E-Switch supports two modes of operation: legacy and switchdev. +Legacy mode operates based on traditional MAC/VLAN steering rules. Switching +decisions are made based on MAC addresses, VLANs, etc. There is limited ability +to offload switching rules to hardware. + +On the other hand, switchdev mode allows for more advanced offloading +capabilities of the E-Switch to hardware. In switchdev mode, more switching +rules and logic can be offloaded to the hardware switch ASIC. It enables +representor netdevices that represent the slow path of virtual functions (VFs) +or scalable-functions (SFs) of the device. See more information about +:ref:`Documentation/networking/switchdev.rst ` and +:ref:`Documentation/networking/representors.rst `. + +In addition, the devlink E-Switch also comes with other attributes listed +in the following section. + +Attributes Description +====================== + +The following is a list of E-Switch attributes. + +.. list-table:: E-Switch attributes + :widths: 8 5 45 + + * - Name + - Type + - Description + * - ``mode`` + - enum + - The mode of the device. The mode can be one of the following: + + * ``legacy`` operates based on traditional MAC/VLAN steering + rules. + * ``switchdev`` allows for more advanced offloading capabilities of + the E-Switch to hardware. + * - ``inline-mode`` + - enum + - Some HWs need the VF driver to put part of the packet + headers on the TX descriptor so the e-switch can do proper + matching and steering. Support for both switchdev mode and legacy mode. + + * ``none`` none. + * ``link`` L2 mode. + * ``network`` L3 mode. + * ``transport`` L4 mode. + * - ``encap-mode`` + - enum + - The encapsulation mode of the device. Support for both switchdev mode + and legacy mode. The mode can be one of the following: + + * ``none`` Disable encapsulation support. + * ``basic`` Enable encapsulation support. + +Example Usage +============= + +.. code:: shell + + # enable switchdev mode + $ devlink dev eswitch set pci/0000:08:00.0 mode switchdev + + # set inline-mode and encap-mode + $ devlink dev eswitch set pci/0000:08:00.0 inline-mode none encap-mode basic + + # display devlink device eswitch attributes + $ devlink dev eswitch show pci/0000:08:00.0 + pci/0000:08:00.0: mode switchdev inline-mode none encap-mode basic + + # enable encap-mode with legacy mode + $ devlink dev eswitch set pci/0000:08:00.0 mode legacy inline-mode none encap-mode basic diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index e14d7a701b72b..948c8c44e233f 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -67,6 +67,7 @@ general. devlink-selftests devlink-trap devlink-linecard + devlink-eswitch-attr Driver-specific documentation ----------------------------- diff --git a/Documentation/networking/representors.rst b/Documentation/networking/representors.rst index decb39c19b9ed..5e23386f69687 100644 --- a/Documentation/networking/representors.rst +++ b/Documentation/networking/representors.rst @@ -1,4 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _representors: ============================= Network Function Representors From 3bcbc67be1b7e7f6dc48a5b6687a511e5fd7cf81 Mon Sep 17 00:00:00 2001 From: John Fraker Date: Mon, 25 Mar 2024 15:33:08 -0700 Subject: [PATCH 015/167] gve: Add counter adminq_get_ptype_map_cnt to stats report This counter counts the number of times get_ptype_map is executed on the admin queue, and was previously missing from the stats report. Signed-off-by: John Fraker Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240325223308.618671-1-jfraker@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index 9aebfb843d9d1..dbe05402d40b1 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -73,7 +73,7 @@ static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { "adminq_create_tx_queue_cnt", "adminq_create_rx_queue_cnt", "adminq_destroy_tx_queue_cnt", "adminq_destroy_rx_queue_cnt", "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", - "adminq_report_stats_cnt", "adminq_report_link_speed_cnt" + "adminq_report_stats_cnt", "adminq_report_link_speed_cnt", "adminq_get_ptype_map_cnt" }; static const char gve_gstrings_priv_flags[][ETH_GSTRING_LEN] = { @@ -428,6 +428,7 @@ gve_get_ethtool_stats(struct net_device *netdev, data[i++] = priv->adminq_set_driver_parameter_cnt; data[i++] = priv->adminq_report_stats_cnt; data[i++] = priv->adminq_report_link_speed_cnt; + data[i++] = priv->adminq_get_ptype_map_cnt; } static void gve_get_channels(struct net_device *netdev, From 49d665b8535e5ea5927b896086dcb2eb65514341 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Mar 2024 17:49:31 -0500 Subject: [PATCH 016/167] qed: Drop useless pci_params.pm_cap qed_init_pci() used pci_params.pm_cap only to cache the pci_dev.pm_cap. Drop the cache and use pci_dev.pm_cap directly. Signed-off-by: Bjorn Helgaas Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240325224931.1462051-1-helgaas@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed.h | 2 -- drivers/net/ethernet/qlogic/qed/qed_main.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 1d719726f72b2..b7def3b549376 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -662,8 +662,6 @@ struct qed_hwfn { }; struct pci_params { - int pm_cap; - unsigned long mem_start; unsigned long mem_end; unsigned int irq; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index c278f8893042b..408da4312e018 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -323,8 +323,7 @@ static int qed_init_pci(struct qed_dev *cdev, struct pci_dev *pdev) goto err2; } - cdev->pci_params.pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM); - if (IS_PF(cdev) && !cdev->pci_params.pm_cap) + if (IS_PF(cdev) && !pdev->pm_cap) DP_NOTICE(cdev, "Cannot find power management capability\n"); rc = dma_set_mask_and_coherent(&cdev->pdev->dev, DMA_BIT_MASK(64)); From fa84513997e9703fbac94b73bbe50aafdb29040e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 09:14:13 +0100 Subject: [PATCH 017/167] ptp: MAINTAINERS: drop Jeff Sipek Emails to Jeff Sipek bounce: Your message to jsipek@vmware.com couldn't be delivered. Recipient is not authorized to accept external mail Status code: 550 5.7.1_ETR Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327081413.306054-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6a233e1a3cf2e..7b8f97bf79751 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23661,7 +23661,6 @@ F: drivers/scsi/vmw_pvscsi.c F: drivers/scsi/vmw_pvscsi.h VMWARE VIRTUAL PTP CLOCK DRIVER -M: Jeff Sipek R: Ajay Kaher R: Alexey Makhalov R: VMware PV-Drivers Reviewers From 037965402a010898d34f4e35327d22c0a95cd51f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Mar 2024 13:14:56 +0100 Subject: [PATCH 018/167] xen-netfront: Add missing skb_mark_for_recycle Notice that skb_mark_for_recycle() is introduced later than fixes tag in commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"). It is believed that fixes tag were missing a call to page_pool_release_page() between v5.9 to v5.14, after which is should have used skb_mark_for_recycle(). Since v6.6 the call page_pool_release_page() were removed (in commit 535b9c61bdef ("net: page_pool: hide page_pool_release_page()") and remaining callers converted (in commit 6bfef2ec0172 ("Merge branch 'net-page_pool-remove-page_pool_release_page'")). This leak became visible in v6.8 via commit dba1b8a7ab68 ("mm/page_pool: catch page_pool memory leaks"). Cc: stable@vger.kernel.org Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront") Reported-by: Leonidas Spyropoulos Link: https://bugzilla.kernel.org/show_bug.cgi?id=218654 Reported-by: Arthur Borsboom Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/171154167446.2671062.9127105384591237363.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/xen-netfront.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ad29f370034e4..8d2aee88526c6 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -285,6 +285,7 @@ static struct sk_buff *xennet_alloc_one_rx_buffer(struct netfront_queue *queue) return NULL; } skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); + skb_mark_for_recycle(skb); /* Align ip header to a 16 bytes boundary */ skb_reserve(skb, NET_IP_ALIGN); From 6e9b01909a811555ff3326cf80a5847169c57806 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 Mar 2024 21:02:12 -0700 Subject: [PATCH 019/167] net: remove gfp_mask from napi_alloc_skb() __napi_alloc_skb() is napi_alloc_skb() with the added flexibility of choosing gfp_mask. This is a NAPI function, so GFP_ATOMIC is implied. The only practical choice the caller has is whether to set __GFP_NOWARN. But that's a false choice, too, allocation failures in atomic context will happen, and printing warnings in logs, effectively for a packet drop, is both too much and very likely non-actionable. This leads me to a conclusion that most uses of napi_alloc_skb() are simply misguided, and should use __GFP_NOWARN in the first place. We also have a "standard" way of reporting allocation failures via the queue stat API (qstats::rx-alloc-fail). The direct motivation for this patch is that one of the drivers used at Meta calls napi_alloc_skb() (so prior to this patch without __GFP_NOWARN), and the resulting OOM warning is the top networking warning in our fleet. Reviewed-by: Alexander Lobakin Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240327040213.3153864-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/mm/page_frags.rst | 2 +- Documentation/translations/zh_CN/mm/page_frags.rst | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 +--- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 +-- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 4 +--- drivers/net/ethernet/intel/ice/ice_txrx.c | 3 +-- drivers/net/ethernet/intel/ice/ice_xsk.c | 3 +-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 5 ++--- drivers/net/ethernet/intel/igc/igc_main.c | 3 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 3 +-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 ++--- include/linux/skbuff.h | 8 +------- net/core/skbuff.c | 9 ++++----- 13 files changed, 18 insertions(+), 36 deletions(-) diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst index a81617e688a84..503ca6cdb8040 100644 --- a/Documentation/mm/page_frags.rst +++ b/Documentation/mm/page_frags.rst @@ -25,7 +25,7 @@ to be disabled when executing the fragment allocation. The network stack uses two separate caches per CPU to handle fragment allocation. The netdev_alloc_cache is used by callers making use of the netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is -used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The +used by callers of the __napi_alloc_frag and napi_alloc_skb calls. The main difference between these two calls is the context in which they may be called. The "netdev" prefixed functions are usable in any context as these functions will disable interrupts, while the "napi" prefixed functions are diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst index 20bd3fafdc8c9..a5b22486a9134 100644 --- a/Documentation/translations/zh_CN/mm/page_frags.rst +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -25,7 +25,7 @@ sk_buff->head使用,或者用于skb_shared_info的 “frags” 部分。 网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用 netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache -被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是 +被调用__napi_alloc_frag和napi_alloc_skb的调用者使用。这两个调用的主要区别是 它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用,因为这些函数 将禁用中断,而 ”napi“ 前缀的函数只可以在softirq上下文中使用。 diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 0d7177083708f..ac2fcc5ac595e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2144,9 +2144,7 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, */ /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - I40E_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, I40E_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 11500003af0d4..a85b425794df2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -301,8 +301,7 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring, net_prefetch(xdp->data_meta); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) goto out; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index b71484c87a846..32bb604a13825 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1334,9 +1334,7 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring, net_prefetch(va); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - IAVF_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, IAVF_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 97d41d6ebf1fb..8bb743f78fcb4 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1051,8 +1051,7 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) } /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 1857220d27fee..aa81d1162b815 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -555,8 +555,7 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) } net_prefetch(xdp->data_meta); - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6dd7a66bb8979..f940f650cd788 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3005,8 +3005,7 @@ struct sk_buff *idpf_rx_construct_skb(struct idpf_queue *rxq, /* prefetch first cache line of first page */ net_prefetch(va); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE, - GFP_ATOMIC); + skb = napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE); if (unlikely(!skb)) { idpf_rx_put_page(rx_buf); @@ -3060,7 +3059,7 @@ static struct sk_buff *idpf_rx_hdr_construct_skb(struct idpf_queue *rxq, struct sk_buff *skb; /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rxq->q_vector->napi, size, GFP_ATOMIC); + skb = napi_alloc_skb(&rxq->q_vector->napi, size); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 35ad40a803cb6..0cd923c8ac9b2 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2712,8 +2712,7 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, net_prefetch(xdp->data_meta); - skb = __napi_alloc_skb(&ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&ring->q_vector->napi, totalsize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index d34d715c59ebc..397cb773fabbe 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -220,8 +220,7 @@ static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring, net_prefetch(xdp->data_meta); /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 24cd80490d19c..bcdde68a099a2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5109,9 +5109,8 @@ static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch, unsigned int datasize = xdp->data_end - xdp->data; struct sk_buff *skb; - skb = __napi_alloc_skb(&ch->rxtx_napi, - xdp->data_end - xdp->data_hard_start, - GFP_ATOMIC | __GFP_NOWARN); + skb = napi_alloc_skb(&ch->rxtx_napi, + xdp->data_end - xdp->data_hard_start); if (unlikely(!skb)) return NULL; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 517e546a120ae..b7f1ecdaec382 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3350,13 +3350,7 @@ static inline void *napi_alloc_frag_align(unsigned int fragsz, return __napi_alloc_frag_align(fragsz, -align); } -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask); -static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, - unsigned int length) -{ - return __napi_alloc_skb(napi, length, GFP_ATOMIC); -} +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length); void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17617c29be2df..a1be84be5d35a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -775,10 +775,9 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, EXPORT_SYMBOL(__netdev_alloc_skb); /** - * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance + * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for * @len: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will * attempt to allocate the head from a special reserved region used @@ -787,9 +786,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, - gfp_t gfp_mask) +struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) { + gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN; struct napi_alloc_cache *nc; struct sk_buff *skb; bool pfmemalloc; @@ -860,7 +859,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb_fail: return skb; } -EXPORT_SYMBOL(__napi_alloc_skb); +EXPORT_SYMBOL(napi_alloc_skb); void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size, unsigned int truesize) From ca7e324e8ad385a2da15049953c04ea7310687f7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 15:22:39 +0100 Subject: [PATCH 020/167] compiler_types: add Endianness-dependent __counted_by_{le,be} Some structures contain flexible arrays at the end and the counter for them, but the counter has explicit Endianness and thus __counted_by() can't be used directly. To increase test coverage for potential problems without breaking anything, introduce __counted_by_{le,be}() defined depending on platform's Endianness to either __counted_by() when applicable or noop otherwise. Maybe it would be a good idea to introduce such attributes on compiler level if possible, but for now let's stop on what we have. Acked-by: Kees Cook Acked-by: Gustavo A. R. Silva Signed-off-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Link: https://lore.kernel.org/r/20240327142241.1745989-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- Documentation/conf.py | 2 ++ include/linux/compiler_types.h | 11 +++++++++++ scripts/kernel-doc | 1 + 3 files changed, 14 insertions(+) diff --git a/Documentation/conf.py b/Documentation/conf.py index d148f3e8dd572..0c2205d536b38 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -75,6 +75,8 @@ def have_command(cmd): "__rcu", "__user", "__force", + "__counted_by_le", + "__counted_by_be", # include/linux/compiler_attributes.h: "__alias", diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 2abaa3a825a9e..a29ba6ef1e275 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -282,6 +282,17 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * Apply __counted_by() when the Endianness matches to increase test coverage. + */ +#ifdef __LITTLE_ENDIAN +#define __counted_by_le(member) __counted_by(member) +#define __counted_by_be(member) +#else +#define __counted_by_le(member) +#define __counted_by_be(member) __counted_by(member) +#endif + /* Do not trap wrapping arithmetic within an annotated function. */ #ifdef CONFIG_UBSAN_SIGNED_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 967f1abb0edbd..1474e95dbe4f0 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1143,6 +1143,7 @@ sub dump_struct($$) { $members =~ s/\s*$attribute/ /gi; $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos; $members =~ s/\s*__counted_by\s*\([^;]*\)/ /gos; + $members =~ s/\s*__counted_by_(le|be)\s*\([^;]*\)/ /gos; $members =~ s/\s*__packed\s*/ /gos; $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos; $members =~ s/\s*____cacheline_aligned_in_smp/ /gos; From c00d33f1fc7958e6e7f461c994fa025aa2273c13 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 15:22:40 +0100 Subject: [PATCH 021/167] idpf: make virtchnl2.h self-contained To ease maintaining of virtchnl2.h, which already is messy enough, make it self-contained by adding missing if_ether.h include due to %ETH_ALEN usage. At the same time, virtchnl2_lan_desc.h is not used anywhere in the file, so move this include to idpf_txrx.h to speed up C preprocessing. Acked-by: Kees Cook Acked-by: Gustavo A. R. Silva Signed-off-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Link: https://lore.kernel.org/r/20240327142241.1745989-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 2 ++ drivers/net/ethernet/intel/idpf/virtchnl2.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index df76493faa756..3d046b81e507a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -8,6 +8,8 @@ #include #include +#include "virtchnl2_lan_desc.h" + #define IDPF_LARGE_MAX_Q 256 #define IDPF_MAX_Q 16 #define IDPF_MIN_Q 2 diff --git a/drivers/net/ethernet/intel/idpf/virtchnl2.h b/drivers/net/ethernet/intel/idpf/virtchnl2.h index 4a3c4454d25ab..29419211b3d9c 100644 --- a/drivers/net/ethernet/intel/idpf/virtchnl2.h +++ b/drivers/net/ethernet/intel/idpf/virtchnl2.h @@ -4,6 +4,8 @@ #ifndef _VIRTCHNL2_H_ #define _VIRTCHNL2_H_ +#include + /* All opcodes associated with virtchnl2 are prefixed with virtchnl2 or * VIRTCHNL2. Any future opcodes, offloads/capabilities, structures, * and defines must be prefixed with virtchnl2 or VIRTCHNL2 to avoid confusion. @@ -17,8 +19,6 @@ * must remain unchanged over time, so we specify explicit values for all enums. */ -#include "virtchnl2_lan_desc.h" - /* This macro is used to generate compilation errors if a structure * is not exactly the correct length. */ From 93d24acfa05ebe954ec1782bca374de2501a5830 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 27 Mar 2024 15:22:41 +0100 Subject: [PATCH 022/167] idpf: sprinkle __counted_by{,_le}() in the virtchnl2 header Both virtchnl2.h and its consumer idpf_virtchnl.c are very error-prone. There are 10 structures with flexible arrays at the end, but 9 of them has flex member counter in Little Endian. Make the code a bit more robust by applying __counted_by_le() to those 9. LE platforms is the main target for this driver, so they would receive additional protection. While we're here, add __counted_by() to virtchnl2_ptype::proto_id, as its counter is `u8` regardless of the Endianness. Compile test on x86_64 (LE) didn't reveal any new issues after applying the attributes. Acked-by: Kees Cook Acked-by: Gustavo A. R. Silva Signed-off-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Link: https://lore.kernel.org/r/20240327142241.1745989-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/virtchnl2.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/virtchnl2.h b/drivers/net/ethernet/intel/idpf/virtchnl2.h index 29419211b3d9c..63deb120359cf 100644 --- a/drivers/net/ethernet/intel/idpf/virtchnl2.h +++ b/drivers/net/ethernet/intel/idpf/virtchnl2.h @@ -555,7 +555,7 @@ VIRTCHNL2_CHECK_STRUCT_LEN(32, virtchnl2_queue_reg_chunk); struct virtchnl2_queue_reg_chunks { __le16 num_chunks; u8 pad[6]; - struct virtchnl2_queue_reg_chunk chunks[]; + struct virtchnl2_queue_reg_chunk chunks[] __counted_by_le(num_chunks); }; VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_queue_reg_chunks); @@ -703,7 +703,7 @@ struct virtchnl2_config_tx_queues { __le32 vport_id; __le16 num_qinfo; u8 pad[10]; - struct virtchnl2_txq_info qinfo[]; + struct virtchnl2_txq_info qinfo[] __counted_by_le(num_qinfo); }; VIRTCHNL2_CHECK_STRUCT_LEN(16, virtchnl2_config_tx_queues); @@ -782,7 +782,7 @@ struct virtchnl2_config_rx_queues { __le32 vport_id; __le16 num_qinfo; u8 pad[18]; - struct virtchnl2_rxq_info qinfo[]; + struct virtchnl2_rxq_info qinfo[] __counted_by_le(num_qinfo); }; VIRTCHNL2_CHECK_STRUCT_LEN(24, virtchnl2_config_rx_queues); @@ -868,7 +868,7 @@ VIRTCHNL2_CHECK_STRUCT_LEN(32, virtchnl2_vector_chunk); struct virtchnl2_vector_chunks { __le16 num_vchunks; u8 pad[14]; - struct virtchnl2_vector_chunk vchunks[]; + struct virtchnl2_vector_chunk vchunks[] __counted_by_le(num_vchunks); }; VIRTCHNL2_CHECK_STRUCT_LEN(16, virtchnl2_vector_chunks); @@ -912,7 +912,7 @@ struct virtchnl2_rss_lut { __le16 lut_entries_start; __le16 lut_entries; u8 pad[4]; - __le32 lut[]; + __le32 lut[] __counted_by_le(lut_entries); }; VIRTCHNL2_CHECK_STRUCT_LEN(12, virtchnl2_rss_lut); @@ -977,7 +977,7 @@ struct virtchnl2_ptype { u8 ptype_id_8; u8 proto_id_count; __le16 pad; - __le16 proto_id[]; + __le16 proto_id[] __counted_by(proto_id_count); } __packed __aligned(2); VIRTCHNL2_CHECK_STRUCT_LEN(6, virtchnl2_ptype); @@ -1104,7 +1104,7 @@ struct virtchnl2_rss_key { __le32 vport_id; __le16 key_len; u8 pad; - u8 key_flex[]; + u8 key_flex[] __counted_by_le(key_len); } __packed; VIRTCHNL2_CHECK_STRUCT_LEN(7, virtchnl2_rss_key); @@ -1131,7 +1131,7 @@ VIRTCHNL2_CHECK_STRUCT_LEN(16, virtchnl2_queue_chunk); struct virtchnl2_queue_chunks { __le16 num_chunks; u8 pad[6]; - struct virtchnl2_queue_chunk chunks[]; + struct virtchnl2_queue_chunk chunks[] __counted_by_le(num_chunks); }; VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_queue_chunks); @@ -1195,7 +1195,7 @@ struct virtchnl2_queue_vector_maps { __le32 vport_id; __le16 num_qv_maps; u8 pad[10]; - struct virtchnl2_queue_vector qv_maps[]; + struct virtchnl2_queue_vector qv_maps[] __counted_by_le(num_qv_maps); }; VIRTCHNL2_CHECK_STRUCT_LEN(16, virtchnl2_queue_vector_maps); @@ -1247,7 +1247,7 @@ struct virtchnl2_mac_addr_list { __le32 vport_id; __le16 num_mac_addr; u8 pad[2]; - struct virtchnl2_mac_addr mac_addr_list[]; + struct virtchnl2_mac_addr mac_addr_list[] __counted_by_le(num_mac_addr); }; VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_mac_addr_list); From 21d9ba5bc5516e1d8a90c4a26de90855a1e4fb59 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 26 Mar 2024 14:32:07 +0100 Subject: [PATCH 023/167] net: phylink: add PHY_F_RXC_ALWAYS_ON to PHY dev flags Some MAC controllers (e.g. stmmac) require their connected PHY to continuously provide a receive clock signal. This can cause issues in two cases: 1. The clock signal hasn't been started yet by the time the MAC driver initializes its hardware. This can make the initialization fail, as in the case of the rzn1 GMAC1 driver. 2. The clock signal is cut during a power saving event. By the time the MAC is brought back up, the clock signal is still not active since phylink_start hasn't been called yet. This brings us back to case 1. If a PHY driver reads this flag, it should ensure that the receive clock signal is started as soon as possible, and that it isn't brought down when the PHY goes into suspend. Signed-off-by: Russell King (Oracle) [rgantois: commit log] Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-1-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 10 +++++++++- include/linux/phy.h | 1 + include/linux/phylink.h | 4 ++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 503fd7c405235..2bb583543dea1 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1923,6 +1923,8 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, static int phylink_attach_phy(struct phylink *pl, struct phy_device *phy, phy_interface_t interface) { + u32 flags = 0; + if (WARN_ON(pl->cfg_link_an_mode == MLO_AN_FIXED || (pl->cfg_link_an_mode == MLO_AN_INBAND && phy_interface_mode_is_8023z(interface) && !pl->sfp_bus))) @@ -1931,7 +1933,10 @@ static int phylink_attach_phy(struct phylink *pl, struct phy_device *phy, if (pl->phydev) return -EBUSY; - return phy_attach_direct(pl->netdev, phy, 0, interface); + if (pl->config->mac_requires_rxc) + flags |= PHY_F_RXC_ALWAYS_ON; + + return phy_attach_direct(pl->netdev, phy, flags, interface); } /** @@ -2034,6 +2039,9 @@ int phylink_fwnode_phy_connect(struct phylink *pl, pl->link_config.interface = pl->link_interface; } + if (pl->config->mac_requires_rxc) + flags |= PHY_F_RXC_ALWAYS_ON; + ret = phy_attach_direct(pl->netdev, phy_dev, flags, pl->link_interface); phy_device_free(phy_dev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3f68b8239bb11..e6e83304558e0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -778,6 +778,7 @@ struct phy_device { /* Generic phy_device::dev_flags */ #define PHY_F_NO_IRQ 0x80000000 +#define PHY_F_RXC_ALWAYS_ON 0x40000000 static inline struct phy_device *to_phy_device(const struct device *dev) { diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 9a57deefcb078..fef8ae66b9883 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -138,6 +138,9 @@ enum phylink_op_type { * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM. + * @mac_requires_rxc: if true, the MAC always requires a receive clock from PHY. + * The PHY driver should start the clock signal as soon as + * possible and avoid stopping it during suspend events. * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. @@ -150,6 +153,7 @@ struct phylink_config { enum phylink_op_type type; bool poll_fixed_state; bool mac_managed_pm; + bool mac_requires_rxc; bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); From dceb393a0a8e82ff05b7c1f6c911d0779d856a7a Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 26 Mar 2024 14:32:08 +0100 Subject: [PATCH 024/167] net: phylink: add rxc_always_on flag to phylink_pcs Some MAC drivers (e.g. stmmac) require a continuous receive clock signal to be generated by a PCS that is handled by a standalone PCS driver. Such a PCS driver does not have access to a PHY device, thus cannot check the PHY_F_RXC_ALWAYS_ON flag. They cannot check max_requires_rxc in the phylink config either, since it is a private member. Therefore, a new flag is needed to signal to the PCS that it should keep the RX clock signal up at all times. Co-developed-by: Russell King (Oracle) Signed-off-by: Russell King (Oracle) Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-2-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 15 +++++++++++++++ include/linux/phylink.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 2bb583543dea1..84a97088dfc6b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1042,6 +1042,21 @@ static void phylink_pcs_poll_start(struct phylink *pl) mod_timer(&pl->link_poll, jiffies + HZ); } +int phylink_pcs_pre_init(struct phylink *pl, struct phylink_pcs *pcs) +{ + int ret = 0; + + /* Signal to PCS driver that MAC requires RX clock for init */ + if (pl->config->mac_requires_rxc) + pcs->rxc_always_on = true; + + if (pcs->ops->pcs_pre_init) + ret = pcs->ops->pcs_pre_init(pcs); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_pcs_pre_init); + static void phylink_mac_config(struct phylink *pl, const struct phylink_link_state *state) { diff --git a/include/linux/phylink.h b/include/linux/phylink.h index fef8ae66b9883..5ea6b2ad23963 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -396,6 +396,10 @@ struct phylink_pcs_ops; * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument * @poll: poll the PCS for link changes + * @rxc_always_on: The MAC driver requires the reference clock + * to always be on. Standalone PCS drivers which + * do not have access to a PHY device can check + * this instead of PHY_F_RXC_ALWAYS_ON. * * This structure is designed to be embedded within the PCS private data, * and will be passed between phylink and the PCS. @@ -408,6 +412,7 @@ struct phylink_pcs { struct phylink *phylink; bool neg_mode; bool poll; + bool rxc_always_on; }; /** @@ -422,6 +427,8 @@ struct phylink_pcs { * @pcs_an_restart: restart 802.3z BaseX autonegotiation. * @pcs_link_up: program the PCS for the resolved link configuration * (where necessary). + * @pcs_pre_init: configure PCS components necessary for MAC hardware + * initialization e.g. RX clock for stmmac. */ struct phylink_pcs_ops { int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, @@ -441,6 +448,7 @@ struct phylink_pcs_ops { void (*pcs_an_restart)(struct phylink_pcs *pcs); void (*pcs_link_up)(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, int speed, int duplex); + int (*pcs_pre_init)(struct phylink_pcs *pcs); }; #if 0 /* For kernel-doc purposes only. */ @@ -546,6 +554,34 @@ void pcs_an_restart(struct phylink_pcs *pcs); */ void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, int speed, int duplex); + +/** + * pcs_pre_init() - Configure PCS components necessary for MAC initialization + * @pcs: a pointer to a &struct phylink_pcs. + * + * This function can be called by MAC drivers through the + * phylink_pcs_pre_init() wrapper, before their hardware is initialized. It + * should not be called after the link is brought up, as reconfiguring the PCS + * at this point could break the link. + * + * Some MAC devices require specific hardware initialization to be performed by + * their associated PCS device before they can properly initialize their own + * hardware. An example of this is the initialization of stmmac controllers, + * which requires an active REF_CLK signal to be provided by the PHY/PCS. + * + * By calling phylink_pcs_pre_init(), MAC drivers can ensure that the PCS is + * setup in a way that allows for successful hardware initialization. + * + * The specific configuration performed by pcs_pre_init() is dependent on the + * model of PCS and the requirements of the MAC device attached to it. PCS + * driver authors should consider whether their target device is to be used in + * conjunction with a MAC device whose driver calls phylink_pcs_pre_init(). MAC + * driver authors should document their requirements for the PCS + * pre-initialization. + * + */ +int pcs_pre_init(struct phylink_pcs *pcs); + #endif struct phylink *phylink_create(struct phylink_config *, @@ -565,6 +601,8 @@ void phylink_disconnect_phy(struct phylink *); void phylink_mac_change(struct phylink *, bool up); void phylink_pcs_change(struct phylink_pcs *, bool up); +int phylink_pcs_pre_init(struct phylink *pl, struct phylink_pcs *pcs); + void phylink_start(struct phylink *); void phylink_stop(struct phylink *); From 10658e99d952769002c11f7884eebb5a0310d161 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Tue, 26 Mar 2024 14:32:09 +0100 Subject: [PATCH 025/167] net: stmmac: don't rely on lynx_pcs presence to check for a PHY When initializing attached PHYs, there are some cases where we don't expect any PHY to be connected. The logic uses conditions based on various local PCS configuration, but also calls-in phylink_expects_phy() via stmmac_init_phy(), which is enough to ensure we don't try to initialize a PHY when using a Lynx PCS, as long as we have the phy_interface set to a 802.3z mode and are using inband negociation. Drop the lynx check, making the stmmac generic code more pcs_lynx-agnostic. Signed-off-by: Maxime Chevallier [rgantois: commit log] Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-3-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bcdde68a099a2..9fb8750248a10 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3960,8 +3960,7 @@ static int __stmmac_open(struct net_device *dev, if (priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI && (!priv->hw->xpcs || - xpcs_get_an_mode(priv->hw->xpcs, mode) != DW_AN_C73) && - !priv->hw->lynx_pcs) { + xpcs_get_an_mode(priv->hw->xpcs, mode) != DW_AN_C73)) { ret = stmmac_init_phy(dev); if (ret) { netdev_err(priv->dev, From f7bff228a616a7eb6cba9246e97ec4da543aa53a Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 26 Mar 2024 14:32:10 +0100 Subject: [PATCH 026/167] net: stmmac: Support a generic PCS field in mac_device_info Global stmmac support for early initialization of PCS devices requires a generic PCS reference that can be passed to phylink_pcs_pre_init(). Currently, the mac_device_info struct contains only one PCS field, which is specific to the Lynx model. As PCS models are hardware-specific, it is more appropriate to have a generic PCS field in the mac_device_info struct. Refactor the lynx_pcs field into a generic phylink_pcs field. Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-4-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 8 ++++---- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index a6fefe675ef15..f55cf09f0783a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -593,7 +593,7 @@ struct mac_device_info { const struct stmmac_mmc_ops *mmc; const struct stmmac_est_ops *est; struct dw_xpcs *xpcs; - struct phylink_pcs *lynx_pcs; /* Lynx external PCS */ + struct phylink_pcs *phylink_pcs; struct mii_regs mii; /* MII register Addresses */ struct mac_link link; void __iomem *pcsr; /* vpointer to device CSRs */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 68f85e4605cba..12b4a80ea3aa1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -479,9 +479,9 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) goto err_dvr_remove; } - stpriv->hw->lynx_pcs = lynx_pcs_create_mdiodev(pcs_bus, 0); - if (IS_ERR(stpriv->hw->lynx_pcs)) { - ret = PTR_ERR(stpriv->hw->lynx_pcs); + stpriv->hw->phylink_pcs = lynx_pcs_create_mdiodev(pcs_bus, 0); + if (IS_ERR(stpriv->hw->phylink_pcs)) { + ret = PTR_ERR(stpriv->hw->phylink_pcs); goto err_dvr_remove; } } @@ -498,7 +498,7 @@ static void socfpga_dwmac_remove(struct platform_device *pdev) { struct net_device *ndev = platform_get_drvdata(pdev); struct stmmac_priv *priv = netdev_priv(ndev); - struct phylink_pcs *pcs = priv->hw->lynx_pcs; + struct phylink_pcs *pcs = priv->hw->phylink_pcs; stmmac_pltfr_remove(pdev); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9fb8750248a10..1ee06a6e5c22f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -944,10 +944,7 @@ static struct phylink_pcs *stmmac_mac_select_pcs(struct phylink_config *config, if (priv->hw->xpcs) return &priv->hw->xpcs->pcs; - if (priv->hw->lynx_pcs) - return priv->hw->lynx_pcs; - - return NULL; + return priv->hw->phylink_pcs; } static void stmmac_mac_config(struct phylink_config *config, unsigned int mode, From 58329b03a5957904fa2b33b3824ed19e7b42c9e9 Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 26 Mar 2024 14:32:11 +0100 Subject: [PATCH 027/167] net: stmmac: Signal to PHY/PCS drivers to keep RX clock on MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a reocurring issue with stmmac controllers where the MAC fails to initialize its hardware if an RX clock signal isn't provided on the MAC/PHY link. This causes issues when PHY or PCS devices either go into suspend while cutting the RX clock or do not bring the clock signal up early enough for the MAC to initialize successfully. Set the mac_requires_rxc flag in the stmmac phylink config so that PHY/PCS drivers know to keep the RX clock up at all times. Reported-by: Clark Wang Link: https://lore.kernel.org/all/20230202081559.3553637-1-xiaoning.wang@nxp.com/ Reported-by: Clément Léger Link: https://lore.kernel.org/linux-arm-kernel/20230116103926.276869-4-clement.leger@bootlin.com/ Co-developed-by: Russell King (Oracle) Signed-off-by: Russell King (Oracle) Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-5-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 1ee06a6e5c22f..fe3498e86de9d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1218,6 +1218,9 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.type = PHYLINK_NETDEV; priv->phylink_config.mac_managed_pm = true; + /* Stmmac always requires an RX clock for hardware initialization */ + priv->phylink_config.mac_requires_rxc = true; + mdio_bus_data = priv->plat->mdio_bus_data; if (mdio_bus_data) priv->phylink_config.ovr_an_inband = @@ -3408,6 +3411,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) u32 chan; int ret; + /* Make sure RX clock is enabled */ + if (priv->hw->phylink_pcs) + phylink_pcs_pre_init(priv->phylink, priv->hw->phylink_pcs); + /* DMA initialization and SW reset */ ret = stmmac_init_dma_engine(priv); if (ret < 0) { From 30dc5873967ecc0282a8283622156cba4be8daf4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 26 Mar 2024 14:32:12 +0100 Subject: [PATCH 028/167] net: phy: qcom: at803x: Avoid hibernating if MAC requires RX clock Stmmac controllers connected to an at803x PHY cannot resume properly after suspend when WoL is enabled. This happens because the MAC requires an RX clock generated by the PHY to initialize its hardware properly. But the RX clock is cut when the PHY suspends and isn't brought up until the MAC driver resumes the phylink. Prevent the at803x PHY driver from going into suspend if the attached MAC driver always requires an RX clock signal. Reported-by: Clark Wang Link: https://lore.kernel.org/all/20230202081559.3553637-1-xiaoning.wang@nxp.com/ Signed-off-by: Russell King (Oracle) [rgantois: commit log] Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-6-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/qcom/at803x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c index e79657f76bea2..c8f83e5f78ab9 100644 --- a/drivers/net/phy/qcom/at803x.c +++ b/drivers/net/phy/qcom/at803x.c @@ -426,7 +426,8 @@ static int at803x_hibernation_mode_config(struct phy_device *phydev) /* The default after hardware reset is hibernation mode enabled. After * software reset, the value is retained. */ - if (!(priv->flags & AT803X_DISABLE_HIBERNATION_MODE)) + if (!(priv->flags & AT803X_DISABLE_HIBERNATION_MODE) && + !(phydev->dev_flags & PHY_F_RXC_ALWAYS_ON)) return 0; return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_HIB_CTRL, From 0f671b3b6edf85cfe6fc80ad42963973bee1838c Mon Sep 17 00:00:00 2001 From: Romain Gantois Date: Tue, 26 Mar 2024 14:32:13 +0100 Subject: [PATCH 029/167] net: pcs: rzn1-miic: Init RX clock early if MAC requires it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GMAC1 controller in the RZN1 IP requires the RX MII clock signal to be started before it initializes its own hardware, thus before it calls phylink_start. Implement the pcs_pre_init() callback so that the RX clock signal can be enabled early if necessary. Reported-by: Clément Léger Link: https://lore.kernel.org/linux-arm-kernel/20230116103926.276869-4-clement.leger@bootlin.com/ Signed-off-by: Romain Gantois Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-7-24a74e5c761f@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-rzn1-miic.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/net/pcs/pcs-rzn1-miic.c b/drivers/net/pcs/pcs-rzn1-miic.c index 4bd66fdde3674..d0a722d43368f 100644 --- a/drivers/net/pcs/pcs-rzn1-miic.c +++ b/drivers/net/pcs/pcs-rzn1-miic.c @@ -279,10 +279,38 @@ static int miic_validate(struct phylink_pcs *pcs, unsigned long *supported, return -EINVAL; } +static int miic_pre_init(struct phylink_pcs *pcs) +{ + struct miic_port *miic_port = phylink_pcs_to_miic_port(pcs); + struct miic *miic = miic_port->miic; + u32 val, mask; + + /* Start RX clock if required */ + if (pcs->rxc_always_on) { + /* In MII through mode, the clock signals will be driven by the + * external PHY, which might not be initialized yet. Set RMII + * as default mode to ensure that a reference clock signal is + * generated. + */ + miic_port->interface = PHY_INTERFACE_MODE_RMII; + + val = FIELD_PREP(MIIC_CONVCTRL_CONV_MODE, CONV_MODE_RMII) | + FIELD_PREP(MIIC_CONVCTRL_CONV_SPEED, CONV_MODE_100MBPS); + mask = MIIC_CONVCTRL_CONV_MODE | MIIC_CONVCTRL_CONV_SPEED; + + miic_reg_rmw(miic, MIIC_CONVCTRL(miic_port->port), mask, val); + + miic_converter_enable(miic, miic_port->port, 1); + } + + return 0; +} + static const struct phylink_pcs_ops miic_phylink_ops = { .pcs_validate = miic_validate, .pcs_config = miic_config, .pcs_link_up = miic_link_up, + .pcs_pre_init = miic_pre_init, }; struct phylink_pcs *miic_create(struct device *dev, struct device_node *np) From 7de3c2218eed77ed4771521459a18bef938f7089 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Mar 2024 15:28:51 -0700 Subject: [PATCH 030/167] bnxt_en: Add a timeout parameter to bnxt_hwrm_port_ts_query() The caller can pass this new timeout parameter to the function to specify the firmware timeout value when requesting the TX timestamp from the firmware. This will allow the caller to precisely control the timeout and will be used in the next patch. In this patch, the parameter is 0 which means to use the current default value. Cc: Richard Cochran Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240325222902.220712-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 16 ++++++++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index cc07660330f53..dbfd1b36774c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -109,7 +109,8 @@ static void bnxt_ptp_get_current_time(struct bnxt *bp) spin_unlock_bh(&ptp->ptp_lock); } -static int bnxt_hwrm_port_ts_query(struct bnxt *bp, u32 flags, u64 *ts) +static int bnxt_hwrm_port_ts_query(struct bnxt *bp, u32 flags, u64 *ts, + u32 txts_tmo) { struct hwrm_port_ts_query_output *resp; struct hwrm_port_ts_query_input *req; @@ -122,10 +123,15 @@ static int bnxt_hwrm_port_ts_query(struct bnxt *bp, u32 flags, u64 *ts) req->flags = cpu_to_le32(flags); if ((flags & PORT_TS_QUERY_REQ_FLAGS_PATH) == PORT_TS_QUERY_REQ_FLAGS_PATH_TX) { + u32 tmo_us = txts_tmo * 1000; + req->enables = cpu_to_le16(BNXT_PTP_QTS_TX_ENABLES); req->ptp_seq_id = cpu_to_le32(bp->ptp_cfg->tx_seqid); req->ptp_hdr_offset = cpu_to_le16(bp->ptp_cfg->tx_hdr_off); - req->ts_req_timeout = cpu_to_le16(BNXT_PTP_QTS_TIMEOUT); + if (!tmo_us) + tmo_us = BNXT_PTP_QTS_TIMEOUT; + tmo_us = min(tmo_us, BNXT_PTP_QTS_MAX_TMO_US); + req->ts_req_timeout = cpu_to_le16(txts_tmo); } resp = hwrm_req_hold(bp, req); @@ -675,7 +681,8 @@ static void bnxt_stamp_tx_skb(struct bnxt *bp, struct sk_buff *skb) u64 ts = 0, ns = 0; int rc; - rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_PATH_TX, &ts); + rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_PATH_TX, &ts, + 0); if (!rc) { memset(×tamp, 0, sizeof(timestamp)); spin_lock_bh(&ptp->ptp_lock); @@ -891,7 +898,8 @@ int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg) if (rc) return rc; } else { - rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_CURRENT_TIME, &ns); + rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_CURRENT_TIME, + &ns, 0); if (rc) return rc; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index fce8dc39a7d0e..04886d5f22adc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -23,6 +23,7 @@ #define BNXT_HI_TIMER_MASK 0xffff00000000UL #define BNXT_PTP_QTS_TIMEOUT 1000 +#define BNXT_PTP_QTS_MAX_TMO_US 65535 #define BNXT_PTP_QTS_TX_ENABLES (PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID | \ PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT | \ PORT_TS_QUERY_REQ_ENABLES_PTP_HDR_OFFSET) From 604041643a858bc1d3926e0a32ebc482e8beaee2 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:52 -0700 Subject: [PATCH 031/167] bnxt_en: Retry PTP TX timestamp from FW for 1 second Use a new default 1 second timeout value instead of the existing 1 msec value. The driver will keep track of the remaining time before timeout and will pass this value to bnxt_hwrm_port_ts_query(). The firmware supports timeout values up to 65535 usecs. If the timeout value passed to bnxt_hwrm_port_ts_query() is less than the FW max value, we will use that value to precisely control the specified timeout. If it is larger than the FW max value, we will use the FW max value and any additional retry to reach the desired timeout will be done in the context of bnxt_ptp_ts_aux_eork(). Link: https://lore.kernel.org/netdev/20240229070202.107488-2-michael.chan@broadcom.com/ Cc: Richard Cochran Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20240325222902.220712-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 16 +++++++++++++++- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 4 ++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index dbfd1b36774c3..345aac4484ee4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -678,11 +678,17 @@ static void bnxt_stamp_tx_skb(struct bnxt *bp, struct sk_buff *skb) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; struct skb_shared_hwtstamps timestamp; + unsigned long now = jiffies; u64 ts = 0, ns = 0; + u32 tmo = 0; int rc; + if (!ptp->txts_pending) + ptp->abs_txts_tmo = now + msecs_to_jiffies(ptp->txts_tmo); + if (!time_after_eq(now, ptp->abs_txts_tmo)) + tmo = jiffies_to_msecs(ptp->abs_txts_tmo - now); rc = bnxt_hwrm_port_ts_query(bp, PORT_TS_QUERY_REQ_FLAGS_PATH_TX, &ts, - 0); + tmo); if (!rc) { memset(×tamp, 0, sizeof(timestamp)); spin_lock_bh(&ptp->ptp_lock); @@ -691,6 +697,10 @@ static void bnxt_stamp_tx_skb(struct bnxt *bp, struct sk_buff *skb) timestamp.hwtstamp = ns_to_ktime(ns); skb_tstamp_tx(ptp->tx_skb, ×tamp); } else { + if (!time_after_eq(jiffies, ptp->abs_txts_tmo)) { + ptp->txts_pending = true; + return; + } netdev_warn_once(bp->dev, "TS query for TX timer failed rc = %x\n", rc); } @@ -698,6 +708,7 @@ static void bnxt_stamp_tx_skb(struct bnxt *bp, struct sk_buff *skb) dev_kfree_skb_any(ptp->tx_skb); ptp->tx_skb = NULL; atomic_inc(&ptp->tx_avail); + ptp->txts_pending = false; } static long bnxt_ptp_ts_aux_work(struct ptp_clock_info *ptp_info) @@ -721,6 +732,8 @@ static long bnxt_ptp_ts_aux_work(struct ptp_clock_info *ptp_info) spin_unlock_bh(&ptp->ptp_lock); ptp->next_overflow_check = now + BNXT_PHC_OVERFLOW_PERIOD; } + if (ptp->txts_pending) + return 0; return HZ; } @@ -973,6 +986,7 @@ int bnxt_ptp_init(struct bnxt *bp, bool phc_cfg) spin_unlock_bh(&ptp->ptp_lock); ptp_schedule_worker(ptp->ptp_clock, 0); } + ptp->txts_tmo = BNXT_PTP_DFLT_TX_TMO; return 0; out: diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index 04886d5f22adc..6a2bba3f9e2dd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -22,6 +22,7 @@ #define BNXT_LO_TIMER_MASK 0x0000ffffffffUL #define BNXT_HI_TIMER_MASK 0xffff00000000UL +#define BNXT_PTP_DFLT_TX_TMO 1000 /* ms */ #define BNXT_PTP_QTS_TIMEOUT 1000 #define BNXT_PTP_QTS_MAX_TMO_US 65535 #define BNXT_PTP_QTS_TX_ENABLES (PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID | \ @@ -116,11 +117,14 @@ struct bnxt_ptp_cfg { BNXT_PTP_MSG_PDELAY_REQ | \ BNXT_PTP_MSG_PDELAY_RESP) u8 tx_tstamp_en:1; + u8 txts_pending:1; int rx_filter; u32 tstamp_filters; u32 refclk_regs[2]; u32 refclk_mapped_regs[2]; + u32 txts_tmo; + unsigned long abs_txts_tmo; }; #if BITS_PER_LONG == 32 From 1dcd70ba2437b52d18f3ad4cc65931a74bdcb353 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:53 -0700 Subject: [PATCH 032/167] bnxt_en: Add helper function bnxt_hwrm_vnic_rss_cfg_p5() This is a pure refactoring patch. The new function bnxt_hwrm_vnic_set_rss_p5() will set up the P5_PLUS specific RSS ring table and then call bnxt_hwrm_vnic_cfg() to setup the vnic for proper RSS operations. This new function will be used later for additional RSS contexts. Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 +++++++++++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 493b724848c8f..dd2c5033af027 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9825,6 +9825,23 @@ static int __bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) return rc; } +int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, u16 vnic_id) +{ + int rc; + + rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic_id, true); + if (rc) { + netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %d\n", + vnic_id, rc); + return rc; + } + rc = bnxt_hwrm_vnic_cfg(bp, vnic_id); + if (rc) + netdev_err(bp->dev, "hwrm vnic %d cfg failure rc: %x\n", + vnic_id, rc); + return rc; +} + static int __bnxt_setup_vnic_p5(struct bnxt *bp, u16 vnic_id) { int rc, i, nr_ctxs; @@ -9842,18 +9859,10 @@ static int __bnxt_setup_vnic_p5(struct bnxt *bp, u16 vnic_id) if (i < nr_ctxs) return -ENOMEM; - rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic_id, true); - if (rc) { - netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %d\n", - vnic_id, rc); - return rc; - } - rc = bnxt_hwrm_vnic_cfg(bp, vnic_id); - if (rc) { - netdev_err(bp->dev, "hwrm vnic %d cfg failure rc: %x\n", - vnic_id, rc); + rc = bnxt_hwrm_vnic_rss_cfg_p5(bp, vnic_id); + if (rc) return rc; - } + if (bp->flags & BNXT_FLAG_AGG_RINGS) { rc = bnxt_hwrm_vnic_set_hds(bp, vnic_id); if (rc) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index dd849e715c9ba..fbb53308cb815 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2721,6 +2721,7 @@ int bnxt_hwrm_free_wol_fltr(struct bnxt *bp); int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); int bnxt_hwrm_func_qcaps(struct bnxt *bp); int bnxt_hwrm_fw_set_time(struct bnxt *); +int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, u16 vnic_id); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_half_open_nic(struct bnxt *bp); void bnxt_half_close_nic(struct bnxt *bp); From a4c11166a69619f1fd0118a0790c46872836240a Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:54 -0700 Subject: [PATCH 033/167] bnxt_en: Refactor VNIC alloc and cfg functions The current VNIC structures are stored in an array bp->vnic_info[]. The index of the array (vnic_id) is passed to all the functions that need to reference the VNIC. This patch changes the scheme to pass the VNIC pointer instead of the vnic index. Subsequent patches will create additional VNICs that will not be stored in the bp->vnic_info[] array. Using the VNIC pointer will work for all the VNICs. Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-5-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 140 +++++++++--------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 5 +- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 +- 3 files changed, 75 insertions(+), 72 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dd2c5033af027..3f5d7c81a2817 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4241,6 +4241,7 @@ static void bnxt_init_vnics(struct bnxt *bp) int j; vnic->fw_vnic_id = INVALID_HW_RING_ID; + vnic->vnic_id = i; for (j = 0; j < BNXT_MAX_CTX_PER_VNIC; j++) vnic->fw_rss_cos_lb_ctx[j] = INVALID_HW_RING_ID; @@ -5938,9 +5939,9 @@ static void bnxt_hwrm_vnic_update_tunl_tpa(struct bnxt *bp, req->tnl_tpa_en_bitmap = cpu_to_le32(tunl_tpa_bmap); } -static int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, u16 vnic_id, u32 tpa_flags) +static int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, + u32 tpa_flags) { - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; u16 max_aggs = VNIC_TPA_CFG_REQ_MAX_AGGS_MAX; struct hwrm_vnic_tpa_cfg_input *req; int rc; @@ -6154,9 +6155,9 @@ __bnxt_hwrm_vnic_set_rss(struct bnxt *bp, struct hwrm_vnic_rss_cfg_input *req, req->hash_key_tbl_addr = cpu_to_le64(vnic->rss_hash_key_dma_addr); } -static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, u16 vnic_id, bool set_rss) +static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, struct bnxt_vnic_info *vnic, + bool set_rss) { - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; struct hwrm_vnic_rss_cfg_input *req; int rc; @@ -6174,9 +6175,9 @@ static int bnxt_hwrm_vnic_set_rss(struct bnxt *bp, u16 vnic_id, bool set_rss) return hwrm_req_send(bp, req); } -static int bnxt_hwrm_vnic_set_rss_p5(struct bnxt *bp, u16 vnic_id, bool set_rss) +static int bnxt_hwrm_vnic_set_rss_p5(struct bnxt *bp, + struct bnxt_vnic_info *vnic, bool set_rss) { - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; struct hwrm_vnic_rss_cfg_input *req; dma_addr_t ring_tbl_map; u32 i, nr_ctxs; @@ -6229,9 +6230,8 @@ static void bnxt_hwrm_update_rss_hash_cfg(struct bnxt *bp) hwrm_req_drop(bp, req); } -static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, u16 vnic_id) +static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) { - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; struct hwrm_vnic_plcmodes_cfg_input *req; int rc; @@ -6256,7 +6256,8 @@ static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, u16 vnic_id) return hwrm_req_send(bp, req); } -static void bnxt_hwrm_vnic_ctx_free_one(struct bnxt *bp, u16 vnic_id, +static void bnxt_hwrm_vnic_ctx_free_one(struct bnxt *bp, + struct bnxt_vnic_info *vnic, u16 ctx_idx) { struct hwrm_vnic_rss_cos_lb_ctx_free_input *req; @@ -6265,10 +6266,10 @@ static void bnxt_hwrm_vnic_ctx_free_one(struct bnxt *bp, u16 vnic_id, return; req->rss_cos_lb_ctx_id = - cpu_to_le16(bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx]); + cpu_to_le16(vnic->fw_rss_cos_lb_ctx[ctx_idx]); hwrm_req_send(bp, req); - bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx] = INVALID_HW_RING_ID; + vnic->fw_rss_cos_lb_ctx[ctx_idx] = INVALID_HW_RING_ID; } static void bnxt_hwrm_vnic_ctx_free(struct bnxt *bp) @@ -6280,13 +6281,14 @@ static void bnxt_hwrm_vnic_ctx_free(struct bnxt *bp) for (j = 0; j < BNXT_MAX_CTX_PER_VNIC; j++) { if (vnic->fw_rss_cos_lb_ctx[j] != INVALID_HW_RING_ID) - bnxt_hwrm_vnic_ctx_free_one(bp, i, j); + bnxt_hwrm_vnic_ctx_free_one(bp, vnic, j); } } bp->rsscos_nr_ctxs = 0; } -static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx) +static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, + struct bnxt_vnic_info *vnic, u16 ctx_idx) { struct hwrm_vnic_rss_cos_lb_ctx_alloc_output *resp; struct hwrm_vnic_rss_cos_lb_ctx_alloc_input *req; @@ -6299,7 +6301,7 @@ static int bnxt_hwrm_vnic_ctx_alloc(struct bnxt *bp, u16 vnic_id, u16 ctx_idx) resp = hwrm_req_hold(bp, req); rc = hwrm_req_send(bp, req); if (!rc) - bp->vnic_info[vnic_id].fw_rss_cos_lb_ctx[ctx_idx] = + vnic->fw_rss_cos_lb_ctx[ctx_idx] = le16_to_cpu(resp->rss_cos_lb_ctx_id); hwrm_req_drop(bp, req); @@ -6313,10 +6315,9 @@ static u32 bnxt_get_roce_vnic_mode(struct bnxt *bp) return VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE; } -int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id) +int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic) { struct bnxt_vnic_info *vnic0 = &bp->vnic_info[BNXT_VNIC_DEFAULT]; - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; struct hwrm_vnic_cfg_input *req; unsigned int ring = 0, grp_idx; u16 def_vlan = 0; @@ -6364,8 +6365,8 @@ int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id) if (vnic->flags & BNXT_VNIC_RSS_FLAG) ring = 0; else if (vnic->flags & BNXT_VNIC_RFS_FLAG) - ring = vnic_id - 1; - else if ((vnic_id == 1) && BNXT_CHIP_TYPE_NITRO_A0(bp)) + ring = vnic->vnic_id - 1; + else if ((vnic->vnic_id == 1) && BNXT_CHIP_TYPE_NITRO_A0(bp)) ring = bp->rx_nr_rings - 1; grp_idx = bp->rx_ring[ring].bnapi->index; @@ -6381,25 +6382,25 @@ int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id) #endif if ((bp->flags & BNXT_FLAG_STRIP_VLAN) || def_vlan) req->flags |= cpu_to_le32(VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE); - if (!vnic_id && bnxt_ulp_registered(bp->edev)) + if (vnic->vnic_id == BNXT_VNIC_DEFAULT && bnxt_ulp_registered(bp->edev)) req->flags |= cpu_to_le32(bnxt_get_roce_vnic_mode(bp)); return hwrm_req_send(bp, req); } -static void bnxt_hwrm_vnic_free_one(struct bnxt *bp, u16 vnic_id) +static void bnxt_hwrm_vnic_free_one(struct bnxt *bp, + struct bnxt_vnic_info *vnic) { - if (bp->vnic_info[vnic_id].fw_vnic_id != INVALID_HW_RING_ID) { + if (vnic->fw_vnic_id != INVALID_HW_RING_ID) { struct hwrm_vnic_free_input *req; if (hwrm_req_init(bp, req, HWRM_VNIC_FREE)) return; - req->vnic_id = - cpu_to_le32(bp->vnic_info[vnic_id].fw_vnic_id); + req->vnic_id = cpu_to_le32(vnic->fw_vnic_id); hwrm_req_send(bp, req); - bp->vnic_info[vnic_id].fw_vnic_id = INVALID_HW_RING_ID; + vnic->fw_vnic_id = INVALID_HW_RING_ID; } } @@ -6408,15 +6409,14 @@ static void bnxt_hwrm_vnic_free(struct bnxt *bp) u16 i; for (i = 0; i < bp->nr_vnics; i++) - bnxt_hwrm_vnic_free_one(bp, i); + bnxt_hwrm_vnic_free_one(bp, &bp->vnic_info[i]); } -static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 vnic_id, +static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, unsigned int start_rx_ring_idx, unsigned int nr_rings) { unsigned int i, j, grp_idx, end_idx = start_rx_ring_idx + nr_rings; - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; struct hwrm_vnic_alloc_output *resp; struct hwrm_vnic_alloc_input *req; int rc; @@ -6442,7 +6442,7 @@ static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, u16 vnic_id, vnic_no_ring_grps: for (i = 0; i < BNXT_MAX_CTX_PER_VNIC; i++) vnic->fw_rss_cos_lb_ctx[i] = INVALID_HW_RING_ID; - if (vnic_id == BNXT_VNIC_DEFAULT) + if (vnic->vnic_id == BNXT_VNIC_DEFAULT) req->flags = cpu_to_le32(VNIC_ALLOC_REQ_FLAGS_DEFAULT); resp = hwrm_req_hold(bp, req); @@ -9676,7 +9676,7 @@ static int bnxt_set_tpa(struct bnxt *bp, bool set_tpa) else if (BNXT_NO_FW_ACCESS(bp)) return 0; for (i = 0; i < bp->nr_vnics; i++) { - rc = bnxt_hwrm_vnic_set_tpa(bp, i, tpa_flags); + rc = bnxt_hwrm_vnic_set_tpa(bp, &bp->vnic_info[i], tpa_flags); if (rc) { netdev_err(bp->dev, "hwrm vnic set tpa failure rc for vnic %d: %x\n", i, rc); @@ -9691,7 +9691,7 @@ static void bnxt_hwrm_clear_vnic_rss(struct bnxt *bp) int i; for (i = 0; i < bp->nr_vnics; i++) - bnxt_hwrm_vnic_set_rss(bp, i, false); + bnxt_hwrm_vnic_set_rss(bp, &bp->vnic_info[i], false); } static void bnxt_clear_vnic(struct bnxt *bp) @@ -9769,28 +9769,27 @@ static int bnxt_hwrm_set_cache_line_size(struct bnxt *bp, int size) return hwrm_req_send(bp, req); } -static int __bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) +static int __bnxt_setup_vnic(struct bnxt *bp, struct bnxt_vnic_info *vnic) { - struct bnxt_vnic_info *vnic = &bp->vnic_info[vnic_id]; int rc; if (vnic->flags & BNXT_VNIC_RFS_NEW_RSS_FLAG) goto skip_rss_ctx; /* allocate context for vnic */ - rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 0); + rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic, 0); if (rc) { netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); goto vnic_setup_err; } bp->rsscos_nr_ctxs++; if (BNXT_CHIP_TYPE_NITRO_A0(bp)) { - rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, 1); + rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic, 1); if (rc) { netdev_err(bp->dev, "hwrm vnic %d cos ctx alloc failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); goto vnic_setup_err; } bp->rsscos_nr_ctxs++; @@ -9798,26 +9797,26 @@ static int __bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) skip_rss_ctx: /* configure default vnic, ring grp */ - rc = bnxt_hwrm_vnic_cfg(bp, vnic_id); + rc = bnxt_hwrm_vnic_cfg(bp, vnic); if (rc) { netdev_err(bp->dev, "hwrm vnic %d cfg failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); goto vnic_setup_err; } /* Enable RSS hashing on vnic */ - rc = bnxt_hwrm_vnic_set_rss(bp, vnic_id, true); + rc = bnxt_hwrm_vnic_set_rss(bp, vnic, true); if (rc) { netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); goto vnic_setup_err; } if (bp->flags & BNXT_FLAG_AGG_RINGS) { - rc = bnxt_hwrm_vnic_set_hds(bp, vnic_id); + rc = bnxt_hwrm_vnic_set_hds(bp, vnic); if (rc) { netdev_err(bp->dev, "hwrm vnic %d set hds failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); } } @@ -9825,33 +9824,33 @@ static int __bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) return rc; } -int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, u16 vnic_id) +int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) { int rc; - rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic_id, true); + rc = bnxt_hwrm_vnic_set_rss_p5(bp, vnic, true); if (rc) { netdev_err(bp->dev, "hwrm vnic %d set rss failure rc: %d\n", - vnic_id, rc); + vnic->vnic_id, rc); return rc; } - rc = bnxt_hwrm_vnic_cfg(bp, vnic_id); + rc = bnxt_hwrm_vnic_cfg(bp, vnic); if (rc) netdev_err(bp->dev, "hwrm vnic %d cfg failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); return rc; } -static int __bnxt_setup_vnic_p5(struct bnxt *bp, u16 vnic_id) +static int __bnxt_setup_vnic_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) { int rc, i, nr_ctxs; nr_ctxs = bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings); for (i = 0; i < nr_ctxs; i++) { - rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic_id, i); + rc = bnxt_hwrm_vnic_ctx_alloc(bp, vnic, i); if (rc) { netdev_err(bp->dev, "hwrm vnic %d ctx %d alloc failure rc: %x\n", - vnic_id, i, rc); + vnic->vnic_id, i, rc); break; } bp->rsscos_nr_ctxs++; @@ -9859,55 +9858,57 @@ static int __bnxt_setup_vnic_p5(struct bnxt *bp, u16 vnic_id) if (i < nr_ctxs) return -ENOMEM; - rc = bnxt_hwrm_vnic_rss_cfg_p5(bp, vnic_id); + rc = bnxt_hwrm_vnic_rss_cfg_p5(bp, vnic); if (rc) return rc; if (bp->flags & BNXT_FLAG_AGG_RINGS) { - rc = bnxt_hwrm_vnic_set_hds(bp, vnic_id); + rc = bnxt_hwrm_vnic_set_hds(bp, vnic); if (rc) { netdev_err(bp->dev, "hwrm vnic %d set hds failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); } } return rc; } -static int bnxt_setup_vnic(struct bnxt *bp, u16 vnic_id) +static int bnxt_setup_vnic(struct bnxt *bp, struct bnxt_vnic_info *vnic) { if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) - return __bnxt_setup_vnic_p5(bp, vnic_id); + return __bnxt_setup_vnic_p5(bp, vnic); else - return __bnxt_setup_vnic(bp, vnic_id); + return __bnxt_setup_vnic(bp, vnic); } -static int bnxt_alloc_and_setup_vnic(struct bnxt *bp, u16 vnic_id, +static int bnxt_alloc_and_setup_vnic(struct bnxt *bp, + struct bnxt_vnic_info *vnic, u16 start_rx_ring_idx, int rx_rings) { int rc; - rc = bnxt_hwrm_vnic_alloc(bp, vnic_id, start_rx_ring_idx, rx_rings); + rc = bnxt_hwrm_vnic_alloc(bp, vnic, start_rx_ring_idx, rx_rings); if (rc) { netdev_err(bp->dev, "hwrm vnic %d alloc failure rc: %x\n", - vnic_id, rc); + vnic->vnic_id, rc); return rc; } - return bnxt_setup_vnic(bp, vnic_id); + return bnxt_setup_vnic(bp, vnic); } static int bnxt_alloc_rfs_vnics(struct bnxt *bp) { + struct bnxt_vnic_info *vnic; int i, rc = 0; - if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) - return bnxt_alloc_and_setup_vnic(bp, BNXT_VNIC_NTUPLE, 0, - bp->rx_nr_rings); + if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) { + vnic = &bp->vnic_info[BNXT_VNIC_NTUPLE]; + return bnxt_alloc_and_setup_vnic(bp, vnic, 0, bp->rx_nr_rings); + } if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) return 0; for (i = 0; i < bp->rx_nr_rings; i++) { - struct bnxt_vnic_info *vnic; u16 vnic_id = i + 1; u16 ring_id = i; @@ -9918,7 +9919,7 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp) vnic->flags |= BNXT_VNIC_RFS_FLAG; if (bp->rss_cap & BNXT_RSS_CAP_NEW_RSS_CAP) vnic->flags |= BNXT_VNIC_RFS_NEW_RSS_FLAG; - if (bnxt_alloc_and_setup_vnic(bp, vnic_id, ring_id, 1)) + if (bnxt_alloc_and_setup_vnic(bp, &bp->vnic_info[vnic_id], ring_id, 1)) break; } return rc; @@ -9936,16 +9937,17 @@ static bool bnxt_promisc_ok(struct bnxt *bp) static int bnxt_setup_nitroa0_vnic(struct bnxt *bp) { + struct bnxt_vnic_info *vnic = &bp->vnic_info[1]; unsigned int rc = 0; - rc = bnxt_hwrm_vnic_alloc(bp, 1, bp->rx_nr_rings - 1, 1); + rc = bnxt_hwrm_vnic_alloc(bp, vnic, bp->rx_nr_rings - 1, 1); if (rc) { netdev_err(bp->dev, "Cannot allocate special vnic for NS2 A0: %x\n", rc); return rc; } - rc = bnxt_hwrm_vnic_cfg(bp, 1); + rc = bnxt_hwrm_vnic_cfg(bp, vnic); if (rc) { netdev_err(bp->dev, "Cannot allocate special vnic for NS2 A0: %x\n", rc); @@ -9988,7 +9990,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) rx_nr_rings--; /* default vnic 0 */ - rc = bnxt_hwrm_vnic_alloc(bp, BNXT_VNIC_DEFAULT, 0, rx_nr_rings); + rc = bnxt_hwrm_vnic_alloc(bp, vnic, 0, rx_nr_rings); if (rc) { netdev_err(bp->dev, "hwrm vnic alloc failure rc: %x\n", rc); goto err_out; @@ -9997,7 +9999,7 @@ static int bnxt_init_chip(struct bnxt *bp, bool irq_re_init) if (BNXT_VF(bp)) bnxt_hwrm_func_qcfg(bp); - rc = bnxt_setup_vnic(bp, BNXT_VNIC_DEFAULT); + rc = bnxt_setup_vnic(bp, vnic); if (rc) goto err_out; if (bp->rss_cap & BNXT_RSS_CAP_RSS_HASH_TYPE_DELTA) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index fbb53308cb815..81460a96c0dd8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1256,6 +1256,7 @@ struct bnxt_vnic_info { #define BNXT_VNIC_UCAST_FLAG 8 #define BNXT_VNIC_RFS_NEW_RSS_FLAG 0x10 #define BNXT_VNIC_NTUPLE_FLAG 0x20 + u32 vnic_id; }; struct bnxt_hw_rings { @@ -2695,7 +2696,7 @@ int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp, struct bnxt_ntuple_filter *fltr); void bnxt_fill_ipv6_mask(__be32 mask[4]); int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); -int bnxt_hwrm_vnic_cfg(struct bnxt *bp, u16 vnic_id); +int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings); int bnxt_nq_rings_in_use(struct bnxt *bp); int bnxt_hwrm_set_coal(struct bnxt *); @@ -2721,7 +2722,7 @@ int bnxt_hwrm_free_wol_fltr(struct bnxt *bp); int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); int bnxt_hwrm_func_qcaps(struct bnxt *bp); int bnxt_hwrm_fw_set_time(struct bnxt *); -int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, u16 vnic_id); +int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_half_open_nic(struct bnxt *bp); void bnxt_half_close_nic(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 93f9bd55020f2..86dcd2c76587b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -71,7 +71,7 @@ int bnxt_register_dev(struct bnxt_en_dev *edev, rcu_assign_pointer(ulp->ulp_ops, ulp_ops); if (test_bit(BNXT_STATE_OPEN, &bp->state)) - bnxt_hwrm_vnic_cfg(bp, 0); + bnxt_hwrm_vnic_cfg(bp, &bp->vnic_info[BNXT_VNIC_DEFAULT]); bnxt_fill_msix_vecs(bp, bp->edev->msix_entries); edev->flags |= BNXT_EN_FLAG_MSIX_REQUESTED; From fea41bd766342a734e0562945254b65c22c6c3b4 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:55 -0700 Subject: [PATCH 034/167] bnxt_en: Introduce rss ctx structure, alloc/free functions Add struct bnxt_rss_ctx, related storage lists, required defines, and its alloc/free functions. Later patches will use them in order to support multiple RSS contexts. Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-6-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 54 +++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 24 +++++++++ .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 + 3 files changed, 80 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3f5d7c81a2817..0ede267904ad2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9925,6 +9925,53 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp) return rc; } +void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, + bool all) +{ + if (!all) + return; + + list_del(&rss_ctx->list); + bp->num_rss_ctx--; + clear_bit(rss_ctx->index, bp->rss_ctx_bmap); + kfree(rss_ctx); +} + +struct bnxt_rss_ctx *bnxt_alloc_rss_ctx(struct bnxt *bp) +{ + struct bnxt_rss_ctx *rss_ctx = NULL; + + rss_ctx = kzalloc(sizeof(*rss_ctx), GFP_KERNEL); + if (rss_ctx) { + rss_ctx->vnic.rss_ctx = rss_ctx; + list_add_tail(&rss_ctx->list, &bp->rss_ctx_list); + bp->num_rss_ctx++; + } + return rss_ctx; +} + +void bnxt_clear_rss_ctxs(struct bnxt *bp, bool all) +{ + struct bnxt_rss_ctx *rss_ctx, *tmp; + + list_for_each_entry_safe(rss_ctx, tmp, &bp->rss_ctx_list, list) + bnxt_del_one_rss_ctx(bp, rss_ctx, all); + + if (all) + bitmap_free(bp->rss_ctx_bmap); +} + +static void bnxt_init_multi_rss_ctx(struct bnxt *bp) +{ + bp->rss_ctx_bmap = bitmap_zalloc(BNXT_RSS_CTX_BMAP_LEN, GFP_KERNEL); + if (bp->rss_ctx_bmap) { + /* burn index 0 since we cannot have context 0 */ + __set_bit(0, bp->rss_ctx_bmap); + INIT_LIST_HEAD(&bp->rss_ctx_list); + bp->rss_cap |= BNXT_RSS_CAP_MULTI_RSS_CTX; + } +} + /* Allow PF, trusted VFs and VFs with default VLAN to be in promiscuous mode */ static bool bnxt_promisc_ok(struct bnxt *bp) { @@ -14612,6 +14659,8 @@ static void bnxt_remove_one(struct pci_dev *pdev) unregister_netdev(dev); bnxt_free_l2_filters(bp, true); bnxt_free_ntp_fltrs(bp, true); + if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + bnxt_clear_rss_ctxs(bp, true); clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); /* Flush any pending tasks */ cancel_work_sync(&bp->sp_task); @@ -15223,6 +15272,9 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_LIST_HEAD(&bp->usr_fltr_list); + if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) + bnxt_init_multi_rss_ctx(bp); + rc = register_netdev(dev); if (rc) goto init_err_cleanup; @@ -15243,6 +15295,8 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) bnxt_clear_int_mode(bp); init_err_pci_clean: + if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + bnxt_clear_rss_ctxs(bp, true); bnxt_hwrm_func_drv_unrgtr(bp); bnxt_free_hwrm_resources(bp); bnxt_hwmon_uninit(bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 81460a96c0dd8..4d3104c26cfaa 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1256,9 +1256,21 @@ struct bnxt_vnic_info { #define BNXT_VNIC_UCAST_FLAG 8 #define BNXT_VNIC_RFS_NEW_RSS_FLAG 0x10 #define BNXT_VNIC_NTUPLE_FLAG 0x20 +#define BNXT_VNIC_RSSCTX_FLAG 0x40 + struct bnxt_rss_ctx *rss_ctx; u32 vnic_id; }; +struct bnxt_rss_ctx { + struct list_head list; + struct bnxt_vnic_info vnic; + u16 *rss_indir_tbl; + u8 index; +}; + +#define BNXT_MAX_ETH_RSS_CTX 32 +#define BNXT_RSS_CTX_BMAP_LEN (BNXT_MAX_ETH_RSS_CTX + 1) + struct bnxt_hw_rings { int tx; int rx; @@ -2228,6 +2240,9 @@ struct bnxt { /* grp_info indexed by completion ring index */ struct bnxt_ring_grp_info *grp_info; struct bnxt_vnic_info *vnic_info; + struct list_head rss_ctx_list; + unsigned long *rss_ctx_bmap; + u32 num_rss_ctx; int nr_vnics; u16 *rss_indir_tbl; u16 rss_indir_tbl_entries; @@ -2242,6 +2257,7 @@ struct bnxt { #define BNXT_RSS_CAP_AH_V6_RSS_CAP BIT(5) #define BNXT_RSS_CAP_ESP_V4_RSS_CAP BIT(6) #define BNXT_RSS_CAP_ESP_V6_RSS_CAP BIT(7) +#define BNXT_RSS_CAP_MULTI_RSS_CTX BIT(8) u8 rss_hash_key[HW_HASH_KEY_SIZE]; u8 rss_hash_key_valid:1; @@ -2341,6 +2357,10 @@ struct bnxt { #define BNXT_SUPPORTS_NTUPLE_VNIC(bp) \ (BNXT_PF(bp) && ((bp)->fw_cap & BNXT_FW_CAP_CFA_RFS_RING_TBL_IDX_V3)) +#define BNXT_SUPPORTS_MULTI_RSS_CTX(bp) \ + (BNXT_PF(bp) && BNXT_SUPPORTS_NTUPLE_VNIC(bp) && \ + ((bp)->rss_cap & BNXT_RSS_CAP_MULTI_RSS_CTX)) + u32 hwrm_spec_code; u16 hwrm_cmd_seq; u16 hwrm_cmd_kong_seq; @@ -2723,6 +2743,10 @@ int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); int bnxt_hwrm_func_qcaps(struct bnxt *bp); int bnxt_hwrm_fw_set_time(struct bnxt *); int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); +void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, + bool all); +struct bnxt_rss_ctx *bnxt_alloc_rss_ctx(struct bnxt *bp); +void bnxt_clear_rss_ctxs(struct bnxt *bp, bool all); int bnxt_open_nic(struct bnxt *, bool, bool); int bnxt_half_open_nic(struct bnxt *bp); void bnxt_half_close_nic(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 1d240a27455a5..771833b1900dc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -969,6 +969,8 @@ static int bnxt_set_channels(struct net_device *dev, } bnxt_clear_usr_fltrs(bp, true); + if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + bnxt_clear_rss_ctxs(bp, false); if (netif_running(dev)) { if (BNXT_PF(bp)) { /* TODO CHIMP_FW: Send message to all VF's From ecb342bb6098fa13f2719f8baaab4475e404f784 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:56 -0700 Subject: [PATCH 035/167] bnxt_en: Refactor RSS indir alloc/set functions We will need to dynamically allocate and change indirection tables for additional RSS contexts. Add the rss_ctx pointer parameter to bnxt_alloc_rss_indir_tbl() and bnxt_set_dflt_rss_indir_tbl(). Existing usage will always pass rss_ctx as NULL which means the default RSS context. When supporting additional RSS contexts in subsequent patches, we'll pass the valid rss_ctx to these 2 functions. Reviewed-by: Andy Gospodarek Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-7-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 +++++++++++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0ede267904ad2..80ccb5a54dae8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6026,9 +6026,10 @@ static u16 bnxt_cp_ring_for_tx(struct bnxt *bp, struct bnxt_tx_ring_info *txr) return bnxt_cp_ring_from_grp(bp, &txr->tx_ring_struct); } -static int bnxt_alloc_rss_indir_tbl(struct bnxt *bp) +int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx) { int entries; + u16 *tbl; if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) entries = BNXT_MAX_RSS_TABLE_ENTRIES_P5; @@ -6036,16 +6037,22 @@ static int bnxt_alloc_rss_indir_tbl(struct bnxt *bp) entries = HW_HASH_INDEX_SIZE; bp->rss_indir_tbl_entries = entries; - bp->rss_indir_tbl = kmalloc_array(entries, sizeof(*bp->rss_indir_tbl), - GFP_KERNEL); - if (!bp->rss_indir_tbl) + tbl = kmalloc_array(entries, sizeof(*bp->rss_indir_tbl), GFP_KERNEL); + if (!tbl) return -ENOMEM; + + if (rss_ctx) + rss_ctx->rss_indir_tbl = tbl; + else + bp->rss_indir_tbl = tbl; + return 0; } -static void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp) +void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx) { u16 max_rings, max_entries, pad, i; + u16 *rss_indir_tbl; if (!bp->rx_nr_rings) return; @@ -6056,13 +6063,17 @@ static void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp) max_rings = bp->rx_nr_rings; max_entries = bnxt_get_rxfh_indir_size(bp->dev); + if (rss_ctx) + rss_indir_tbl = &rss_ctx->rss_indir_tbl[0]; + else + rss_indir_tbl = &bp->rss_indir_tbl[0]; for (i = 0; i < max_entries; i++) - bp->rss_indir_tbl[i] = ethtool_rxfh_indir_default(i, max_rings); + rss_indir_tbl[i] = ethtool_rxfh_indir_default(i, max_rings); pad = bp->rss_indir_tbl_entries - max_entries; if (pad) - memset(&bp->rss_indir_tbl[i], 0, pad * sizeof(u16)); + memset(&rss_indir_tbl[i], 0, pad * sizeof(u16)); } static u16 bnxt_get_max_rss_ring(struct bnxt *bp) @@ -7341,7 +7352,7 @@ static void bnxt_check_rss_tbl_no_rmgr(struct bnxt *bp) if (hw_resc->resv_rx_rings != bp->rx_nr_rings) { hw_resc->resv_rx_rings = bp->rx_nr_rings; if (!netif_is_rxfh_configured(bp->dev)) - bnxt_set_dflt_rss_indir_tbl(bp); + bnxt_set_dflt_rss_indir_tbl(bp, NULL); } } @@ -7497,7 +7508,7 @@ static int __bnxt_reserve_rings(struct bnxt *bp) return -ENOMEM; if (!netif_is_rxfh_configured(bp->dev)) - bnxt_set_dflt_rss_indir_tbl(bp); + bnxt_set_dflt_rss_indir_tbl(bp, NULL); return rc; } @@ -15119,7 +15130,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) bp->flags |= BNXT_FLAG_CHIP_P7; } - rc = bnxt_alloc_rss_indir_tbl(bp); + rc = bnxt_alloc_rss_indir_tbl(bp, NULL); if (rc) goto init_err_pci_clean; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 4d3104c26cfaa..181758f6892ae 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2715,6 +2715,8 @@ int bnxt_hwrm_cfa_ntuple_filter_free(struct bnxt *bp, int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp, struct bnxt_ntuple_filter *fltr); void bnxt_fill_ipv6_mask(__be32 mask[4]); +int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); +void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings); From b09353437b28ff8786e60aa9bd560a4474facfc0 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Mar 2024 15:28:57 -0700 Subject: [PATCH 036/167] bnxt_en: Simplify bnxt_rfs_capable() bnxt_rfs_capable() determines the number of VNICs and RSS_CTXs required to support aRFS and then reserves the resources. We already have functions bnxt_get_total_vnics() and bnxt_get_total_rss_ctxs() to do that. Simplify the code by calling these functions. It is also more correct to do the resource reservation after bnxt_can_reserve_rings() returns true. Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-8-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 80ccb5a54dae8..e968684ccb1d6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12479,28 +12479,19 @@ static bool bnxt_rfs_capable(struct bnxt *bp) struct bnxt_hw_rings hwr = {0}; int max_vnics, max_rss_ctxs; - hwr.rss_ctx = 1; - if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) { - /* 2 VNICS: default + Ntuple */ - hwr.vnic = 2; - hwr.rss_ctx = bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) * - hwr.vnic; - goto check_reserve_vnic; - } - if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) + if ((bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && + !BNXT_SUPPORTS_NTUPLE_VNIC(bp)) return bnxt_rfs_supported(bp); + if (!(bp->flags & BNXT_FLAG_MSIX_CAP) || !bnxt_can_reserve_rings(bp) || !bp->rx_nr_rings) return false; - hwr.vnic = 1 + bp->rx_nr_rings; -check_reserve_vnic: + hwr.grp = bp->rx_nr_rings; + hwr.vnic = bnxt_get_total_vnics(bp, bp->rx_nr_rings); + hwr.rss_ctx = bnxt_get_total_rss_ctxs(bp, &hwr); max_vnics = bnxt_get_max_func_vnics(bp); max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp); - if (!(bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && - !(bp->rss_cap & BNXT_RSS_CAP_NEW_RSS_CAP)) - hwr.rss_ctx = hwr.vnic; - if (hwr.vnic > max_vnics || hwr.rss_ctx > max_rss_ctxs) { if (bp->rx_nr_rings > 1) netdev_warn(bp->dev, From 0895926f725ac6e4d163eb4b964199a5e81fbf40 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:28:58 -0700 Subject: [PATCH 037/167] bnxt_en: Add a new_rss_ctx parameter to bnxt_rfs_capable() Modify bnxt_rfs_capable() to check that there are enough resources to support aRFS/ntuple filters for a new RSS context requested by the user. Existing use cases in the driver will always set the new parameter to false. Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-9-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 ++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e968684ccb1d6..c5f3b97d258d8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7360,7 +7360,7 @@ static int bnxt_get_total_vnics(struct bnxt *bp, int rx_rings) { if (bp->flags & BNXT_FLAG_RFS) { if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) - return 2; + return 2 + bp->num_rss_ctx; if (!(bp->flags & BNXT_FLAG_CHIP_P5_PLUS)) return rx_rings + 1; } @@ -12474,7 +12474,7 @@ static bool bnxt_rfs_supported(struct bnxt *bp) } /* If runtime conditions support RFS */ -static bool bnxt_rfs_capable(struct bnxt *bp) +bool bnxt_rfs_capable(struct bnxt *bp, bool new_rss_ctx) { struct bnxt_hw_rings hwr = {0}; int max_vnics, max_rss_ctxs; @@ -12488,6 +12488,8 @@ static bool bnxt_rfs_capable(struct bnxt *bp) hwr.grp = bp->rx_nr_rings; hwr.vnic = bnxt_get_total_vnics(bp, bp->rx_nr_rings); + if (new_rss_ctx) + hwr.vnic++; hwr.rss_ctx = bnxt_get_total_rss_ctxs(bp, &hwr); max_vnics = bnxt_get_max_func_vnics(bp); max_rss_ctxs = bnxt_get_max_func_rss_ctxs(bp); @@ -12525,7 +12527,7 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, struct bnxt *bp = netdev_priv(dev); netdev_features_t vlan_features; - if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp)) + if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false)) features &= ~NETIF_F_NTUPLE; if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog) @@ -13661,7 +13663,7 @@ static void bnxt_set_dflt_rfs(struct bnxt *bp) bp->flags &= ~BNXT_FLAG_RFS; if (bnxt_rfs_supported(bp)) { dev->hw_features |= NETIF_F_NTUPLE; - if (bnxt_rfs_capable(bp)) { + if (bnxt_rfs_capable(bp, false)) { bp->flags |= BNXT_FLAG_RFS; dev->features |= NETIF_F_NTUPLE; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 181758f6892ae..d5fbb63a0e0ce 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2756,6 +2756,7 @@ void bnxt_reenable_sriov(struct bnxt *bp); void bnxt_close_nic(struct bnxt *, bool, bool); void bnxt_get_ring_err_stats(struct bnxt *bp, struct bnxt_total_ring_err_stats *stats); +bool bnxt_rfs_capable(struct bnxt *bp, bool new_rss_ctx); int bnxt_dbg_hwrm_rd_reg(struct bnxt *bp, u32 reg_off, u16 num_words, u32 *reg_buf); void bnxt_fw_exception(struct bnxt *bp); From 77a614f7499edd47549f31730f50cb043b699a8b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 25 Mar 2024 15:28:59 -0700 Subject: [PATCH 038/167] bnxt_en: Refactor bnxt_set_rxfh() Add a new bnxt_modify_rss() function to modify the RSS key and RSS indirection table. The new function can modify the parameters for the default context or additional contexts. Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-10-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 771833b1900dc..7f57198f5834c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1778,6 +1778,32 @@ static int bnxt_get_rxfh(struct net_device *dev, return 0; } +static void bnxt_modify_rss(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, + struct ethtool_rxfh_param *rxfh) +{ + if (rxfh->key) { + if (rss_ctx) { + memcpy(rss_ctx->vnic.rss_hash_key, rxfh->key, + HW_HASH_KEY_SIZE); + } else { + memcpy(bp->rss_hash_key, rxfh->key, HW_HASH_KEY_SIZE); + bp->rss_hash_key_updated = true; + } + } + if (rxfh->indir) { + u32 i, pad, tbl_size = bnxt_get_rxfh_indir_size(bp->dev); + u16 *indir_tbl = bp->rss_indir_tbl; + + if (rss_ctx) + indir_tbl = rss_ctx->rss_indir_tbl; + for (i = 0; i < tbl_size; i++) + indir_tbl[i] = rxfh->indir[i]; + pad = bp->rss_indir_tbl_entries - tbl_size; + if (pad) + memset(&bp->rss_indir_tbl[i], 0, pad * sizeof(u16)); + } +} + static int bnxt_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) @@ -1788,20 +1814,8 @@ static int bnxt_set_rxfh(struct net_device *dev, if (rxfh->hfunc && rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP; - if (rxfh->key) { - memcpy(bp->rss_hash_key, rxfh->key, HW_HASH_KEY_SIZE); - bp->rss_hash_key_updated = true; - } - - if (rxfh->indir) { - u32 i, pad, tbl_size = bnxt_get_rxfh_indir_size(dev); + bnxt_modify_rss(bp, NULL, rxfh); - for (i = 0; i < tbl_size; i++) - bp->rss_indir_tbl[i] = rxfh->indir[i]; - pad = bp->rss_indir_tbl_entries - tbl_size; - if (pad) - memset(&bp->rss_indir_tbl[i], 0, pad * sizeof(u16)); - } bnxt_clear_usr_fltrs(bp, false); if (netif_running(bp->dev)) { bnxt_close_nic(bp, false, false); From b3d0083caf9abd495e1fc0016b0b29bbfeee99af Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:29:00 -0700 Subject: [PATCH 039/167] bnxt_en: Support RSS contexts in ethtool .{get|set}_rxfh() Support up to 32 RSS contexts per device if supported by the device. Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-11-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 49 +++++- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 7 + .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 148 +++++++++++++++++- 3 files changed, 196 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c5f3b97d258d8..8aa3db2ceecec 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5939,8 +5939,8 @@ static void bnxt_hwrm_vnic_update_tunl_tpa(struct bnxt *bp, req->tnl_tpa_en_bitmap = cpu_to_le32(tunl_tpa_bmap); } -static int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, - u32 tpa_flags) +int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, + u32 tpa_flags) { u16 max_aggs = VNIC_TPA_CFG_REQ_MAX_AGGS_MAX; struct hwrm_vnic_tpa_cfg_input *req; @@ -6129,6 +6129,8 @@ static void bnxt_fill_hw_rss_tbl_p5(struct bnxt *bp, if (vnic->flags & BNXT_VNIC_NTUPLE_FLAG) j = ethtool_rxfh_indir_default(i, bp->rx_nr_rings); + else if (vnic->flags & BNXT_VNIC_RSSCTX_FLAG) + j = vnic->rss_ctx->rss_indir_tbl[i]; else j = bp->rss_indir_tbl[i]; rxr = &bp->rx_ring[j]; @@ -6423,9 +6425,9 @@ static void bnxt_hwrm_vnic_free(struct bnxt *bp) bnxt_hwrm_vnic_free_one(bp, &bp->vnic_info[i]); } -static int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, - unsigned int start_rx_ring_idx, - unsigned int nr_rings) +int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, + unsigned int start_rx_ring_idx, + unsigned int nr_rings) { unsigned int i, j, grp_idx, end_idx = start_rx_ring_idx + nr_rings; struct hwrm_vnic_alloc_output *resp; @@ -9852,7 +9854,7 @@ int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) return rc; } -static int __bnxt_setup_vnic_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) +int __bnxt_setup_vnic_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) { int rc, i, nr_ctxs; @@ -9939,15 +9941,46 @@ static int bnxt_alloc_rfs_vnics(struct bnxt *bp) void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bool all) { + struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + int i; + + bnxt_hwrm_vnic_free_one(bp, &rss_ctx->vnic); + for (i = 0; i < BNXT_MAX_CTX_PER_VNIC; i++) { + if (vnic->fw_rss_cos_lb_ctx[i] != INVALID_HW_RING_ID) + bnxt_hwrm_vnic_ctx_free_one(bp, vnic, i); + } if (!all) return; + if (vnic->rss_table) + dma_free_coherent(&bp->pdev->dev, vnic->rss_table_size, + vnic->rss_table, + vnic->rss_table_dma_addr); + kfree(rss_ctx->rss_indir_tbl); list_del(&rss_ctx->list); bp->num_rss_ctx--; clear_bit(rss_ctx->index, bp->rss_ctx_bmap); kfree(rss_ctx); } +static void bnxt_hwrm_realloc_rss_ctx_vnic(struct bnxt *bp) +{ + bool set_tpa = !!(bp->flags & BNXT_FLAG_TPA); + struct bnxt_rss_ctx *rss_ctx, *tmp; + + list_for_each_entry_safe(rss_ctx, tmp, &bp->rss_ctx_list, list) { + struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + + if (bnxt_hwrm_vnic_alloc(bp, vnic, 0, bp->rx_nr_rings) || + bnxt_hwrm_vnic_set_tpa(bp, vnic, set_tpa) || + __bnxt_setup_vnic_p5(bp, vnic)) { + netdev_err(bp->dev, "Failed to restore RSS ctx %d\n", + rss_ctx->index); + bnxt_del_one_rss_ctx(bp, rss_ctx, true); + } + } +} + struct bnxt_rss_ctx *bnxt_alloc_rss_ctx(struct bnxt *bp) { struct bnxt_rss_ctx *rss_ctx = NULL; @@ -11829,6 +11862,8 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) bnxt_vf_reps_open(bp); bnxt_ptp_init_rtc(bp, true); bnxt_ptp_cfg_tstamp_filters(bp); + if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + bnxt_hwrm_realloc_rss_ctx_vnic(bp); bnxt_cfg_usr_fltrs(bp); return 0; @@ -11977,6 +12012,8 @@ static void __bnxt_close_nic(struct bnxt *bp, bool irq_re_init, while (bnxt_drv_busy(bp)) msleep(20); + if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) + bnxt_clear_rss_ctxs(bp, false); /* Flush rings and disable interrupts */ bnxt_shutdown_nic(bp, irq_re_init); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index d5fbb63a0e0ce..37a850959315b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1270,6 +1270,7 @@ struct bnxt_rss_ctx { #define BNXT_MAX_ETH_RSS_CTX 32 #define BNXT_RSS_CTX_BMAP_LEN (BNXT_MAX_ETH_RSS_CTX + 1) +#define BNXT_VNIC_ID_INVALID 0xffffffff struct bnxt_hw_rings { int tx; @@ -2714,11 +2715,16 @@ int bnxt_hwrm_cfa_ntuple_filter_free(struct bnxt *bp, struct bnxt_ntuple_filter *fltr); int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp, struct bnxt_ntuple_filter *fltr); +int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, + u32 tpa_flags); void bnxt_fill_ipv6_mask(__be32 mask[4]); int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); +int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, + unsigned int start_rx_ring_idx, + unsigned int nr_rings); int __bnxt_hwrm_get_tx_rings(struct bnxt *bp, u16 fid, int *tx_rings); int bnxt_nq_rings_in_use(struct bnxt *bp); int bnxt_hwrm_set_coal(struct bnxt *); @@ -2745,6 +2751,7 @@ int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp, bool all); int bnxt_hwrm_func_qcaps(struct bnxt *bp); int bnxt_hwrm_fw_set_time(struct bnxt *); int bnxt_hwrm_vnic_rss_cfg_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); +int __bnxt_setup_vnic_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic); void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bool all); struct bnxt_rss_ctx *bnxt_alloc_rss_ctx(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 7f57198f5834c..4dbe80b11ddaf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1207,6 +1207,36 @@ static int bnxt_grxclsrule(struct bnxt *bp, struct ethtool_rxnfc *cmd) return rc; } +static struct bnxt_rss_ctx *bnxt_get_rss_ctx_from_index(struct bnxt *bp, + u32 index) +{ + struct bnxt_rss_ctx *rss_ctx, *tmp; + + list_for_each_entry_safe(rss_ctx, tmp, &bp->rss_ctx_list, list) + if (rss_ctx->index == index) + return rss_ctx; + return NULL; +} + +static int bnxt_alloc_rss_ctx_rss_table(struct bnxt *bp, + struct bnxt_rss_ctx *rss_ctx) +{ + int size = L1_CACHE_ALIGN(BNXT_MAX_RSS_TABLE_SIZE_P5); + struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + + vnic->rss_table_size = size + HW_HASH_KEY_SIZE; + vnic->rss_table = dma_alloc_coherent(&bp->pdev->dev, + vnic->rss_table_size, + &vnic->rss_table_dma_addr, + GFP_KERNEL); + if (!vnic->rss_table) + return -ENOMEM; + + vnic->rss_hash_key = ((void *)vnic->rss_table) + size; + vnic->rss_hash_key_dma_addr = vnic->rss_table_dma_addr + size; + return 0; +} + static int bnxt_add_l2_cls_rule(struct bnxt *bp, struct ethtool_rx_flow_spec *fs) { @@ -1756,7 +1786,10 @@ static u32 bnxt_get_rxfh_key_size(struct net_device *dev) static int bnxt_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh) { + u32 rss_context = rxfh->rss_context; + struct bnxt_rss_ctx *rss_ctx = NULL; struct bnxt *bp = netdev_priv(dev); + u16 *indir_tbl = bp->rss_indir_tbl; struct bnxt_vnic_info *vnic; u32 i, tbl_size; @@ -1766,10 +1799,18 @@ static int bnxt_get_rxfh(struct net_device *dev, return 0; vnic = &bp->vnic_info[BNXT_VNIC_DEFAULT]; - if (rxfh->indir && bp->rss_indir_tbl) { + if (rxfh->rss_context) { + rss_ctx = bnxt_get_rss_ctx_from_index(bp, rss_context); + if (!rss_ctx) + return -EINVAL; + indir_tbl = rss_ctx->rss_indir_tbl; + vnic = &rss_ctx->vnic; + } + + if (rxfh->indir && indir_tbl) { tbl_size = bnxt_get_rxfh_indir_size(dev); for (i = 0; i < tbl_size; i++) - rxfh->indir[i] = bp->rss_indir_tbl[i]; + rxfh->indir[i] = indir_tbl[i]; } if (rxfh->key && vnic->rss_hash_key) @@ -1804,6 +1845,105 @@ static void bnxt_modify_rss(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, } } +static int bnxt_set_rxfh_context(struct bnxt *bp, + struct ethtool_rxfh_param *rxfh, + struct netlink_ext_ack *extack) +{ + u32 *rss_context = &rxfh->rss_context; + struct bnxt_rss_ctx *rss_ctx; + struct bnxt_vnic_info *vnic; + bool modify = false; + int bit_id; + int rc; + + if (!BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) { + NL_SET_ERR_MSG_MOD(extack, "RSS contexts not supported"); + return -EOPNOTSUPP; + } + + if (*rss_context != ETH_RXFH_CONTEXT_ALLOC) { + rss_ctx = bnxt_get_rss_ctx_from_index(bp, *rss_context); + if (!rss_ctx) { + NL_SET_ERR_MSG_FMT_MOD(extack, "RSS context %u not found", + *rss_context); + return -EINVAL; + } + if (*rss_context && rxfh->rss_delete) { + bnxt_del_one_rss_ctx(bp, rss_ctx, true); + return 0; + } + modify = true; + vnic = &rss_ctx->vnic; + goto modify_context; + } + + if (bp->num_rss_ctx >= BNXT_MAX_ETH_RSS_CTX) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Out of RSS contexts, maximum %u", + BNXT_MAX_ETH_RSS_CTX); + return -EINVAL; + } + + if (!bnxt_rfs_capable(bp, true)) { + NL_SET_ERR_MSG_MOD(extack, "Out hardware resources"); + return -ENOMEM; + } + + rss_ctx = bnxt_alloc_rss_ctx(bp); + if (!rss_ctx) + return -ENOMEM; + + vnic = &rss_ctx->vnic; + vnic->flags |= BNXT_VNIC_RSSCTX_FLAG; + vnic->vnic_id = BNXT_VNIC_ID_INVALID; + rc = bnxt_alloc_rss_ctx_rss_table(bp, rss_ctx); + if (rc) + goto out; + + rc = bnxt_alloc_rss_indir_tbl(bp, rss_ctx); + if (rc) + goto out; + + bnxt_set_dflt_rss_indir_tbl(bp, rss_ctx); + memcpy(vnic->rss_hash_key, bp->rss_hash_key, HW_HASH_KEY_SIZE); + + rc = bnxt_hwrm_vnic_alloc(bp, vnic, 0, bp->rx_nr_rings); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Unable to allocate VNIC"); + goto out; + } + + rc = bnxt_hwrm_vnic_set_tpa(bp, vnic, bp->flags & BNXT_FLAG_TPA); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Unable to setup TPA"); + goto out; + } +modify_context: + bnxt_modify_rss(bp, rss_ctx, rxfh); + + if (modify) + return bnxt_hwrm_vnic_rss_cfg_p5(bp, vnic); + + rc = __bnxt_setup_vnic_p5(bp, vnic); + if (rc) { + NL_SET_ERR_MSG_MOD(extack, "Unable to setup TPA"); + goto out; + } + + bit_id = bitmap_find_free_region(bp->rss_ctx_bmap, + BNXT_RSS_CTX_BMAP_LEN, 0); + if (bit_id < 0) { + rc = -ENOMEM; + goto out; + } + rss_ctx->index = (u16)bit_id; + *rss_context = rss_ctx->index; + + return 0; +out: + bnxt_del_one_rss_ctx(bp, rss_ctx, true); + return rc; +} + static int bnxt_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack *extack) @@ -1814,6 +1954,9 @@ static int bnxt_set_rxfh(struct net_device *dev, if (rxfh->hfunc && rxfh->hfunc != ETH_RSS_HASH_TOP) return -EOPNOTSUPP; + if (rxfh->rss_context) + return bnxt_set_rxfh_context(bp, rxfh, extack); + bnxt_modify_rss(bp, NULL, rxfh); bnxt_clear_usr_fltrs(bp, false); @@ -5087,6 +5230,7 @@ void bnxt_ethtool_free(struct bnxt *bp) const struct ethtool_ops bnxt_ethtool_ops = { .cap_link_lanes_supported = 1, + .cap_rss_ctx_supported = 1, .supported_coalesce_params = ETHTOOL_COALESCE_USECS | ETHTOOL_COALESCE_MAX_FRAMES | ETHTOOL_COALESCE_USECS_IRQ | From 61c814bf4ad7c4422796e53e9489965e9257abac Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:29:01 -0700 Subject: [PATCH 040/167] bnxt_en: Refactor bnxt_cfg_rfs_ring_tbl_idx() Refactor bnxt_cfg_rfs_ring_tbl_idx() to pass in the filter structure pointer instead of the RX ring number. This will allow an ntuple filter to be set up for the non-default RSS contexts in the next patch. Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-12-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8aa3db2ceecec..88d4116cfd79d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5789,8 +5789,10 @@ void bnxt_fill_ipv6_mask(__be32 mask[4]) static void bnxt_cfg_rfs_ring_tbl_idx(struct bnxt *bp, struct hwrm_cfa_ntuple_filter_alloc_input *req, - u16 rxq) + struct bnxt_ntuple_filter *fltr) { + u16 rxq = fltr->base.rxq; + if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) { struct bnxt_vnic_info *vnic; u32 enables; @@ -5831,7 +5833,7 @@ int bnxt_hwrm_cfa_ntuple_filter_alloc(struct bnxt *bp, req->flags = cpu_to_le32(CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DROP); } else if (bp->fw_cap & BNXT_FW_CAP_CFA_RFS_RING_TBL_IDX_V2) { - bnxt_cfg_rfs_ring_tbl_idx(bp, req, fltr->base.rxq); + bnxt_cfg_rfs_ring_tbl_idx(bp, req, fltr); } else { vnic = &bp->vnic_info[fltr->base.rxq + 1]; req->dst_id = cpu_to_le16(vnic->fw_vnic_id); From 2f4f9fe5bf5fe0eff7e3f372353b30e600e33c20 Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 25 Mar 2024 15:29:02 -0700 Subject: [PATCH 041/167] bnxt_en: Support adding ntuple rules on RSS contexts When the user wants to add an ntuple filter to an RSS context, select the appropriate VNIC belonging to the selected RSS context and add the VNIC destination rule. Make the necessary changes to bnxt_add_ntuple_cls_rule(). Reviewed-by: Kalesh AP Signed-off-by: Pavan Chebbi Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/20240325222902.220712-13-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 26 +++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 33 ++++++++++++++----- 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 88d4116cfd79d..388e80bf91f5e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5791,8 +5791,20 @@ bnxt_cfg_rfs_ring_tbl_idx(struct bnxt *bp, struct hwrm_cfa_ntuple_filter_alloc_input *req, struct bnxt_ntuple_filter *fltr) { + struct bnxt_rss_ctx *rss_ctx, *tmp; u16 rxq = fltr->base.rxq; + if (fltr->base.flags & BNXT_ACT_RSS_CTX) { + list_for_each_entry_safe(rss_ctx, tmp, &bp->rss_ctx_list, list) { + if (rss_ctx->index == fltr->base.fw_vnic_id) { + struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + + req->dst_id = cpu_to_le16(vnic->fw_vnic_id); + break; + } + } + return; + } if (BNXT_SUPPORTS_NTUPLE_VNIC(bp)) { struct bnxt_vnic_info *vnic; u32 enables; @@ -9944,6 +9956,8 @@ void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, bool all) { struct bnxt_vnic_info *vnic = &rss_ctx->vnic; + struct bnxt_filter_base *usr_fltr, *tmp; + struct bnxt_ntuple_filter *ntp_fltr; int i; bnxt_hwrm_vnic_free_one(bp, &rss_ctx->vnic); @@ -9954,6 +9968,18 @@ void bnxt_del_one_rss_ctx(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx, if (!all) return; + list_for_each_entry_safe(usr_fltr, tmp, &bp->usr_fltr_list, list) { + if ((usr_fltr->flags & BNXT_ACT_RSS_CTX) && + usr_fltr->fw_vnic_id == rss_ctx->index) { + ntp_fltr = container_of(usr_fltr, + struct bnxt_ntuple_filter, + base); + bnxt_hwrm_cfa_ntuple_filter_free(bp, ntp_fltr); + bnxt_del_ntp_filter(bp, ntp_fltr); + bnxt_del_one_usr_fltr(bp, usr_fltr); + } + } + if (vnic->rss_table) dma_free_coherent(&bp->pdev->dev, vnic->rss_table_size, vnic->rss_table, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 37a850959315b..0640fcb57ef84 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1374,6 +1374,7 @@ struct bnxt_filter_base { #define BNXT_ACT_RING_DST 2 #define BNXT_ACT_FUNC_DST 4 #define BNXT_ACT_NO_AGING 8 +#define BNXT_ACT_RSS_CTX 0x10 u16 sw_id; u16 rxq; u16 fw_vnic_id; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 4dbe80b11ddaf..9c49f629d5657 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1312,22 +1312,24 @@ static bool bnxt_verify_ntuple_ip6_flow(struct ethtool_usrip6_spec *ip_spec, } static int bnxt_add_ntuple_cls_rule(struct bnxt *bp, - struct ethtool_rx_flow_spec *fs) + struct ethtool_rxnfc *cmd) { - u8 vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); - u32 ring = ethtool_get_flow_spec_ring(fs->ring_cookie); + struct ethtool_rx_flow_spec *fs = &cmd->fs; struct bnxt_ntuple_filter *new_fltr, *fltr; + u32 flow_type = fs->flow_type & 0xff; struct bnxt_l2_filter *l2_fltr; struct bnxt_flow_masks *fmasks; - u32 flow_type = fs->flow_type; struct flow_keys *fkeys; - u32 idx; + u32 idx, ring; int rc; + u8 vf; if (!bp->vnic_info) return -EAGAIN; - if ((flow_type & (FLOW_MAC_EXT | FLOW_EXT)) || vf) + vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); + ring = ethtool_get_flow_spec_ring(fs->ring_cookie); + if ((fs->flow_type & (FLOW_MAC_EXT | FLOW_EXT)) || vf) return -EOPNOTSUPP; if (flow_type == IP_USER_FLOW) { @@ -1435,6 +1437,19 @@ static int bnxt_add_ntuple_cls_rule(struct bnxt *bp, rcu_read_unlock(); new_fltr->base.flags = BNXT_ACT_NO_AGING; + if (fs->flow_type & FLOW_RSS) { + struct bnxt_rss_ctx *rss_ctx; + + new_fltr->base.fw_vnic_id = 0; + new_fltr->base.flags |= BNXT_ACT_RSS_CTX; + rss_ctx = bnxt_get_rss_ctx_from_index(bp, cmd->rss_context); + if (rss_ctx) { + new_fltr->base.fw_vnic_id = rss_ctx->index; + } else { + rc = -EINVAL; + goto ntuple_err; + } + } if (fs->ring_cookie == RX_CLS_FLOW_DISC) new_fltr->base.flags |= BNXT_ACT_DROP; else @@ -1476,12 +1491,12 @@ static int bnxt_srxclsrlins(struct bnxt *bp, struct ethtool_rxnfc *cmd) flow_type == IPV6_USER_FLOW) && !(bp->fw_cap & BNXT_FW_CAP_CFA_NTUPLE_RX_EXT_IP_PROTO)) return -EOPNOTSUPP; - if (flow_type & (FLOW_MAC_EXT | FLOW_RSS)) + if (flow_type & FLOW_MAC_EXT) return -EINVAL; flow_type &= ~FLOW_EXT; if (fs->ring_cookie == RX_CLS_FLOW_DISC && flow_type != ETHER_FLOW) - return bnxt_add_ntuple_cls_rule(bp, fs); + return bnxt_add_ntuple_cls_rule(bp, cmd); ring = ethtool_get_flow_spec_ring(fs->ring_cookie); vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); @@ -1495,7 +1510,7 @@ static int bnxt_srxclsrlins(struct bnxt *bp, struct ethtool_rxnfc *cmd) if (flow_type == ETHER_FLOW) rc = bnxt_add_l2_cls_rule(bp, fs); else - rc = bnxt_add_ntuple_cls_rule(bp, fs); + rc = bnxt_add_ntuple_cls_rule(bp, cmd); return rc; } From f631ef39d81956a2ee69d25039781ceae1162f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Mon, 25 Mar 2024 20:47:34 +0000 Subject: [PATCH 042/167] net: sched: cls_api: add skip_sw counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain a count of skip_sw filters. This counter is protected by the cb_lock, and is updated at the same time as offloadcnt. Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/sched/cls_api.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae34..120a4ca6ec9b2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -471,6 +471,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ca5676b2668e1..397c3d29659cd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3483,6 +3483,8 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3491,6 +3493,8 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; + if (tc_skip_sw(*flags)) + atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } From 2081fd3445fec6b9813c20e8b910c2abd6de31cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Mon, 25 Mar 2024 20:47:35 +0000 Subject: [PATCH 043/167] net: sched: cls_api: add filter counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain a count of filters per block. Counter updates are protected by cb_lock, which is also used to protect the offload counters. Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ net/sched/cls_api.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 120a4ca6ec9b2..eb3872c22fcd1 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -422,6 +422,7 @@ struct tcf_proto { */ spinlock_t lock; bool deleting; + bool counted; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; @@ -471,6 +472,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + atomic_t filtercnt; /* Number of filters */ atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 397c3d29659cd..304a46ab0e0b3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -410,12 +410,30 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } +static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) +{ + lockdep_assert_not_held(&block->cb_lock); + + down_write(&block->cb_lock); + if (*counted != add) { + if (add) { + atomic_inc(&block->filtercnt); + *counted = true; + } else { + atomic_dec(&block->filtercnt); + *counted = false; + } + } + up_write(&block->cb_lock); +} + static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2364,6 +2382,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { + tcf_block_filter_cnt_update(block, &tp->counted, true); tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); From 047f340b36fc550c0fc6a8947fc0a1f8e429e9ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Mon, 25 Mar 2024 20:47:36 +0000 Subject: [PATCH 044/167] net: sched: make skip_sw actually skip software MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TC filters come in 3 variants: - no flag (try to process in hardware, but fallback to software)) - skip_hw (do not process filter by hardware) - skip_sw (do not process filter by software) However skip_sw is implemented so that the skip_sw flag can first be checked, after it has been matched. IMHO it's common when using skip_sw, to use it on all rules. So if all filters in a block is skip_sw filters, then we can bail early, we can thus avoid having to match the filters, just to check for the skip_sw flag. This patch adds a bypass, for when only TC skip_sw rules are used. The bypass is guarded by a static key, to avoid harming other workloads. There are 3 ways that a packet from a skip_sw ruleset, can end up in the kernel path. Although the send packets to a non-existent chain way is only improved a few percents, then I believe it's worth optimizing the trap and fall-though use-cases. +----------------------------+--------+--------+--------+ | Test description | Pre- | Post- | Rel. | | | kpps | kpps | chg. | +----------------------------+--------+--------+--------+ | basic forwarding + notrack | 3589.3 | 3587.9 | 1.00x | | switch to eswitch mode | 3081.8 | 3094.7 | 1.00x | | add ingress qdisc | 3042.9 | 3063.6 | 1.01x | | tc forward in hw / skip_sw |37024.7 |37028.4 | 1.00x | | tc forward in sw / skip_hw | 3245.0 | 3245.3 | 1.00x | +----------------------------+--------+--------+--------+ | tests with only skip_sw rules below: | +----------------------------+--------+--------+--------+ | 1 non-matching rule | 2694.7 | 3058.7 | 1.14x | | 1 n-m rule, match trap | 2611.2 | 3323.1 | 1.27x | | 1 n-m rule, goto non-chain | 2886.8 | 2945.9 | 1.02x | | 5 non-matching rules | 1958.2 | 3061.3 | 1.56x | | 5 n-m rules, match trap | 1911.9 | 3327.0 | 1.74x | | 5 n-m rules, goto non-chain| 2883.1 | 2947.5 | 1.02x | | 10 non-matching rules | 1466.3 | 3062.8 | 2.09x | | 10 n-m rules, match trap | 1444.3 | 3317.9 | 2.30x | | 10 n-m rules,goto non-chain| 2883.1 | 2939.5 | 1.02x | | 25 non-matching rules | 838.5 | 3058.9 | 3.65x | | 25 n-m rules, match trap | 824.5 | 3323.0 | 4.03x | | 25 n-m rules,goto non-chain| 2875.8 | 2944.7 | 1.02x | | 50 non-matching rules | 488.1 | 3054.7 | 6.26x | | 50 n-m rules, match trap | 484.9 | 3318.5 | 6.84x | | 50 n-m rules,goto non-chain| 2884.1 | 2939.7 | 1.02x | +----------------------------+--------+--------+--------+ perf top (25 n-m skip_sw rules - pre patch): 20.39% [kernel] [k] __skb_flow_dissect 16.43% [kernel] [k] rhashtable_jhash2 10.58% [kernel] [k] fl_classify 10.23% [kernel] [k] fl_mask_lookup 4.79% [kernel] [k] memset_orig 2.58% [kernel] [k] tcf_classify 1.47% [kernel] [k] __x86_indirect_thunk_rax 1.42% [kernel] [k] __dev_queue_xmit 1.36% [kernel] [k] nft_do_chain 1.21% [kernel] [k] __rcu_read_lock perf top (25 n-m skip_sw rules - post patch): 5.12% [kernel] [k] __dev_queue_xmit 4.77% [kernel] [k] nft_do_chain 3.65% [kernel] [k] dev_gro_receive 3.41% [kernel] [k] check_preemption_disabled 3.14% [kernel] [k] mlx5e_skb_from_cqe_mpwrq_nonlinear 2.88% [kernel] [k] __netif_receive_skb_core.constprop.0 2.49% [kernel] [k] mlx5e_xmit 2.15% [kernel] [k] ip_forward 1.95% [kernel] [k] mlx5e_tc_restore_tunnel 1.92% [kernel] [k] vlan_gro_receive Test setup: DUT: Intel Xeon D-1518 (2.20GHz) w/ Nvidia/Mellanox ConnectX-6 Dx 2x100G Data rate measured on switch (Extreme X690), and DUT connected as a router on a stick, with pktgen and pktsink as VLANs. Pktgen-dpdk was in range 36.6-37.7 Mpps 64B packets across all tests. Full test data at https://files.fiberby.net/ast/2024/tc_skip_sw/v2_tests/ Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 9 +++++++++ include/net/sch_generic.h | 1 + net/core/dev.c | 10 ++++++++++ net/sched/cls_api.c | 18 ++++++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a4ee43f493bbf..41297bd38dff7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -74,6 +74,15 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) return block && block->index; } +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); + +static inline bool tcf_block_bypass_sw(struct tcf_block *block) +{ + return block && block->bypass_wanted; +} +#endif + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index eb3872c22fcd1..76db6be160831 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -472,6 +472,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; + bool bypass_wanted; atomic_t filtercnt; /* Number of filters */ atomic_t skipswcnt; /* Number of skip_sw filters */ atomic_t offloadcnt; /* Number of oddloaded filters */ diff --git a/net/core/dev.c b/net/core/dev.c index 5d36a634f468f..c136e80dea618 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2083,6 +2083,11 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif +#ifdef CONFIG_NET_CLS_ACT +DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +EXPORT_SYMBOL(tcf_bypass_check_needed_key); +#endif + DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL @@ -3937,6 +3942,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; + if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { + if (tcf_block_bypass_sw(miniq->block)) + return ret; + } + tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 304a46ab0e0b3..db06539936323 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -410,6 +410,23 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } +static void tcf_maintain_bypass(struct tcf_block *block) +{ + int filtercnt = atomic_read(&block->filtercnt); + int skipswcnt = atomic_read(&block->skipswcnt); + bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; + + if (bypass_wanted != block->bypass_wanted) { +#ifdef CONFIG_NET_CLS_ACT + if (bypass_wanted) + static_branch_inc(&tcf_bypass_check_needed_key); + else + static_branch_dec(&tcf_bypass_check_needed_key); +#endif + block->bypass_wanted = bypass_wanted; + } +} + static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) { lockdep_assert_not_held(&block->cb_lock); @@ -424,6 +441,7 @@ static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, *counted = false; } } + tcf_maintain_bypass(block); up_write(&block->cb_lock); } From af398bd0cb2151a581027a19bd995f72af2310ac Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 26 Mar 2024 15:29:52 +0800 Subject: [PATCH 045/167] net/smc: make smc_hash_sk/smc_unhash_sk static smc_hash_sk and smc_unhash_sk are only used in af_smc.c, so make them static and remove the output symbol. They can be called under the path .prot->hash()/unhash(). Signed-off-by: Zhengchao Shao Reviewed-by: Wen Gu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/smc.h | 3 --- net/smc/af_smc.c | 6 ++---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/net/smc.h b/include/net/smc.h index c9dcb30e3fd98..10684d0a33df3 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -26,9 +26,6 @@ struct smc_hashinfo { struct hlist_head ht; }; -int smc_hash_sk(struct sock *sk); -void smc_unhash_sk(struct sock *sk); - /* SMCD/ISM device driver interface */ struct smcd_dmb { u64 dmb_tok; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4b52b3b159c0e..e8dcd28a554ce 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -177,7 +177,7 @@ static struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; -int smc_hash_sk(struct sock *sk) +static int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; @@ -191,9 +191,8 @@ int smc_hash_sk(struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(smc_hash_sk); -void smc_unhash_sk(struct sock *sk) +static void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -202,7 +201,6 @@ void smc_unhash_sk(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } -EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the From 10e52ad5ced2a7dcdb3fb18c9cef111d5f30471d Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Tue, 26 Mar 2024 09:56:49 +0100 Subject: [PATCH 046/167] net: hsr: Use full string description when opening HSR network device Up till now only single character ('A' or 'B') was used to provide information of HSR slave network device status. As it is also possible and valid, that Interlink network device may be supported as well, the description must be more verbose. As a result the full string description is now used. Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index c98b5b71ad7c3..e9d45133d6412 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -132,30 +132,29 @@ static int hsr_dev_open(struct net_device *dev) { struct hsr_priv *hsr; struct hsr_port *port; - char designation; + const char *designation = NULL; hsr = netdev_priv(dev); - designation = '\0'; hsr_for_each_port(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { case HSR_PT_SLAVE_A: - designation = 'A'; + designation = "Slave A"; break; case HSR_PT_SLAVE_B: - designation = 'B'; + designation = "Slave B"; break; default: - designation = '?'; + designation = "Unknown"; } if (!is_slave_up(port->dev)) - netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n", + netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n", designation, port->dev->name); } - if (designation == '\0') + if (!designation) netdev_warn(dev, "No slave devices configured\n"); return 0; From 385edf325efaad802e1bc8eeb710e02eb6461d73 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 26 Mar 2024 12:16:36 +0100 Subject: [PATCH 047/167] net: dsa: hellcreek: Convert to gettimex64() As of commit 916444df305e ("ptp: deprecate gettime64() in favor of gettimex64()") (new) PTP drivers should rather implement gettimex64(). In addition, this variant provides timestamps from the system clock. The readings have to be recorded right before and after reading the lowest bits of the PHC timestamp. Reported-by: Thomas Gleixner Signed-off-by: Kurt Kanzenbach Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/dsa/hirschmann/hellcreek_ptp.c | 25 +++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/dsa/hirschmann/hellcreek_ptp.c b/drivers/net/dsa/hirschmann/hellcreek_ptp.c index 5249a1c2a80b8..bfe21f9f7dcd3 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_ptp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_ptp.c @@ -27,7 +27,8 @@ void hellcreek_ptp_write(struct hellcreek *hellcreek, u16 data, } /* Get nanoseconds from PTP clock */ -static u64 hellcreek_ptp_clock_read(struct hellcreek *hellcreek) +static u64 hellcreek_ptp_clock_read(struct hellcreek *hellcreek, + struct ptp_system_timestamp *sts) { u16 nsl, nsh; @@ -45,16 +46,19 @@ static u64 hellcreek_ptp_clock_read(struct hellcreek *hellcreek) nsh = hellcreek_ptp_read(hellcreek, PR_SS_SYNC_DATA_C); nsh = hellcreek_ptp_read(hellcreek, PR_SS_SYNC_DATA_C); nsh = hellcreek_ptp_read(hellcreek, PR_SS_SYNC_DATA_C); + ptp_read_system_prets(sts); nsl = hellcreek_ptp_read(hellcreek, PR_SS_SYNC_DATA_C); + ptp_read_system_postts(sts); return (u64)nsl | ((u64)nsh << 16); } -static u64 __hellcreek_ptp_gettime(struct hellcreek *hellcreek) +static u64 __hellcreek_ptp_gettime(struct hellcreek *hellcreek, + struct ptp_system_timestamp *sts) { u64 ns; - ns = hellcreek_ptp_clock_read(hellcreek); + ns = hellcreek_ptp_clock_read(hellcreek, sts); if (ns < hellcreek->last_ts) hellcreek->seconds++; hellcreek->last_ts = ns; @@ -72,7 +76,7 @@ u64 hellcreek_ptp_gettime_seconds(struct hellcreek *hellcreek, u64 ns) { u64 s; - __hellcreek_ptp_gettime(hellcreek); + __hellcreek_ptp_gettime(hellcreek, NULL); if (hellcreek->last_ts > ns) s = hellcreek->seconds * NSEC_PER_SEC; else @@ -81,14 +85,15 @@ u64 hellcreek_ptp_gettime_seconds(struct hellcreek *hellcreek, u64 ns) return s; } -static int hellcreek_ptp_gettime(struct ptp_clock_info *ptp, - struct timespec64 *ts) +static int hellcreek_ptp_gettimex(struct ptp_clock_info *ptp, + struct timespec64 *ts, + struct ptp_system_timestamp *sts) { struct hellcreek *hellcreek = ptp_to_hellcreek(ptp); u64 ns; mutex_lock(&hellcreek->ptp_lock); - ns = __hellcreek_ptp_gettime(hellcreek); + ns = __hellcreek_ptp_gettime(hellcreek, sts); mutex_unlock(&hellcreek->ptp_lock); *ts = ns_to_timespec64(ns); @@ -184,7 +189,7 @@ static int hellcreek_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) if (abs(delta) > MAX_SLOW_OFFSET_ADJ) { struct timespec64 now, then = ns_to_timespec64(delta); - hellcreek_ptp_gettime(ptp, &now); + hellcreek_ptp_gettimex(ptp, &now, NULL); now = timespec64_add(now, then); hellcreek_ptp_settime(ptp, &now); @@ -233,7 +238,7 @@ static void hellcreek_ptp_overflow_check(struct work_struct *work) hellcreek = dw_overflow_to_hellcreek(dw); mutex_lock(&hellcreek->ptp_lock); - __hellcreek_ptp_gettime(hellcreek); + __hellcreek_ptp_gettime(hellcreek, NULL); mutex_unlock(&hellcreek->ptp_lock); schedule_delayed_work(&hellcreek->overflow_work, @@ -409,7 +414,7 @@ int hellcreek_ptp_setup(struct hellcreek *hellcreek) hellcreek->ptp_clock_info.pps = 0; hellcreek->ptp_clock_info.adjfine = hellcreek_ptp_adjfine; hellcreek->ptp_clock_info.adjtime = hellcreek_ptp_adjtime; - hellcreek->ptp_clock_info.gettime64 = hellcreek_ptp_gettime; + hellcreek->ptp_clock_info.gettimex64 = hellcreek_ptp_gettimex; hellcreek->ptp_clock_info.settime64 = hellcreek_ptp_settime; hellcreek->ptp_clock_info.enable = hellcreek_ptp_enable; hellcreek->ptp_clock_info.do_aux_work = hellcreek_hwtstamp_work; From 3d010c8031e39f5fa1e8b13ada77e0321091011f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:58 +0100 Subject: [PATCH 048/167] udp: do not accept non-tunnel GSO skbs landing in a tunnel When rx-udp-gro-forwarding is enabled UDP packets might be GROed when being forwarded. If such packets might land in a tunnel this can cause various issues and udp_gro_receive makes sure this isn't the case by looking for a matching socket. This is performed in udp4/6_gro_lookup_skb but only in the current netns. This is an issue with tunneled packets when the endpoint is in another netns. In such cases the packets will be GROed at the UDP level, which leads to various issues later on. The same thing can happen with rx-gro-list. We saw this with geneve packets being GROed at the UDP level. In such case gso_size is set; later the packet goes through the geneve rx path, the geneve header is pulled, the offset are adjusted and frag_list skbs are not adjusted with regard to geneve. When those skbs hit skb_fragment, it will misbehave. Different outcomes are possible depending on what the GROed skbs look like; from corrupted packets to kernel crashes. One example is a BUG_ON[1] triggered in skb_segment while processing the frag_list. Because gso_size is wrong (geneve header was pulled) skb_segment thinks there is "geneve header size" of data in frag_list, although it's in fact the next packet. The BUG_ON itself has nothing to do with the issue. This is only one of the potential issues. Looking up for a matching socket in udp_gro_receive is fragile: the lookup could be extended to all netns (not speaking about performances) but nothing prevents those packets from being modified in between and we could still not find a matching socket. It's OK to keep the current logic there as it should cover most cases but we also need to make sure we handle tunnel packets being GROed too early. This is done by extending the checks in udp_unexpected_gso: GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits and landing in a tunnel must be segmented. [1] kernel BUG at net/core/skbuff.c:4408! RIP: 0010:skb_segment+0xd2a/0xf70 __udp_gso_segment+0xaa/0x560 Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 28 ++++++++++++++++++++++++++++ net/ipv4/udp.c | 7 +++++++ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/udp.c | 2 +- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 3748e82b627b7..17539d0896661 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, } } +DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); +#if IS_ENABLED(CONFIG_IPV6) +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +#endif + +static inline bool udp_encap_needed(void) +{ + if (static_branch_unlikely(&udp_encap_needed_key)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (static_branch_unlikely(&udpv6_encap_needed_key)) + return true; +#endif + + return false; +} + static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) @@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; + /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still + * land in a tunnel as the socket check in udp_gro_receive cannot be + * foolproof. + */ + if (udp_encap_needed() && + READ_ONCE(udp_sk(sk)->encap_rcv) && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) + return true; + return false; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 661d0e0d273f6..c02bf011d4a6f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +EXPORT_SYMBOL(udp_encap_needed_key); + +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +EXPORT_SYMBOL(udpv6_encap_needed_key); +#endif + void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b9880743765c6..e9719afe91cf5 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -551,8 +551,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - /* we can do L4 aggregation only if the packet can't land in a tunnel - * otherwise we could corrupt the inner stream + /* We can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream. Detecting such packets + * cannot be foolproof and the aggregation might still happen in some + * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c1e6469d091d..8b1dd7f512491 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -447,7 +447,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); From ed4cccef64c1d0d5b91e69f7a8a6697c3a865486 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:59 +0100 Subject: [PATCH 049/167] gro: fix ownership transfer If packets are GROed with fraglist they might be segmented later on and continue their journey in the stack. In skb_segment_list those skbs can be reused as-is. This is an issue as their destructor was removed in skb_gro_receive_list but not the reference to their socket, and then they can't be orphaned. Fix this by also removing the reference to the socket. For example this could be observed, kernel BUG at include/linux/skbuff.h:3131! (skb_orphan) RIP: 0010:ip6_rcv_core+0x11bc/0x19a0 Call Trace: ipv6_list_rcv+0x250/0x3f0 __netif_receive_skb_list_core+0x49d/0x8f0 netif_receive_skb_list_internal+0x634/0xd40 napi_complete_done+0x1d2/0x7d0 gro_cell_poll+0x118/0x1f0 A similar construction is found in skb_gro_receive, apply the same change there. Fixes: 5e10da5385d2 ("skbuff: allow 'slow_gro' for skb carring sock reference") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/gro.c | 3 ++- net/ipv4/udp_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/gro.c b/net/core/gro.c index ee30d4f0c0387..83f35d99a682c 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -192,8 +192,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e9719afe91cf5..3bb69464930bf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->count++; p->data_len += skb->len; - /* sk owenrship - if any - completely transferred to the aggregated packet */ + /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; + skb->sk = NULL; p->truesize += skb->truesize; p->len += skb->len; From f0b8c30345565344df2e33a8417a27503589247d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:00 +0100 Subject: [PATCH 050/167] udp: do not transition UDP GRO fraglist partial checksums to unnecessary UDP GRO validates checksums and in udp4/6_gro_complete fraglist packets are converted to CHECKSUM_UNNECESSARY to avoid later checks. However this is an issue for CHECKSUM_PARTIAL packets as they can be looped in an egress path and then their partial checksums are not fixed. Different issues can be observed, from invalid checksum on packets to traces like: gen01: hw csum failure skb len=3008 headroom=160 headlen=1376 tailroom=0 mac=(106,14) net=(120,40) trans=160 shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) csum(0xffff232e ip_summed=2 complete_sw=0 valid=0 level=0) hash(0x77e3d716 sw=1 l4=1) proto=0x86dd pkttype=0 iif=12 ... Fix this by only converting CHECKSUM_NONE packets to CHECKSUM_UNNECESSARY by reusing __skb_incr_checksum_unnecessary. All other checksum types are kept as-is, including CHECKSUM_COMPLETE as fraglist packets being segmented back would have their skb->csum valid. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 8 +------- net/ipv6/udp_offload.c | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 3bb69464930bf..548476d782371 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -722,13 +722,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 312bcaeea96fb..bbd347de00b45 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - if (skb->csum_level < SKB_MAX_CSUM_LEVEL) - skb->csum_level++; - } else { - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = 0; - } + __skb_incr_checksum_unnecessary(skb); return 0; } From 64235eabc4b5b18c507c08a1f16cdac6c5661220 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:01 +0100 Subject: [PATCH 051/167] udp: prevent local UDP tunnel packets from being GROed GRO has a fundamental issue with UDP tunnel packets as it can't detect those in a foolproof way and GRO could happen before they reach the tunnel endpoint. Previous commits have fixed issues when UDP tunnel packets come from a remote host, but if those packets are issued locally they could run into checksum issues. If the inner packet has a partial checksum the information will be lost in the GRO logic, either in udp4/6_gro_complete or in udp_gro_complete_segment and packets will have an invalid checksum when leaving the host. Prevent local UDP tunnel packets from ever being GROed at the outer UDP level. Due to skb->encapsulation being wrongly used in some drivers this is actually only preventing UDP tunnel packets with a partial checksum to be GROed (see iptunnel_handle_offloads) but those were also the packets triggering issues so in practice this should be sufficient. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Suggested-by: Paolo Abeni Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 548476d782371..3498dd1d0694d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -559,6 +559,12 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { + /* If the packet was locally encapsulated in a UDP tunnel that + * wasn't detected above, do not GRO. + */ + if (skb->encapsulation) + goto out; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; From 0fb101be97ca27850c5ecdbd1269423ce4d1f607 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:34:02 +0100 Subject: [PATCH 052/167] selftests: net: gro fwd: update vxlan GRO test expectations UDP tunnel packets can't be GRO in-between their endpoints as this causes different issues. The UDP GRO fwd vxlan tests were relying on this and their expectations have to be fixed. We keep both vxlan tests and expected no GRO from happening. The vxlan UDP GRO bench test was removed as it's not providing any valuable information now. Fixes: a062260a9d5f ("selftests: net: add UDP GRO forwarding self-tests") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro_fwd.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh index 380cb15e942e4..83ed987cff340 100755 --- a/tools/testing/selftests/net/udpgro_fwd.sh +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -244,7 +244,7 @@ for family in 4 6; do create_vxlan_pair ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on - run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10 cleanup # use NAT to circumvent GRO FWD check @@ -258,13 +258,7 @@ for family in 4 6; do # load arp cache before running the test to reduce the amount of # stray traffic on top of the UDP tunnel ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null - run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST - cleanup - - create_vxlan_pair - run_bench "UDP tunnel fwd perf" $OL_NET$DST - ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on - run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST cleanup done From 0ba80d96585662299d4ea4624043759ce9015421 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 26 Mar 2024 17:51:49 +0530 Subject: [PATCH 053/167] octeontx2-af: Fix issue with loading coalesced KPU profiles The current implementation for loading coalesced KPU profiles has a limitation. The "offset" field, which is used to locate profiles within the profile is restricted to a u16. This restricts the number of profiles that can be loaded. This patch addresses this limitation by increasing the size of the "offset" field. Fixes: 11c730bfbf5b ("octeontx2-af: support for coalescing KPU profiles") Signed-off-by: Hariprasad Kelam Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index e350242bbafba..be709f83f3318 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -1657,7 +1657,7 @@ static int npc_fwdb_detect_load_prfl_img(struct rvu *rvu, uint64_t prfl_sz, struct npc_coalesced_kpu_prfl *img_data = NULL; int i = 0, rc = -EINVAL; void __iomem *kpu_prfl_addr; - u16 offset; + u32 offset; img_data = (struct npc_coalesced_kpu_prfl __force *)rvu->kpu_prfl_addr; if (le64_to_cpu(img_data->signature) == KPU_SIGN && From 4790a73ace86f3d165bbedba898e0758e6e1b82d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 14 Mar 2024 09:44:12 +0100 Subject: [PATCH 054/167] Revert "Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT" This reverts commit 7dcd3e014aa7faeeaf4047190b22d8a19a0db696. Qualcomm Bluetooth controllers like WCN6855 do not have persistent storage for the Bluetooth address and must therefore start as unconfigured to allow the user to set a valid address unless one has been provided by the boot firmware in the devicetree. A recent change snuck into v6.8-rc7 and incorrectly started marking the default (non-unique) address as valid. This specifically also breaks the Bluetooth setup for some user of the Lenovo ThinkPad X13s. Note that this is the second time Qualcomm breaks the driver this way and that this was fixed last year by commit 6945795bc81a ("Bluetooth: fix use-bdaddr-property quirk"), which also has some further details. Fixes: 7dcd3e014aa7 ("Bluetooth: hci_qca: Set BDA quirk bit if fwnode exists in DT") Cc: stable@vger.kernel.org # 6.8 Cc: Janaki Ramaiah Thota Signed-off-by: Johan Hovold Reported-by: Clayton Craft Tested-by: Clayton Craft Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 8a60ad7acd705..4ecbcb1644cc6 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -7,7 +7,6 @@ * * Copyright (C) 2007 Texas Instruments, Inc. * Copyright (c) 2010, 2012, 2018 The Linux Foundation. All rights reserved. - * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved. * * Acknowledgements: * This file is based on hci_ll.c, which was... @@ -1904,17 +1903,7 @@ static int qca_setup(struct hci_uart *hu) case QCA_WCN6750: case QCA_WCN6855: case QCA_WCN7850: - - /* Set BDA quirk bit for reading BDA value from fwnode property - * only if that property exist in DT. - */ - if (fwnode_property_present(dev_fwnode(hdev->dev.parent), "local-bd-address")) { - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); - bt_dev_info(hdev, "setting quirk bit to read BDA from fwnode later"); - } else { - bt_dev_dbg(hdev, "local-bd-address` is not present in the devicetree so not setting quirk bit for BDA"); - } - + set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); From 7003de8a226ea07d36e9461a30633af26dc79248 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:51 +0100 Subject: [PATCH 055/167] dt-bindings: bluetooth: add 'qcom,local-bd-address-broken' Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The only device out there that should be affected by this is the WCN3991 used in some Chromebooks. Add a 'qcom,local-bd-address-broken' property which can be set on these platforms to indicate that the boot firmware is using the wrong byte order. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Reviewed-by: Rob Herring Signed-off-by: Luiz Augusto von Dentz --- .../devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml index 528ef3572b621..055a3351880bc 100644 --- a/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml @@ -94,6 +94,10 @@ properties: local-bd-address: true + qcom,local-bd-address-broken: + type: boolean + description: + boot firmware is incorrectly passing the address in big-endian order required: - compatible From e12e28009e584c8f8363439f6a928ec86278a106 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:52 +0100 Subject: [PATCH 056/167] arm64: dts: qcom: sc7180-trogdor: mark bluetooth address as broken Several Qualcomm Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. The boot firmware in SC7180 Trogdor Chromebooks is known to be affected so mark the 'local-bd-address' property as broken to maintain backwards compatibility with older firmware when fixing the underlying driver bug. Note that ChromeOS always updates the kernel and devicetree in lockstep so that there is no need to handle backwards compatibility with older devicetrees. Fixes: 7ec3e67307f8 ("arm64: dts: qcom: sc7180-trogdor: add initial trogdor and lazor dt") Cc: stable@vger.kernel.org # 5.10 Cc: Rob Clark Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Acked-by: Bjorn Andersson Reviewed-by: Bjorn Andersson Signed-off-by: Luiz Augusto von Dentz --- arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi index f3a6da8b28901..5260c63db0078 100644 --- a/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi +++ b/arch/arm64/boot/dts/qcom/sc7180-trogdor.dtsi @@ -944,6 +944,8 @@ ap_spi_fp: &spi10 { vddrf-supply = <&pp1300_l2c>; vddch0-supply = <&pp3300_l10c>; max-speed = <3200000>; + + qcom,local-bd-address-broken; }; }; From 39646f29b100566451d37abc4cc8cdd583756dfe Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:53 +0100 Subject: [PATCH 057/167] Bluetooth: add quirk for broken address properties Some Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth devicetree bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. Add a new quirk that can be set on platforms with broken firmware and use it to reverse the address when parsing the property so that the underlying driver bug can be fixed. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8701ca5f31eec..5c12761cbc0e2 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -176,6 +176,15 @@ enum { */ HCI_QUIRK_USE_BDADDR_PROPERTY, + /* When this quirk is set, the Bluetooth Device Address provided by + * the 'local-bd-address' fwnode property is incorrectly specified in + * big-endian order. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BDADDR_PROPERTY_BROKEN, + /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index f6b662369322b..639090b9f4b85 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3416,7 +3416,10 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - bacpy(&hdev->public_addr, &ba); + if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + baswap(&hdev->public_addr, &ba); + else + bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { From 77f45cca8bc55d00520a192f5a7715133591c83e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:54 +0100 Subject: [PATCH 058/167] Bluetooth: qca: fix device-address endianness The WCN6855 firmware on the Lenovo ThinkPad X13s expects the Bluetooth device address in big-endian order when setting it using the EDL_WRITE_BD_ADDR_OPCODE command. Presumably, this is the case for all non-ROME devices which all use the EDL_WRITE_BD_ADDR_OPCODE command for this (unlike the ROME devices which use a different command and expect the address in little-endian order). Reverse the little-endian address before setting it to make sure that the address can be configured using tools like btmgmt or using the 'local-bd-address' devicetree property. Note that this can potentially break systems with boot firmware which has started relying on the broken behaviour and is incorrectly passing the address via devicetree in big-endian order. The only device affected by this should be the WCN3991 used in some Chromebooks. As ChromeOS updates the kernel and devicetree in lockstep, the new 'qcom,local-bd-address-broken' property can be used to determine if the firmware is buggy so that the underlying driver bug can be fixed without breaking backwards compatibility. Set the HCI_QUIRK_BDADDR_PROPERTY_BROKEN quirk for such platforms so that the address is reversed when parsing the address property. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Cc: Balakrishna Godavarthi Cc: Matthias Kaehlcke Tested-by: Nikita Travkin # sc7180 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btqca.c | 8 ++++++-- drivers/bluetooth/hci_qca.c | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index b40b32fa7f1c3..19cfc342fc7bb 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -826,11 +826,15 @@ EXPORT_SYMBOL_GPL(qca_uart_setup); int qca_set_bdaddr(struct hci_dev *hdev, const bdaddr_t *bdaddr) { + bdaddr_t bdaddr_swapped; struct sk_buff *skb; int err; - skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, bdaddr, - HCI_EV_VENDOR, HCI_INIT_TIMEOUT); + baswap(&bdaddr_swapped, bdaddr); + + skb = __hci_cmd_sync_ev(hdev, EDL_WRITE_BD_ADDR_OPCODE, 6, + &bdaddr_swapped, HCI_EV_VENDOR, + HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "QCA Change address cmd failed (%d)", err); diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 4ecbcb1644cc6..ecbc52eaf1010 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -225,6 +225,7 @@ struct qca_serdev { struct qca_power *bt_power; u32 init_speed; u32 oper_speed; + bool bdaddr_property_broken; const char *firmware_name; }; @@ -1842,6 +1843,7 @@ static int qca_setup(struct hci_uart *hu) const char *firmware_name = qca_get_firmware_name(hu); int ret; struct qca_btsoc_version ver; + struct qca_serdev *qcadev; const char *soc_name; ret = qca_check_speeds(hu); @@ -1904,6 +1906,11 @@ static int qca_setup(struct hci_uart *hu) case QCA_WCN6855: case QCA_WCN7850: set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + + qcadev = serdev_device_get_drvdata(hu->serdev); + if (qcadev->bdaddr_property_broken) + set_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks); + hci_set_aosp_capable(hdev); ret = qca_read_soc_version(hdev, &ver, soc_type); @@ -2284,6 +2291,9 @@ static int qca_serdev_probe(struct serdev_device *serdev) if (!qcadev->oper_speed) BT_DBG("UART will pick default operating speed"); + qcadev->bdaddr_property_broken = device_property_read_bool(&serdev->dev, + "qcom,local-bd-address-broken"); + if (data) qcadev->btsoc_type = data->soc_type; else From 6946b9c99bde45f3ba74e00a7af9a3458cc24bea Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 26 Mar 2024 12:43:17 -0400 Subject: [PATCH 059/167] Bluetooth: hci_sync: Fix not checking error on hci_cmd_sync_cancel_sync hci_cmd_sync_cancel_sync shall check the error passed to it since it will be propagated using req_result which is __u32 it needs to be properly set to a positive value if it was passed as negative othertise IS_ERR will not trigger as -(errno) would be converted to a positive value. Fixes: 63298d6e752f ("Bluetooth: hci_core: Cancel request on command timeout") Signed-off-by: Luiz Augusto von Dentz Reported-and-tested-by: Thorsten Leemhuis Closes: https://lore.kernel.org/all/08275279-7462-4f4a-a0ee-8aa015f829bc@leemhuis.info/ --- net/bluetooth/hci_core.c | 6 +++--- net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1690ae57a09db..a7028d38c1f5c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2874,7 +2874,7 @@ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); - hci_cmd_sync_cancel_sync(hdev, -err); + hci_cmd_sync_cancel_sync(hdev, err); } /* Suspend HCI device */ @@ -2894,7 +2894,7 @@ int hci_suspend_dev(struct hci_dev *hdev) return 0; /* Cancel potentially blocking sync operation before suspend */ - hci_cancel_cmd_sync(hdev, -EHOSTDOWN); + hci_cancel_cmd_sync(hdev, EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); @@ -4210,7 +4210,7 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) err = hci_send_frame(hdev, skb); if (err < 0) { - hci_cmd_sync_cancel_sync(hdev, err); + hci_cmd_sync_cancel_sync(hdev, -err); return; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 639090b9f4b85..8fe02921adf15 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -617,7 +617,10 @@ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { - hdev->req_result = err; + /* req_result is __u32 so error must be positive to be properly + * propagated. + */ + hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); From c569242cd49287d53b73a94233db40097d838535 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 27 Mar 2024 12:30:30 +0800 Subject: [PATCH 060/167] Bluetooth: hci_event: set the conn encrypted before conn establishes We have a BT headset (Lenovo Thinkplus XT99), the pairing and connecting has no problem, once this headset is paired, bluez will remember this device and will auto re-connect it whenever the device is powered on. The auto re-connecting works well with Windows and Android, but with Linux, it always fails. Through debugging, we found at the rfcomm connection stage, the bluetooth stack reports "Connection refused - security block (0x0003)". For this device, the re-connecting negotiation process is different from other BT headsets, it sends the Link_KEY_REQUEST command before the CONNECT_REQUEST completes, and it doesn't send ENCRYPT_CHANGE command during the negotiation. When the device sends the "connect complete" to hci, the ev->encr_mode is 1. So here in the conn_complete_evt(), if ev->encr_mode is 1, link type is ACL and HCI_CONN_ENCRYPT is not set, we set HCI_CONN_ENCRYPT to this conn, and update conn->enc_key_size accordingly. After this change, this BT headset could re-connect with Linux successfully. This is the btmon log after applying the patch, after receiving the "Connect Complete" with "Encryption: Enabled", will send the command to read encryption key size: > HCI Event: Connect Request (0x04) plen 10 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Class: 0x240404 Major class: Audio/Video (headset, speaker, stereo, video, vcr) Minor class: Wearable Headset Device Rendering (Printing, Speaker) Audio (Speaker, Microphone, Headset) Link type: ACL (0x01) ... > HCI Event: Link Key Request (0x17) plen 6 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) < HCI Command: Link Key Request Reply (0x01|0x000b) plen 22 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link key: ${32-hex-digits-key} ... > HCI Event: Connect Complete (0x03) plen 11 Status: Success (0x00) Handle: 256 Address: 8C:3C:AA:D8:11:67 (OUI 8C-3C-AA) Link type: ACL (0x01) Encryption: Enabled (0x01) < HCI Command: Read Encryption Key... (0x05|0x0008) plen 2 Handle: 256 < ACL Data TX: Handle 256 flags 0x00 dlen 10 L2CAP: Information Request (0x0a) ident 1 len 2 Type: Extended features supported (0x0002) > HCI Event: Command Complete (0x0e) plen 7 Read Encryption Key Size (0x05|0x0008) ncmd 1 Status: Success (0x00) Handle: 256 Key size: 16 Cc: stable@vger.kernel.org Link: https://github.com/bluez/bluez/issues/704 Reviewed-by: Paul Menzel Reviewed-by: Luiz Augusto von Dentz Signed-off-by: Hui Wang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4ae2248240121..a8b8cfebe0180 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3208,6 +3208,31 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (test_bit(HCI_ENCRYPT, &hdev->flags)) set_bit(HCI_CONN_ENCRYPT, &conn->flags); + /* "Link key request" completed ahead of "connect request" completes */ + if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && + ev->link_type == ACL_LINK) { + struct link_key *key; + struct hci_cp_read_enc_key_size cp; + + key = hci_find_link_key(hdev, &ev->bdaddr); + if (key) { + set_bit(HCI_CONN_ENCRYPT, &conn->flags); + + if (!(hdev->commands[20] & 0x10)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } else { + cp.handle = cpu_to_le16(conn->handle); + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "sending read key size failed"); + conn->enc_key_size = HCI_LINK_KEY_SIZE; + } + } + + hci_encrypt_cfm(conn, ev->status); + } + } + /* Get remote features */ if (conn->type == ACL_LINK) { struct hci_cp_read_remote_features cp; From 7835fcfd132eb88b87e8eb901f88436f63ab60f7 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Wed, 27 Mar 2024 15:24:56 +0100 Subject: [PATCH 061/167] Bluetooth: Fix TOCTOU in HCI debugfs implementation struct hci_dev members conn_info_max_age, conn_info_min_age, le_conn_max_interval, le_conn_min_interval, le_adv_max_interval, and le_adv_min_interval can be modified from the HCI core code, as well through debugfs. The debugfs implementation, that's only available to privileged users, will check for boundaries, making sure that the minimum value being set is strictly above the maximum value that already exists, and vice-versa. However, as both minimum and maximum values can be changed concurrently to us modifying them, we need to make sure that the value we check is the value we end up using. For example, with ->conn_info_max_age set to 10, conn_info_min_age_set() gets called from vfs handlers to set conn_info_min_age to 8. In conn_info_min_age_set(), this goes through: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; Concurrently, conn_info_max_age_set() gets called to set to set the conn_info_max_age to 7: if (val == 0 || val > hdev->conn_info_max_age) return -EINVAL; That check will also pass because we used the old value (10) for conn_info_max_age. After those checks that both passed, the struct hci_dev access is mutex-locked, disabling concurrent access, but that does not matter because the invalid value checks both passed, and we'll end up with conn_info_min_age = 8 and conn_info_max_age = 7 To fix this problem, we need to lock the structure access before so the check and assignment are not interrupted. This fix was originally devised by the BassCheck[1] team, and considered the problem to be an atomicity one. This isn't the case as there aren't any concerns about the variable changing while we check it, but rather after we check it parallel to another change. This patch fixes CVE-2024-24858 and CVE-2024-24857. [1] https://sites.google.com/view/basscheck/ Co-developed-by: Gui-Dong Han <2045gemini@gmail.com> Signed-off-by: Gui-Dong Han <2045gemini@gmail.com> Link: https://lore.kernel.org/linux-bluetooth/20231222161317.6255-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24858 Link: https://lore.kernel.org/linux-bluetooth/20231222162931.6553-1-2045gemini@gmail.com/ Link: https://lore.kernel.org/linux-bluetooth/20231222162310.6461-1-2045gemini@gmail.com/ Link: https://nvd.nist.gov/vuln/detail/CVE-2024-24857 Fixes: 31ad169148df ("Bluetooth: Add conn info lifetime parameters to debugfs") Fixes: 729a1051da6f ("Bluetooth: Expose default LE advertising interval via debugfs") Fixes: 71c3b60ec6d2 ("Bluetooth: Move BR/EDR debugfs file creation into hci_debugfs.c") Signed-off-by: Bastien Nocera Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_debugfs.c | 48 ++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 233453807b509..ce3ff2fa72e58 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -218,10 +218,12 @@ static int conn_info_min_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val > hdev->conn_info_max_age) + hci_dev_lock(hdev); + if (val == 0 || val > hdev->conn_info_max_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_min_age = val; hci_dev_unlock(hdev); @@ -246,10 +248,12 @@ static int conn_info_max_age_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val < hdev->conn_info_min_age) + hci_dev_lock(hdev); + if (val == 0 || val < hdev->conn_info_min_age) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->conn_info_max_age = val; hci_dev_unlock(hdev); @@ -567,10 +571,12 @@ static int sniff_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val > hdev->sniff_max_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val > hdev->sniff_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_min_interval = val; hci_dev_unlock(hdev); @@ -595,10 +601,12 @@ static int sniff_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val == 0 || val % 2 || val < hdev->sniff_min_interval) + hci_dev_lock(hdev); + if (val == 0 || val % 2 || val < hdev->sniff_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->sniff_max_interval = val; hci_dev_unlock(hdev); @@ -850,10 +858,12 @@ static int conn_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_min_interval = val; hci_dev_unlock(hdev); @@ -878,10 +888,12 @@ static int conn_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) + hci_dev_lock(hdev); + if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_conn_max_interval = val; hci_dev_unlock(hdev); @@ -990,10 +1002,12 @@ static int adv_min_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_min_interval = val; hci_dev_unlock(hdev); @@ -1018,10 +1032,12 @@ static int adv_max_interval_set(void *data, u64 val) { struct hci_dev *hdev = data; - if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) + hci_dev_lock(hdev); + if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) { + hci_dev_unlock(hdev); return -EINVAL; + } - hci_dev_lock(hdev); hdev->le_adv_max_interval = val; hci_dev_unlock(hdev); From 50e2907ef8bb52cf80ecde9eec5c4dac07177146 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Mar 2024 19:12:06 +0000 Subject: [PATCH 062/167] tcp/dccp: bypass empty buckets in inet_twsk_purge() TCP ehash table is often sparsely populated. inet_twsk_purge() spends too much time calling cond_resched(). This patch can reduce time spent in inet_twsk_purge() by 20x. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240327191206.508114-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_timewait_sock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e8de45d34d56a..b0cc07d9a568c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -266,12 +266,17 @@ EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { + struct inet_ehash_bucket *head = &hashinfo->ehash[0]; + unsigned int ehash_mask = hashinfo->ehash_mask; struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; - for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + for (slot = 0; slot <= ehash_mask; slot++, head++) { + + if (hlist_nulls_empty(&head->chain)) + continue; + restart_rcu: cond_resched(); rcu_read_lock(); From 1fbfdfaa590248c1d86407f578e40e5c65136330 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:11 -0700 Subject: [PATCH 063/167] af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. We will replace the garbage collection algorithm for AF_UNIX, where we will consider each inflight AF_UNIX socket as a vertex and its file descriptor as an edge in a directed graph. This patch introduces a new struct unix_vertex representing a vertex in the graph and adds its pointer to struct unix_sock. When we send a fd using the SCM_RIGHTS message, we allocate struct scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump each refcount of the inflight fds' struct file and save them in scm_fp_list.fp. After that, unix_attach_fds() inexplicably clones scm_fp_list of scm_cookie and sets it to skb. (We will remove this part after replacing GC.) Here, we add a new function call in unix_attach_fds() to preallocate struct unix_vertex per inflight AF_UNIX fd and link each vertex to skb's scm_fp_list.vertices. When sendmsg() succeeds later, if the socket of the inflight fd is still not inflight yet, we will set the preallocated vertex to struct unix_sock.vertex and link it to a global list unix_unvisited_vertices under spin_lock(&unix_gc_lock). If the socket is already inflight, we free the preallocated vertex. This is to avoid taking the lock unnecessarily when sendmsg() could fail later. In the following patch, we will similarly allocate another struct per edge, which will finally be linked to the inflight socket's unix_vertex.edges. And then, we will count the number of edges as unix_vertex.out_degree. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 9 +++++++++ include/net/scm.h | 3 +++ net/core/scm.c | 7 +++++++ net/unix/af_unix.c | 6 ++++++ net/unix/garbage.c | 38 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 627ea8e2d9159..c270877a5256d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight; void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); +struct unix_vertex { + struct list_head edges; + struct list_head entry; + unsigned long out_degree; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) @@ -62,6 +70,7 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; + struct unix_vertex *vertex; struct list_head link; unsigned long inflight; spinlock_t lock; diff --git a/include/net/scm.h b/include/net/scm.h index 92276a2c55436..e34321b6e204b 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -27,6 +27,9 @@ struct scm_fp_list { short count; short count_unix; short max; +#ifdef CONFIG_UNIX + struct list_head vertices; +#endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; diff --git a/net/core/scm.c b/net/core/scm.c index 9cd4b0a01cd60..87dfec1c33787 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -376,8 +379,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5b41e2321209a..a3b25d3115608 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,6 +980,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->inflight = 0; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); @@ -1805,6 +1806,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + return 0; } @@ -1815,6 +1819,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; + unix_destroy_fpl(scm->fp); + for (i = scm->fp->count - 1; i >= 0; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index fa39b62652385..75bdf66b81df7 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static void unix_free_vertices(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } +} + +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + return 0; + +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) +{ + unix_free_vertices(fpl); +} + DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); From 29b64e354029cfcf1eea4d91b146c7b769305930 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:12 -0700 Subject: [PATCH 064/167] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. As with the previous patch, we preallocate to skb's scm_fp_list an array of struct unix_edge in the number of inflight AF_UNIX fds. There we just preallocate memory and do not use immediately because sendmsg() could fail after this point. The actual use will be in the next patch. When we queue skb with inflight edges, we will set the inflight socket's unix_sock as unix_edge->predecessor and the receiver's unix_sock as successor, and then we will link the edge to the inflight socket's unix_vertex.edges. Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup() so that MSG_PEEK does not change the shape of the directed graph. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 6 ++++++ include/net/scm.h | 5 +++++ net/core/scm.c | 2 ++ net/unix/garbage.c | 6 ++++++ 4 files changed, 19 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index c270877a5256d..55c4abc26a714 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -33,6 +33,12 @@ struct unix_vertex { unsigned long out_degree; }; +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/include/net/scm.h b/include/net/scm.h index e34321b6e204b..5f5154e5096dc 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -23,12 +23,17 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; short count_unix; short max; #ifdef CONFIG_UNIX struct list_head vertices; + struct unix_edge *edges; #endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index 87dfec1c33787..1bcc8a2d65e3f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) + fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif } @@ -383,6 +384,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); #if IS_ENABLED(CONFIG_UNIX) + new_fpl->edges = NULL; INIT_LIST_HEAD(&new_fpl->vertices); #endif } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 75bdf66b81df7..f31917683288b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) list_add(&vertex->entry, &fpl->vertices); } + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + return 0; err: @@ -136,6 +141,7 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) void unix_destroy_fpl(struct scm_fp_list *fpl) { + kvfree(fpl->edges); unix_free_vertices(fpl); } From 42f298c06b30bfe0a8cbee5d38644e618699e26e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:13 -0700 Subject: [PATCH 065/167] af_unix: Link struct unix_edge when queuing skb. Just before queuing skb with inflight fds, we call scm_stat_add(), which is a good place to set up the preallocated struct unix_vertex and struct unix_edge in UNIXCB(skb).fp. Then, we call unix_add_edges() and construct the directed graph as follows: 1. Set the inflight socket's unix_sock to unix_edge.predecessor. 2. Set the receiver's unix_sock to unix_edge.successor. 3. Set the preallocated vertex to inflight socket's unix_sock.vertex. 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices. 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges. Let's say we pass the fd of AF_UNIX socket A to B and the fd of B to C. The graph looks like this: +-------------------------+ | unix_unvisited_vertices | <-------------------------. +-------------------------+ | + | | +--------------+ +--------------+ | +--------------+ | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C | | +--------------+ | | +--------------+ | | | +--------------+ | .-+ | vertex | | | .-+ | vertex | | | | | vertex | | | +--------------+ | | | +--------------+ | | | +--------------+ | | | | | | | | | | +--------------+ | | | +--------------+ | | | | '-> | unix_vertex | | | '-> | unix_vertex | | | | | +--------------+ | | +--------------+ | | | `---> | entry | +---------> | entry | +-' | | |--------------| | | |--------------| | | | edges | <-. | | | edges | <-. | | +--------------+ | | | +--------------+ | | | | | | | | | .----------------------' | | .----------------------' | | | | | | | | | +--------------+ | | | +--------------+ | | | | unix_edge | | | | | unix_edge | | | | +--------------+ | | | +--------------+ | | `-> | vertex_entry | | | `-> | vertex_entry | | | |--------------| | | |--------------| | | | predecessor | +---' | | predecessor | +---' | |--------------| | |--------------| | | successor | +-----' | successor | +-----' +--------------+ +--------------+ Henceforth, we denote such a graph as A -> B (-> C). Now, we can express all inflight fd graphs that do not contain embryo sockets. We will support the particular case later. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 + include/net/scm.h | 1 + net/core/scm.c | 2 + net/unix/af_unix.c | 8 +++- net/unix/garbage.c | 90 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 100 insertions(+), 3 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 55c4abc26a714..f31ad11663464 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight; void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/include/net/scm.h b/include/net/scm.h index 5f5154e5096dc..bbc5527809d1d 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -32,6 +32,7 @@ struct scm_fp_list { short count_unix; short max; #ifdef CONFIG_UNIX + bool inflight; struct list_head vertices; struct unix_edge *edges; #endif diff --git a/net/core/scm.c b/net/core/scm.c index 1bcc8a2d65e3f..5763f33203586 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif @@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); #if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; new_fpl->edges = NULL; INIT_LIST_HEAD(&new_fpl->vertices); #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a3b25d3115608..24adbc4d51886 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1943,8 +1943,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1952,8 +1954,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f31917683288b..36d6659360965 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static LIST_HEAD(unix_unvisited_vertices); + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; + } + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); +} + +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + list_del(&edge->vertex_entry); + vertex->out_degree--; + + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); + } +} + static void unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; @@ -111,6 +143,60 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } +DEFINE_SPINLOCK(unix_gc_lock); + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) +{ + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); +} + +void unix_del_edges(struct scm_fp_list *fpl) +{ + int i = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -141,11 +227,13 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) void unix_destroy_fpl(struct scm_fp_list *fpl) { + if (fpl->inflight) + unix_del_edges(fpl); + kvfree(fpl->edges); unix_free_vertices(fpl); } -DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); From 22c3c0c52d32f41cc38cd936ea0c93f22ced3315 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:14 -0700 Subject: [PATCH 066/167] af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. Currently, we track the number of inflight sockets in two variables. unix_tot_inflight is the total number of inflight AF_UNIX sockets on the host, and user->unix_inflight is the number of inflight fds per user. We update them one by one in unix_inflight(), which can be done once in batch. Also, sendmsg() could fail even after unix_inflight(), then we need to acquire unix_gc_lock only to decrement the counters. Let's bulk update the counters in unix_add_edges() and unix_del_edges(), which is called only for successfully passed fds. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 36d6659360965..84c8ea524a980 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -144,6 +144,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { @@ -168,7 +169,10 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) unix_add_edge(fpl, edge); } while (i < fpl->count_unix); + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + spin_unlock(&unix_gc_lock); fpl->inflight = true; @@ -191,7 +195,10 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + spin_unlock(&unix_gc_lock); fpl->inflight = false; @@ -234,7 +241,6 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } -unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); @@ -255,13 +261,8 @@ void unix_inflight(struct user_struct *user, struct file *filp) WARN_ON_ONCE(list_empty(&u->link)); } u->inflight++; - - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); } @@ -278,13 +279,8 @@ void unix_notinflight(struct user_struct *user, struct file *filp) u->inflight--; if (!u->inflight) list_del_init(&u->link); - - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); } From 6ba76fd2848e107594ea4f03b737230f74bc23ea Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:15 -0700 Subject: [PATCH 067/167] af_unix: Iterate all vertices by DFS. The new GC will use a depth first search graph algorithm to find cyclic references. The algorithm visits every vertex exactly once. Here, we implement the DFS part without recursion so that no one can abuse it. unix_walk_scc() marks every vertex unvisited by initialising index as UNIX_VERTEX_INDEX_UNVISITED and iterates inflight vertices in unix_unvisited_vertices and call __unix_walk_scc() to start DFS from an arbitrary vertex. __unix_walk_scc() iterates all edges starting from the vertex and explores the neighbour vertices with DFS using edge_stack. After visiting all neighbours, __unix_walk_scc() moves the visited vertex to unix_visited_vertices so that unix_walk_scc() will not restart DFS from the visited vertex. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 ++ net/unix/garbage.c | 74 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index f31ad11663464..970a91da22393 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -33,12 +33,14 @@ struct unix_vertex { struct list_head edges; struct list_head entry; unsigned long out_degree; + unsigned long index; }; struct unix_edge { struct unix_sock *predecessor; struct unix_sock *successor; struct list_head vertex_entry; + struct list_head stack_entry; }; struct sock *unix_peer_get(struct sock *sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 84c8ea524a980..8b16ab9e240e6 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct file *filp) static LIST_HEAD(unix_unvisited_vertices); +enum unix_vertex_index { + UNIX_VERTEX_INDEX_UNVISITED, + UNIX_VERTEX_INDEX_START, +}; + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; @@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static LIST_HEAD(unix_visited_vertices); + +static void __unix_walk_scc(struct unix_vertex *vertex) +{ + unsigned long index = UNIX_VERTEX_INDEX_START; + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + vertex->index = index; + index++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = edge->successor->vertex; + + if (!next_vertex) + continue; + + if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); + + vertex = next_vertex; + goto next_vertex; + + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + vertex = edge->predecessor->vertex; + } + } + + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(void) +{ + struct unix_vertex *vertex; + + list_for_each_entry(vertex, &unix_unvisited_vertices, entry) + vertex->index = UNIX_VERTEX_INDEX_UNVISITED; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. + */ + while (!list_empty(&unix_unvisited_vertices)) { + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); @@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct *work) spin_lock(&unix_gc_lock); + unix_walk_scc(); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. From 3484f063172dd88776b062046d721d7c2ae1af7c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:16 -0700 Subject: [PATCH 068/167] af_unix: Detect Strongly Connected Components. In the new GC, we use a simple graph algorithm, Tarjan's Strongly Connected Components (SCC) algorithm, to find cyclic references. The algorithm visits every vertex exactly once using depth-first search (DFS). DFS starts by pushing an input vertex to a stack and assigning it a unique number. Two fields, index and lowlink, are initialised with the number, but lowlink could be updated later during DFS. If a vertex has an edge to an unvisited inflight vertex, we visit it and do the same processing. So, we will have vertices in the stack in the order they appear and number them consecutively in the same order. If a vertex has a back-edge to a visited vertex in the stack, we update the predecessor's lowlink with the successor's index. After iterating edges from the vertex, we check if its index equals its lowlink. If the lowlink is different from the index, it shows there was a back-edge. Then, we go backtracking and propagate the lowlink to its predecessor and resume the previous edge iteration from the next edge. If the lowlink is the same as the index, we pop vertices before and including the vertex from the stack. Then, the set of vertices is SCC, possibly forming a cycle. At the same time, we move the vertices to unix_visited_vertices. When we finish the algorithm, all vertices in each SCC will be linked via unix_vertex.scc_entry. Let's take an example. We have a graph including five inflight vertices (F is not inflight): A -> B -> C -> D -> E (-> F) ^ | `---------' Suppose that we start DFS from C. We will visit C, D, and B first and initialise their index and lowlink. Then, the stack looks like this: > B = (3, 3) (index, lowlink) D = (2, 2) C = (1, 1) When checking B's edge to C, we update B's lowlink with C's index and propagate it to D. B = (3, 1) (index, lowlink) > D = (2, 1) C = (1, 1) Next, we visit E, which has no edge to an inflight vertex. > E = (4, 4) (index, lowlink) B = (3, 1) D = (2, 1) C = (1, 1) When we leave from E, its index and lowlink are the same, so we pop E from the stack as single-vertex SCC. Next, we leave from B and D but do nothing because their lowlink are different from their index. B = (3, 1) (index, lowlink) D = (2, 1) > C = (1, 1) Then, we leave from C, whose index and lowlink are the same, so we pop B, D and C as SCC. Last, we do DFS for the rest of vertices, A, which is also a single-vertex SCC. Finally, each unix_vertex.scc_entry is linked as follows: A -. B -> C -> D E -. ^ | ^ | ^ | `--' `---------' `--' We use SCC later to decide whether we can garbage-collect the sockets. Note that we still cannot detect SCC properly if an edge points to an embryo socket. The following two patches will sort it out. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 3 +++ net/unix/garbage.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 970a91da22393..67736767b6168 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list *fpl); struct unix_vertex { struct list_head edges; struct list_head entry; + struct list_head scc_entry; unsigned long out_degree; unsigned long index; + unsigned long lowlink; + bool on_stack; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8b16ab9e240e6..33aadaa35346b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices); static void __unix_walk_scc(struct unix_vertex *vertex) { unsigned long index = UNIX_VERTEX_INDEX_START; + LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); next_vertex: + /* Push vertex to vertex_stack. + * The vertex will be popped when finalising SCC later. + */ + vertex->on_stack = true; + list_add(&vertex->scc_entry, &vertex_stack); + vertex->index = index; + vertex->lowlink = index; index++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ @@ -283,12 +291,46 @@ static void __unix_walk_scc(struct unix_vertex *vertex) edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); list_del_init(&edge->stack_entry); + next_vertex = vertex; vertex = edge->predecessor->vertex; + + /* If the successor has a smaller lowlink, two vertices + * are in the same SCC, so propagate the smaller lowlink + * to skip SCC finalisation. + */ + vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); + } else if (next_vertex->on_stack) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are + * in the same SCC. If the successor has a smaller index, + * propagate it to skip SCC finalisation. + */ + vertex->lowlink = min(vertex->lowlink, next_vertex->index); + } else { + /* The successor was already grouped as another SCC */ } } - /* Don't restart DFS from this vertex in unix_walk_scc(). */ - list_move_tail(&vertex->entry, &unix_visited_vertices); + if (vertex->index == vertex->lowlink) { + struct list_head scc; + + /* SCC finalised. + * + * If the lowlink was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + vertex->on_stack = false; + } + + list_del(&scc); + } /* Need backtracking ? */ if (!list_empty(&edge_stack)) From aed6ecef55d70de3762ce41c561b7f547dbaf107 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:17 -0700 Subject: [PATCH 069/167] af_unix: Save listener for embryo socket. This is a prep patch for the following change, where we need to fetch the listening socket from the successor embryo socket during GC. We add a new field to struct unix_sock to save a pointer to a listening socket. We set it when connect() creates a new socket, and clear it when accept() is called. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 67736767b6168..dc74691911957 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -83,6 +83,7 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; + struct sock *listener; struct unix_vertex *vertex; struct list_head link; unsigned long inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 24adbc4d51886..af74e7ebc35a0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -979,6 +979,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); + u->listener = NULL; u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; @@ -1598,6 +1599,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1693,8 +1695,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1719,6 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; + unix_sk(tsk)->listener = NULL; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); From dcf70df2048d27c5d186f013f101a4aefd63aa41 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:18 -0700 Subject: [PATCH 070/167] af_unix: Fix up unix_edge.successor for embryo socket. To garbage collect inflight AF_UNIX sockets, we must define the cyclic reference appropriately. This is a bit tricky if the loop consists of embryo sockets. Suppose that the fd of AF_UNIX socket A is passed to D and the fd B to C and that C and D are embryo sockets of A and B, respectively. It may appear that there are two separate graphs, A (-> D) and B (-> C), but this is not correct. A --. .-- B X C <-' `-> D Now, D holds A's refcount, and C has B's refcount, so unix_release() will never be called for A and B when we close() them. However, no one can call close() for D and C to free skbs holding refcounts of A and B because C/D is in A/B's receive queue, which should have been purged by unix_release() for A and B. So, here's another type of cyclic reference. When a fd of an AF_UNIX socket is passed to an embryo socket, the reference is indirectly held by its parent listening socket. .-> A .-> B | `- sk_receive_queue | `- sk_receive_queue | `- skb | `- skb | `- sk == C | `- sk == D | `- sk_receive_queue | `- sk_receive_queue | `- skb +---------' `- skb +-. | | `---------------------------------------------------------' Technically, the graph must be denoted as A <-> B instead of A (-> D) and B (-> C) to find such a cyclic reference without touching each socket's receive queue. .-> A --. .-- B <-. | X | == A <-> B `-- C <-' `-> D --' We apply this fixup during GC by fetching the real successor by unix_edge_successor(). When we call accept(), we clear unix_sock.listener under unix_gc_lock not to confuse GC. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 +- net/unix/garbage.c | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index dc74691911957..414463803b7e6 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index af74e7ebc35a0..ae77e2dc0dae4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1721,7 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_sk(tsk)->listener = NULL; + unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 33aadaa35346b..8d0912c1d01a2 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) +{ + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { @@ -209,6 +220,13 @@ void unix_del_edges(struct scm_fp_list *fpl) fpl->inflight = false; } +void unix_update_edges(struct unix_sock *receiver) +{ + spin_lock(&unix_gc_lock); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -268,7 +286,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex) /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { - struct unix_vertex *next_vertex = edge->successor->vertex; + struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; From ba31b4a4e1018f5844c6eb31734976e2184f2f9a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:19 -0700 Subject: [PATCH 071/167] af_unix: Save O(n) setup of Tarjan's algo. Before starting Tarjan's algorithm, we need to mark all vertices as unvisited. We can save this O(n) setup by reserving two special indices (0, 1) and using two variables. The first time we link a vertex to unix_unvisited_vertices, we set unix_vertex_unvisited_index to index. During DFS, we can see that the index of unvisited vertices is the same as unix_vertex_unvisited_index. When we finalise SCC later, we set unix_vertex_grouped_index to each vertex's index. Then, we can know (i) that the vertex is on the stack if the index of a visited vertex is >= 2 and (ii) that it is not on the stack and belongs to a different SCC if the index is unix_vertex_grouped_index. After the whole algorithm, all indices of vertices are set as unix_vertex_grouped_index. Next time we start DFS, we know that all unvisited vertices have unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index as the not-on-stack marker. To use the same variable in __unix_walk_scc(), we can swap unix_vertex_(grouped|unvisited)_index at the end of Tarjan's algorithm. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 - net/unix/garbage.c | 26 +++++++++++++++----------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 414463803b7e6..ec040caaa4b5b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -37,7 +37,6 @@ struct unix_vertex { unsigned long out_degree; unsigned long index; unsigned long lowlink; - bool on_stack; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8d0912c1d01a2..cec6189126ba1 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { - UNIX_VERTEX_INDEX_UNVISITED, + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, UNIX_VERTEX_INDEX_START, }; +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); @@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) } static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; static void __unix_walk_scc(struct unix_vertex *vertex) { @@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_vertex *vertex) LIST_HEAD(edge_stack); next_vertex: - /* Push vertex to vertex_stack. + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). * The vertex will be popped when finalising SCC later. */ - vertex->on_stack = true; list_add(&vertex->scc_entry, &vertex_stack); vertex->index = index; @@ -291,7 +296,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex) if (!next_vertex) continue; - if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { + if (next_vertex->index == unix_vertex_unvisited_index) { /* Iterative deepening depth first search * * 1. Push a forward edge to edge_stack and set @@ -317,7 +322,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex) * to skip SCC finalisation. */ vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); - } else if (next_vertex->on_stack) { + } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * * The successor is on vertex_stack, so two vertices are @@ -344,7 +349,8 @@ static void __unix_walk_scc(struct unix_vertex *vertex) /* Don't restart DFS from this vertex in unix_walk_scc(). */ list_move_tail(&vertex->entry, &unix_visited_vertices); - vertex->on_stack = false; + /* Mark vertex as off-stack. */ + vertex->index = unix_vertex_grouped_index; } list_del(&scc); @@ -357,20 +363,18 @@ static void __unix_walk_scc(struct unix_vertex *vertex) static void unix_walk_scc(void) { - struct unix_vertex *vertex; - - list_for_each_entry(vertex, &unix_unvisited_vertices, entry) - vertex->index = UNIX_VERTEX_INDEX_UNVISITED; - /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); __unix_walk_scc(vertex); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); } static LIST_HEAD(gc_candidates); From 77e5593aebba823bcbcf2c4b58b07efcd63933b8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:20 -0700 Subject: [PATCH 072/167] af_unix: Skip GC if no cycle exists. We do not need to run GC if there is no possible cyclic reference. We use unix_graph_maybe_cyclic to decide if we should run GC. If a fd of an AF_UNIX socket is passed to an already inflight AF_UNIX socket, they could form a cyclic reference. Then, we set true to unix_graph_maybe_cyclic and later run Tarjan's algorithm to group them into SCC. Once we run Tarjan's algorithm, we are 100% sure whether cyclic references exist or not. If there is no cycle, we set false to unix_graph_maybe_cyclic and can skip the entire garbage collection next time. When finalising SCC, we set true to unix_graph_maybe_cyclic if SCC consists of multiple vertices. Even if SCC is a single vertex, a cycle might exist as self-fd passing. Given the corner case is rare, we detect it by checking all edges of the vertex and set true to unix_graph_maybe_cyclic. With this change, __unix_gc() is just a spin_lock() dance in the normal usage. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 48 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index cec6189126ba1..5a1fae78d6dc3 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -112,6 +112,19 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) return edge->successor->vertex; } +static bool unix_graph_maybe_cyclic; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; +} + static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { @@ -138,12 +151,16 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) vertex->out_degree++; list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); } static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; + unix_update_graph(unix_edge_successor(edge)); + list_del(&edge->vertex_entry); vertex->out_degree--; @@ -227,6 +244,7 @@ void unix_del_edges(struct scm_fp_list *fpl) void unix_update_edges(struct unix_sock *receiver) { spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); receiver->listener = NULL; spin_unlock(&unix_gc_lock); } @@ -268,6 +286,26 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; + + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; @@ -353,6 +391,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex) vertex->index = unix_vertex_grouped_index; } + if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + list_del(&scc); } @@ -363,6 +404,8 @@ static void __unix_walk_scc(struct unix_vertex *vertex) static void unix_walk_scc(void) { + unix_graph_maybe_cyclic = false; + /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ @@ -524,6 +567,9 @@ static void __unix_gc(struct work_struct *work) spin_lock(&unix_gc_lock); + if (!unix_graph_maybe_cyclic) + goto skip_gc; + unix_walk_scc(); /* First, select candidates for garbage collection. Only @@ -617,7 +663,7 @@ static void __unix_gc(struct work_struct *work) /* All candidates should have been detached by now. */ WARN_ON_ONCE(!list_empty(&gc_candidates)); - +skip_gc: /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); From ad081928a8b0f57f269df999a28087fce6f2b6ce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:21 -0700 Subject: [PATCH 073/167] af_unix: Avoid Tarjan's algorithm if unnecessary. Once a cyclic reference is formed, we need to run GC to check if there is dead SCC. However, we do not need to run Tarjan's algorithm if we know that the shape of the inflight graph has not been changed. If an edge is added/updated/deleted and the edge's successor is inflight, we set false to unix_graph_grouped, which means we need to re-classify SCC. Once we finalise SCC, we set true to unix_graph_grouped. While unix_graph_grouped is true, we can iterate the grouped SCC using vertex->scc_entry in unix_walk_scc_fast(). list_add() and list_for_each_entry_reverse() uses seem weird, but they are to keep the vertex order consistent and make writing test easier. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 5a1fae78d6dc3..654aa8e30a8bc 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -113,6 +113,7 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) } static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; static void unix_update_graph(struct unix_vertex *vertex) { @@ -123,6 +124,7 @@ static void unix_update_graph(struct unix_vertex *vertex) return; unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; } static LIST_HEAD(unix_unvisited_vertices); @@ -144,6 +146,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) vertex->index = unix_vertex_unvisited_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); list_move_tail(&vertex->entry, &unix_unvisited_vertices); edge->predecessor->vertex = vertex; @@ -418,6 +421,26 @@ static void unix_walk_scc(void) list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); + + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(void) +{ + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) + list_move_tail(&vertex->entry, &unix_visited_vertices); + + list_del(&scc); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } static LIST_HEAD(gc_candidates); @@ -570,7 +593,10 @@ static void __unix_gc(struct work_struct *work) if (!unix_graph_maybe_cyclic) goto skip_gc; - unix_walk_scc(); + if (unix_graph_grouped) + unix_walk_scc_fast(); + else + unix_walk_scc(); /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones From bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:22 -0700 Subject: [PATCH 074/167] af_unix: Assign a unique index to SCC. The definition of the lowlink in Tarjan's algorithm is the smallest index of a vertex that is reachable with at most one back-edge in SCC. This is not useful for a cross-edge. If we start traversing from A in the following graph, the final lowlink of D is 3. The cross-edge here is one between D and C. A -> B -> D D = (4, 3) (index, lowlink) ^ | | C = (3, 1) | V | B = (2, 1) `--- C <--' A = (1, 1) This is because the lowlink of D is updated with the index of C. In the following patch, we detect a dead SCC by checking two conditions for each vertex. 1) vertex has no edge directed to another SCC (no bridge) 2) vertex's out_degree is the same as the refcount of its file If 1) is false, there is a receiver of all fds of the SCC and its ancestor SCC. To evaluate 1), we need to assign a unique index to each SCC and assign it to all vertices in the SCC. This patch changes the lowlink update logic for cross-edge so that in the example above, the lowlink of D is updated with the lowlink of C. A -> B -> D D = (4, 1) (index, lowlink) ^ | | C = (3, 1) | V | B = (2, 1) `--- C <--' A = (1, 1) Then, all vertices in the same SCC have the same lowlink, and we can quickly find the bridge connecting to different SCC if exists. However, it is no longer called lowlink, so we rename it to scc_index. (It's sometimes called lowpoint.) Also, we add a global variable to hold the last index used in DFS so that we do not reset the initial index in each DFS. This patch can be squashed to the SCC detection patch but is split deliberately for anyone wondering why lowlink is not used as used in the original Tarjan's algorithm and many reference implementations. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 2 +- net/unix/garbage.c | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index ec040caaa4b5b..696d997a5ac99 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_vertex { struct list_head scc_entry; unsigned long out_degree; unsigned long index; - unsigned long lowlink; + unsigned long scc_index; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 654aa8e30a8bc..3f59cee3ccbc8 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex) +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) { - unsigned long index = UNIX_VERTEX_INDEX_START; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); @@ -326,9 +325,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex) */ list_add(&vertex->scc_entry, &vertex_stack); - vertex->index = index; - vertex->lowlink = index; - index++; + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { @@ -358,30 +357,30 @@ static void __unix_walk_scc(struct unix_vertex *vertex) next_vertex = vertex; vertex = edge->predecessor->vertex; - /* If the successor has a smaller lowlink, two vertices - * are in the same SCC, so propagate the smaller lowlink + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index * to skip SCC finalisation. */ - vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * - * The successor is on vertex_stack, so two vertices are - * in the same SCC. If the successor has a smaller index, + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, * propagate it to skip SCC finalisation. */ - vertex->lowlink = min(vertex->lowlink, next_vertex->index); + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else { /* The successor was already grouped as another SCC */ } } - if (vertex->index == vertex->lowlink) { + if (vertex->index == vertex->scc_index) { struct list_head scc; /* SCC finalised. * - * If the lowlink was not updated, all the vertices above on + * If the scc_index was not updated, all the vertices above on * vertex_stack are in the same SCC. Group them using scc_entry. */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); @@ -407,6 +406,8 @@ static void __unix_walk_scc(struct unix_vertex *vertex) static void unix_walk_scc(void) { + unsigned long last_index = UNIX_VERTEX_INDEX_START; + unix_graph_maybe_cyclic = false; /* Visit every vertex exactly once. @@ -416,7 +417,7 @@ static void unix_walk_scc(void) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex); + __unix_walk_scc(vertex, &last_index); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); From a15702d8b3aad8ce5268c565bd29f0e02fd2db83 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:23 -0700 Subject: [PATCH 075/167] af_unix: Detect dead SCC. When iterating SCC, we call unix_vertex_dead() for each vertex to check if the vertex is close()d and has no bridge to another SCC. If both conditions are true for every vertex in SCC, we can execute garbage collection for all skb in the SCC. The actual garbage collection is done in the following patch, replacing the old implementation. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-14-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 3f59cee3ccbc8..0a6b38da578cd 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -289,6 +289,39 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static bool unix_vertex_dead(struct unix_vertex *vertex) +{ + struct unix_edge *edge; + struct unix_sock *u; + long total_ref; + + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; + + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } + + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; @@ -377,6 +410,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde if (vertex->index == vertex->scc_index) { struct list_head scc; + bool scc_dead = true; /* SCC finalised. * @@ -391,6 +425,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde /* Mark vertex as off-stack. */ vertex->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } if (!unix_graph_maybe_cyclic) @@ -431,13 +468,18 @@ static void unix_walk_scc_fast(void) while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; + bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); - list_for_each_entry_reverse(vertex, &scc, scc_entry) + list_for_each_entry_reverse(vertex, &scc, scc_entry) { list_move_tail(&vertex->entry, &unix_visited_vertices); + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); + } + list_del(&scc); } From 4090fa373f0e763c43610853d2774b5979915959 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:24 -0700 Subject: [PATCH 076/167] af_unix: Replace garbage collection algorithm. If we find a dead SCC during iteration, we call unix_collect_skb() to splice all skb in the SCC to the global sk_buff_head, hitlist. After iterating all SCC, we unlock unix_gc_lock and purge the queue. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 8 -- net/unix/af_unix.c | 12 -- net/unix/garbage.c | 302 +++++++++--------------------------------- 3 files changed, 64 insertions(+), 258 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 696d997a5ac99..226a8da2cbe35 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; - -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); void unix_update_edges(struct unix_sock *receiver); @@ -85,12 +82,7 @@ struct unix_sock { struct sock *peer; struct sock *listener; struct unix_vertex *vertex; - struct list_head link; - unsigned long inflight; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; struct scm_stat scm_stat; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ae77e2dc0dae4..27ca50ab1cd18 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -980,12 +980,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->listener = NULL; - u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1793,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p) static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1806,9 +1802,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; @@ -1817,15 +1810,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0a6b38da578cd..89ea71d9297ba 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix_vertex *vertex) return true; } +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + skb_queue_splice_init(embryo_queue, hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { + skb_queue_splice_init(queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + } + + spin_unlock(&queue->lock); + } +} + static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; @@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) { LIST_HEAD(vertex_stack); struct unix_edge *edge; @@ -430,7 +477,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde scc_dead = unix_vertex_dead(vertex); } - if (!unix_graph_maybe_cyclic) + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); @@ -441,7 +490,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde goto prev_vertex; } -static void unix_walk_scc(void) +static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; @@ -454,7 +503,7 @@ static void unix_walk_scc(void) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex, &last_index); + __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); @@ -463,7 +512,7 @@ static void unix_walk_scc(void) unix_graph_grouped = true; } -static void unix_walk_scc_fast(void) +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; @@ -480,263 +529,40 @@ static void unix_walk_scc_fast(void) scc_dead = unix_vertex_dead(vertex); } + if (scc_dead) + unix_collect_skb(&scc, hitlist); + list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } -static LIST_HEAD(gc_candidates); -static LIST_HEAD(gc_inflight_list); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; - } - - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - } - - spin_unlock(&unix_gc_lock); -} - -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct unix_sock *u = unix_get_socket(*fp++); - - /* Ignore non-candidates, they could have been added - * to the queues after starting the garbage collection - */ - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } - } - spin_unlock(&x->sk_receive_queue.lock); -} - -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); - - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); - - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); - - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } - } -} - -static void dec_inflight(struct unix_sock *usk) -{ - usk->inflight--; -} - -static void inc_inflight(struct unix_sock *usk) -{ - usk->inflight++; -} - -static void inc_inflight_move_tail(struct unix_sock *u) -{ - u->inflight++; - - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over - */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); -} - static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; - struct unix_sock *u, *next; - LIST_HEAD(not_cycle_list); - struct list_head cursor; spin_lock(&unix_gc_lock); - if (!unix_graph_maybe_cyclic) + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); goto skip_gc; - - if (unix_graph_grouped) - unix_walk_scc_fast(); - else - unix_walk_scc(); - - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - long total_refs; - - total_refs = file_count(u->sk.sk_socket->file); - - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - } - } - - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); - - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. - */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); - - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); - - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); - } } - list_del(&cursor); - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); + __skb_queue_head_init(&hitlist); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif - } - - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. - */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); - } + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); - - spin_lock(&unix_gc_lock); - - /* All candidates should have been detached by now. */ - WARN_ON_ONCE(!list_empty(&gc_candidates)); skip_gc: - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - - spin_unlock(&unix_gc_lock); } static DECLARE_WORK(unix_gc_work, __unix_gc); From 2aa0cff26ed53bc8d4855292b501759435ffdd38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 13:24:25 -0700 Subject: [PATCH 077/167] selftest: af_unix: Test GC for SCM_RIGHTS. This patch adds test cases to verify the new GC. We run each test for the following cases: * SOCK_DGRAM * SOCK_STREAM without embryo socket * SOCK_STREAM without embryo socket + MSG_OOB * SOCK_STREAM with embryo sockets * SOCK_STREAM with embryo sockets + MSG_OOB Before and after running each test case, we ensure that there is no AF_UNIX socket left in the netns by reading /proc/net/protocols. We cannot use /proc/net/unix and UNIX_DIAG because the embryo socket does not show up there. Each test creates multiple sockets in an array. We pass sockets in the even index using the peer sockets in the odd index. So, send_fd(0, 1) actually sends fd[0] to fd[2] via fd[0 + 1]. Test 1 : A <-> A Test 2 : A <-> B Test 3 : A -> B -> C <- D ^.___|___.' ^ `---------' Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-16-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/af_unix/Makefile | 2 +- .../selftests/net/af_unix/scm_rights.c | 286 ++++++++++++++++++ 3 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/af_unix/scm_rights.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 2f9d378edec33..d996a0ab0765e 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -31,6 +31,7 @@ reuseport_dualstack rxtimestamp sctp_hello scm_pidfd +scm_rights sk_bind_sendto_listen sk_connect_zero_addr socket diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index 221c387a7d7f6..3b83c797650d4 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -1,4 +1,4 @@ CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd +TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights include ../../lib.mk diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c new file mode 100644 index 0000000000000..bab606c9f1eb0 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/scm_rights.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright Amazon.com Inc. or its affiliates. */ +#define _GNU_SOURCE +#include + +#include +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +FIXTURE(scm_rights) +{ + int fd[16]; +}; + +FIXTURE_VARIANT(scm_rights) +{ + char name[16]; + int type; + int flags; + bool test_listener; +}; + +FIXTURE_VARIANT_ADD(scm_rights, dgram) +{ + .name = "UNIX ", + .type = SOCK_DGRAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = false, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = 0, + .test_listener = true, +}; + +FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob) +{ + .name = "UNIX-STREAM ", + .type = SOCK_STREAM, + .flags = MSG_OOB, + .test_listener = true, +}; + +static int count_sockets(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(scm_rights) *variant) +{ + int sockets = -1, len, ret; + char *line = NULL; + size_t unused; + FILE *f; + + f = fopen("/proc/net/protocols", "r"); + ASSERT_NE(NULL, f); + + len = strlen(variant->name); + + while (getline(&line, &unused, f) != -1) { + int unused2; + + if (strncmp(line, variant->name, len)) + continue; + + ret = sscanf(line + len, "%d %d", &unused2, &sockets); + ASSERT_EQ(2, ret); + + break; + } + + free(line); + + ret = fclose(f); + ASSERT_EQ(0, ret); + + return sockets; +} + +FIXTURE_SETUP(scm_rights) +{ + int ret; + + ret = unshare(CLONE_NEWNET); + ASSERT_EQ(0, ret); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(scm_rights) +{ + int ret; + + sleep(1); + + ret = count_sockets(_metadata, variant); + ASSERT_EQ(0, ret); +} + +static void create_listeners(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + socklen_t addrlen; + int i, ret; + + for (i = 0; i < n * 2; i += 2) { + self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i]); + + addrlen = sizeof(addr.sun_family); + ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + + ret = listen(self->fd[i], -1); + ASSERT_EQ(0, ret); + + addrlen = sizeof(addr); + ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen); + ASSERT_EQ(0, ret); + + self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_LE(0, self->fd[i + 1]); + + ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen); + ASSERT_EQ(0, ret); + } +} + +static void create_socketpairs(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i += 2) { + ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i); + ASSERT_EQ(0, ret); + } +} + +static void __create_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int n) +{ + if (variant->test_listener) + create_listeners(_metadata, self, n); + else + create_socketpairs(_metadata, self, variant, n); +} + +static void __close_sockets(struct __test_metadata *_metadata, + FIXTURE_DATA(scm_rights) *self, + int n) +{ + int i, ret; + + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); + + for (i = 0; i < n * 2; i++) { + ret = close(self->fd[i]); + ASSERT_EQ(0, ret); + } +} + +void __send_fd(struct __test_metadata *_metadata, + const FIXTURE_DATA(scm_rights) *self, + const FIXTURE_VARIANT(scm_rights) *variant, + int inflight, int receiver) +{ +#define MSG "nop" +#define MSGLEN 3 + struct { + struct cmsghdr cmsghdr; + int fd[2]; + } cmsg = { + .cmsghdr = { + .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }, + .fd = { + self->fd[inflight * 2], + self->fd[inflight * 2], + }, + }; + struct iovec iov = { + .iov_base = MSG, + .iov_len = MSGLEN, + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &cmsg, + .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)), + }; + int ret; + + ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags); + ASSERT_EQ(MSGLEN, ret); +} + +#define create_sockets(n) \ + __create_sockets(_metadata, self, variant, n) +#define close_sockets(n) \ + __close_sockets(_metadata, self, n) +#define send_fd(inflight, receiver) \ + __send_fd(_metadata, self, variant, inflight, receiver) + +TEST_F(scm_rights, self_ref) +{ + create_sockets(2); + + send_fd(0, 0); + + send_fd(1, 1); + + close_sockets(2); +} + +TEST_F(scm_rights, triangle) +{ + create_sockets(6); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + + send_fd(3, 4); + send_fd(4, 5); + send_fd(5, 3); + + close_sockets(6); +} + +TEST_F(scm_rights, cross_edge) +{ + create_sockets(8); + + send_fd(0, 1); + send_fd(1, 2); + send_fd(2, 0); + send_fd(1, 3); + send_fd(3, 2); + + send_fd(4, 5); + send_fd(5, 6); + send_fd(6, 4); + send_fd(5, 7); + send_fd(7, 6); + + close_sockets(8); +} + +TEST_HARNESS_MAIN From 09ba28e1cd3cf715daab1fca6e1623e22fd754a6 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 25 Mar 2024 17:09:29 -0400 Subject: [PATCH 078/167] mlxbf_gige: stop interface during shutdown The mlxbf_gige driver intermittantly encounters a NULL pointer exception while the system is shutting down via "reboot" command. The mlxbf_driver will experience an exception right after executing its shutdown() method. One example of this exception is: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=000000011d373000 [0000000000000070] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] SMP CPU: 0 PID: 13 Comm: ksoftirqd/0 Tainted: G S OE 5.15.0-bf.6.gef6992a #1 Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS 4.0.2.12669 Apr 21 2023 pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] lr : mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] sp : ffff8000080d3c10 x29: ffff8000080d3c10 x28: ffffcce72cbb7000 x27: ffff8000080d3d58 x26: ffff0000814e7340 x25: ffff331cd1a05000 x24: ffffcce72c4ea008 x23: ffff0000814e4b40 x22: ffff0000814e4d10 x21: ffff0000814e4128 x20: 0000000000000000 x19: ffff0000814e4a80 x18: ffffffffffffffff x17: 000000000000001c x16: ffffcce72b4553f4 x15: ffff80008805b8a7 x14: 0000000000000000 x13: 0000000000000030 x12: 0101010101010101 x11: 7f7f7f7f7f7f7f7f x10: c2ac898b17576267 x9 : ffffcce720fa5404 x8 : ffff000080812138 x7 : 0000000000002e9a x6 : 0000000000000080 x5 : ffff00008de3b000 x4 : 0000000000000000 x3 : 0000000000000001 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] __napi_poll+0x40/0x1c8 net_rx_action+0x314/0x3a0 __do_softirq+0x128/0x334 run_ksoftirqd+0x54/0x6c smpboot_thread_fn+0x14c/0x190 kthread+0x10c/0x110 ret_from_fork+0x10/0x20 Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002) ---[ end trace 7cc3941aa0d8e6a4 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Kernel Offset: 0x4ce722520000 from 0xffff800008000000 PHYS_OFFSET: 0x80000000 CPU features: 0x000005c1,a3330e5a Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- During system shutdown, the mlxbf_gige driver's shutdown() is always executed. However, the driver's stop() method will only execute if networking interface configuration logic within the Linux distribution has been setup to do so. If shutdown() executes but stop() does not execute, NAPI remains enabled and this can lead to an exception if NAPI is scheduled while the hardware interface has only been partially deinitialized. The networking interface managed by the mlxbf_gige driver must be properly stopped during system shutdown so that IFF_UP is cleared, the hardware interface is put into a clean state, and NAPI is fully deinitialized. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20240325210929.25362-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 77134ca929382..ba303868686a7 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "mlxbf_gige.h" @@ -492,8 +493,13 @@ static void mlxbf_gige_shutdown(struct platform_device *pdev) { struct mlxbf_gige *priv = platform_get_drvdata(pdev); - writeq(0, priv->base + MLXBF_GIGE_INT_EN); - mlxbf_gige_clean_port(priv); + rtnl_lock(); + netif_device_detach(priv->netdev); + + if (netif_running(priv->netdev)) + dev_close(priv->netdev); + + rtnl_unlock(); } static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { From 625aefac340f45a4fc60908da763f437599a0d6f Mon Sep 17 00:00:00 2001 From: Michael Krummsdorf Date: Tue, 26 Mar 2024 13:36:54 +0100 Subject: [PATCH 079/167] net: dsa: mv88e6xxx: fix usable ports on 88e6020 The switch has 4 ports with 2 internal PHYs, but ports are numbered up to 6, with ports 0, 1, 5 and 6 being usable. Fixes: 71d94a432a15 ("net: dsa: mv88e6xxx: add support for MV88E6020 switch") Signed-off-by: Michael Krummsdorf Signed-off-by: Matthias Schiffer Reviewed-by: Andrew Lunn Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326123655.40666-1-matthias.schiffer@ew.tq-group.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 9ed1821184ece..c95787cb90867 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -5503,8 +5503,12 @@ static const struct mv88e6xxx_info mv88e6xxx_table[] = { .family = MV88E6XXX_FAMILY_6250, .name = "Marvell 88E6020", .num_databases = 64, - .num_ports = 4, + /* Ports 2-4 are not routed to pins + * => usable ports 0, 1, 5, 6 + */ + .num_ports = 7, .num_internal_phys = 2, + .invalid_port_mask = BIT(2) | BIT(3) | BIT(4), .max_vid = 4095, .port_base_addr = 0x8, .phy_base_addr = 0x0, From 62fc3357e079a07a22465b9b6ef71bb6ea75ee4b Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Tue, 26 Mar 2024 16:31:33 +0100 Subject: [PATCH 080/167] net/rds: fix possible cp null dereference cp might be null, calling cp->cp_conn would produce null dereference [Simon Horman adds:] Analysis: * cp is a parameter of __rds_rdma_map and is not reassigned. * The following call-sites pass a NULL cp argument to __rds_rdma_map() - rds_get_mr() - rds_get_mr_for_dest * Prior to the code above, the following assumes that cp may be NULL (which is indicative, but could itself be unnecessary) trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); * The code modified by this patch is guarded by IS_ERR(trans_private), where trans_private is assigned as per the previous point in this analysis. The only implementation of get_mr that I could locate is rds_ib_get_mr() which can return an ERR_PTR if the conn (4th) argument is NULL. * ret is set to PTR_ERR(trans_private). rds_ib_get_mr can return ERR_PTR(-ENODEV) if the conn (4th) argument is NULL. Thus ret may be -ENODEV in which case the code in question will execute. Conclusion: * cp may be NULL at the point where this patch adds a check; this patch does seem to address a possible bug Fixes: c055fc00c07b ("net/rds: fix WARNING in rds_conn_connect_if_down") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mahmoud Adam Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326153132.55580-1-mngyadam@amazon.com Signed-off-by: Jakub Kicinski --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a4e3c5de998be..00dbcd4d28e68 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -302,7 +302,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ - if (ret == -ENODEV) + if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } From 2434ba2bc8518eafdb04852e70f5a2ba2ba1feee Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 26 Mar 2024 17:23:04 +0100 Subject: [PATCH 081/167] dt-bindings: net: airoha,en8811h: Add en8811h Add the Airoha EN8811H 2.5 Gigabit PHY. The en8811h phy can be set with serdes polarity reversed on rx and/or tx. Signed-off-by: Eric Woudstra Reviewed-by: Krzysztof Kozlowski Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326162305.303598-2-ericwouds@gmail.com Signed-off-by: Jakub Kicinski --- .../bindings/net/airoha,en8811h.yaml | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/airoha,en8811h.yaml diff --git a/Documentation/devicetree/bindings/net/airoha,en8811h.yaml b/Documentation/devicetree/bindings/net/airoha,en8811h.yaml new file mode 100644 index 0000000000000..ecb5149ec6b0d --- /dev/null +++ b/Documentation/devicetree/bindings/net/airoha,en8811h.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/airoha,en8811h.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Airoha EN8811H PHY + +maintainers: + - Eric Woudstra + +description: + The Airoha EN8811H PHY has the ability to reverse polarity + on the lines to and/or from the MAC. It is reversed by + the booleans in the devicetree node of the phy. + +allOf: + - $ref: ethernet-phy.yaml# + +properties: + compatible: + enum: + - ethernet-phy-id03a2.a411 + + reg: + maxItems: 1 + + airoha,pnswap-rx: + type: boolean + description: + Reverse rx polarity of the SERDES. This is the receiving + side of the lines from the MAC towards the EN881H. + + airoha,pnswap-tx: + type: boolean + description: + Reverse tx polarity of SERDES. This is the transmitting + side of the lines from EN8811H towards the MAC. + +required: + - reg + +unevaluatedProperties: false + +examples: + - | + mdio { + #address-cells = <1>; + #size-cells = <0>; + + ethernet-phy@1 { + compatible = "ethernet-phy-id03a2.a411"; + reg = <1>; + airoha,pnswap-rx; + }; + }; From 71e79430117d56c409c5ea485a263bc0d8083390 Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 26 Mar 2024 17:23:05 +0100 Subject: [PATCH 082/167] net: phy: air_en8811h: Add the Airoha EN8811H PHY driver Add the driver for the Airoha EN8811H 2.5 Gigabit PHY. The phy supports 100/1000/2500 Mbps with auto negotiation only. The driver uses two firmware files, for which updated versions are added to linux-firmware already. Note: At phy-address + 8 there is another device on the mdio bus, that belongs to the EN881H. While the original driver writes to it, Airoha has confirmed this is not needed. Therefore, communication with this device is not included in this driver. Signed-off-by: Eric Woudstra Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240326162305.303598-3-ericwouds@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Kconfig | 5 + drivers/net/phy/Makefile | 1 + drivers/net/phy/air_en8811h.c | 1086 +++++++++++++++++++++++++++++++++ 3 files changed, 1092 insertions(+) create mode 100644 drivers/net/phy/air_en8811h.c diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 1df0595c5ba9f..7fddc8306d822 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -76,6 +76,11 @@ config SFP comment "MII PHY device drivers" +config AIR_EN8811H_PHY + tristate "Airoha EN8811H 2.5 Gigabit PHY" + help + Currently supports the Airoha EN8811H PHY. + config AMD_PHY tristate "AMD and Altima PHYs" help diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 197acfa0b4126..202ed7f450da6 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -34,6 +34,7 @@ obj-y += $(sfp-obj-y) $(sfp-obj-m) obj-$(CONFIG_ADIN_PHY) += adin.o obj-$(CONFIG_ADIN1100_PHY) += adin1100.o +obj-$(CONFIG_AIR_EN8811H_PHY) += air_en8811h.o obj-$(CONFIG_AMD_PHY) += amd.o obj-$(CONFIG_AQUANTIA_PHY) += aquantia/ ifdef CONFIG_AX88796B_RUST_PHY diff --git a/drivers/net/phy/air_en8811h.c b/drivers/net/phy/air_en8811h.c new file mode 100644 index 0000000000000..720542a4fd82a --- /dev/null +++ b/drivers/net/phy/air_en8811h.c @@ -0,0 +1,1086 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for the Airoha EN8811H 2.5 Gigabit PHY. + * + * Limitations of the EN8811H: + * - Only full duplex supported + * - Forced speed (AN off) is not supported by hardware (100Mbps) + * + * Source originated from airoha's en8811h.c and en8811h.h v1.2.1 + * + * Copyright (C) 2023 Airoha Technology Corp. + */ + +#include +#include +#include +#include +#include + +#define EN8811H_PHY_ID 0x03a2a411 + +#define EN8811H_MD32_DM "airoha/EthMD32.dm.bin" +#define EN8811H_MD32_DSP "airoha/EthMD32.DSP.bin" + +#define AIR_FW_ADDR_DM 0x00000000 +#define AIR_FW_ADDR_DSP 0x00100000 + +/* MII Registers */ +#define AIR_AUX_CTRL_STATUS 0x1d +#define AIR_AUX_CTRL_STATUS_SPEED_MASK GENMASK(4, 2) +#define AIR_AUX_CTRL_STATUS_SPEED_100 0x4 +#define AIR_AUX_CTRL_STATUS_SPEED_1000 0x8 +#define AIR_AUX_CTRL_STATUS_SPEED_2500 0xc + +#define AIR_EXT_PAGE_ACCESS 0x1f +#define AIR_PHY_PAGE_STANDARD 0x0000 +#define AIR_PHY_PAGE_EXTENDED_4 0x0004 + +/* MII Registers Page 4*/ +#define AIR_BPBUS_MODE 0x10 +#define AIR_BPBUS_MODE_ADDR_FIXED 0x0000 +#define AIR_BPBUS_MODE_ADDR_INCR BIT(15) +#define AIR_BPBUS_WR_ADDR_HIGH 0x11 +#define AIR_BPBUS_WR_ADDR_LOW 0x12 +#define AIR_BPBUS_WR_DATA_HIGH 0x13 +#define AIR_BPBUS_WR_DATA_LOW 0x14 +#define AIR_BPBUS_RD_ADDR_HIGH 0x15 +#define AIR_BPBUS_RD_ADDR_LOW 0x16 +#define AIR_BPBUS_RD_DATA_HIGH 0x17 +#define AIR_BPBUS_RD_DATA_LOW 0x18 + +/* Registers on MDIO_MMD_VEND1 */ +#define EN8811H_PHY_FW_STATUS 0x8009 +#define EN8811H_PHY_READY 0x02 + +#define AIR_PHY_MCU_CMD_1 0x800c +#define AIR_PHY_MCU_CMD_1_MODE1 0x0 +#define AIR_PHY_MCU_CMD_2 0x800d +#define AIR_PHY_MCU_CMD_2_MODE1 0x0 +#define AIR_PHY_MCU_CMD_3 0x800e +#define AIR_PHY_MCU_CMD_3_MODE1 0x1101 +#define AIR_PHY_MCU_CMD_3_DOCMD 0x1100 +#define AIR_PHY_MCU_CMD_4 0x800f +#define AIR_PHY_MCU_CMD_4_MODE1 0x0002 +#define AIR_PHY_MCU_CMD_4_INTCLR 0x00e4 + +/* Registers on MDIO_MMD_VEND2 */ +#define AIR_PHY_LED_BCR 0x021 +#define AIR_PHY_LED_BCR_MODE_MASK GENMASK(1, 0) +#define AIR_PHY_LED_BCR_TIME_TEST BIT(2) +#define AIR_PHY_LED_BCR_CLK_EN BIT(3) +#define AIR_PHY_LED_BCR_EXT_CTRL BIT(15) + +#define AIR_PHY_LED_DUR_ON 0x022 + +#define AIR_PHY_LED_DUR_BLINK 0x023 + +#define AIR_PHY_LED_ON(i) (0x024 + ((i) * 2)) +#define AIR_PHY_LED_ON_MASK (GENMASK(6, 0) | BIT(8)) +#define AIR_PHY_LED_ON_LINK1000 BIT(0) +#define AIR_PHY_LED_ON_LINK100 BIT(1) +#define AIR_PHY_LED_ON_LINK10 BIT(2) +#define AIR_PHY_LED_ON_LINKDOWN BIT(3) +#define AIR_PHY_LED_ON_FDX BIT(4) /* Full duplex */ +#define AIR_PHY_LED_ON_HDX BIT(5) /* Half duplex */ +#define AIR_PHY_LED_ON_FORCE_ON BIT(6) +#define AIR_PHY_LED_ON_LINK2500 BIT(8) +#define AIR_PHY_LED_ON_POLARITY BIT(14) +#define AIR_PHY_LED_ON_ENABLE BIT(15) + +#define AIR_PHY_LED_BLINK(i) (0x025 + ((i) * 2)) +#define AIR_PHY_LED_BLINK_1000TX BIT(0) +#define AIR_PHY_LED_BLINK_1000RX BIT(1) +#define AIR_PHY_LED_BLINK_100TX BIT(2) +#define AIR_PHY_LED_BLINK_100RX BIT(3) +#define AIR_PHY_LED_BLINK_10TX BIT(4) +#define AIR_PHY_LED_BLINK_10RX BIT(5) +#define AIR_PHY_LED_BLINK_COLLISION BIT(6) +#define AIR_PHY_LED_BLINK_RX_CRC_ERR BIT(7) +#define AIR_PHY_LED_BLINK_RX_IDLE_ERR BIT(8) +#define AIR_PHY_LED_BLINK_FORCE_BLINK BIT(9) +#define AIR_PHY_LED_BLINK_2500TX BIT(10) +#define AIR_PHY_LED_BLINK_2500RX BIT(11) + +/* Registers on BUCKPBUS */ +#define EN8811H_2P5G_LPA 0x3b30 +#define EN8811H_2P5G_LPA_2P5G BIT(0) + +#define EN8811H_FW_VERSION 0x3b3c + +#define EN8811H_POLARITY 0xca0f8 +#define EN8811H_POLARITY_TX_NORMAL BIT(0) +#define EN8811H_POLARITY_RX_REVERSE BIT(1) + +#define EN8811H_GPIO_OUTPUT 0xcf8b8 +#define EN8811H_GPIO_OUTPUT_345 (BIT(3) | BIT(4) | BIT(5)) + +#define EN8811H_FW_CTRL_1 0x0f0018 +#define EN8811H_FW_CTRL_1_START 0x0 +#define EN8811H_FW_CTRL_1_FINISH 0x1 +#define EN8811H_FW_CTRL_2 0x800000 +#define EN8811H_FW_CTRL_2_LOADING BIT(11) + +/* Led definitions */ +#define EN8811H_LED_COUNT 3 + +/* Default LED setup: + * GPIO5 <-> LED0 On: Link detected, blink Rx/Tx + * GPIO4 <-> LED1 On: Link detected at 2500 or 1000 Mbps + * GPIO3 <-> LED2 On: Link detected at 2500 or 100 Mbps + */ +#define AIR_DEFAULT_TRIGGER_LED0 (BIT(TRIGGER_NETDEV_LINK) | \ + BIT(TRIGGER_NETDEV_RX) | \ + BIT(TRIGGER_NETDEV_TX)) +#define AIR_DEFAULT_TRIGGER_LED1 (BIT(TRIGGER_NETDEV_LINK_2500) | \ + BIT(TRIGGER_NETDEV_LINK_1000)) +#define AIR_DEFAULT_TRIGGER_LED2 (BIT(TRIGGER_NETDEV_LINK_2500) | \ + BIT(TRIGGER_NETDEV_LINK_100)) + +struct led { + unsigned long rules; + unsigned long state; +}; + +struct en8811h_priv { + u32 firmware_version; + bool mcu_needs_restart; + struct led led[EN8811H_LED_COUNT]; +}; + +enum { + AIR_PHY_LED_STATE_FORCE_ON, + AIR_PHY_LED_STATE_FORCE_BLINK, +}; + +enum { + AIR_PHY_LED_DUR_BLINK_32MS, + AIR_PHY_LED_DUR_BLINK_64MS, + AIR_PHY_LED_DUR_BLINK_128MS, + AIR_PHY_LED_DUR_BLINK_256MS, + AIR_PHY_LED_DUR_BLINK_512MS, + AIR_PHY_LED_DUR_BLINK_1024MS, +}; + +enum { + AIR_LED_DISABLE, + AIR_LED_ENABLE, +}; + +enum { + AIR_ACTIVE_LOW, + AIR_ACTIVE_HIGH, +}; + +enum { + AIR_LED_MODE_DISABLE, + AIR_LED_MODE_USER_DEFINE, +}; + +#define AIR_PHY_LED_DUR_UNIT 1024 +#define AIR_PHY_LED_DUR (AIR_PHY_LED_DUR_UNIT << AIR_PHY_LED_DUR_BLINK_64MS) + +static const unsigned long en8811h_led_trig = BIT(TRIGGER_NETDEV_FULL_DUPLEX) | + BIT(TRIGGER_NETDEV_LINK) | + BIT(TRIGGER_NETDEV_LINK_10) | + BIT(TRIGGER_NETDEV_LINK_100) | + BIT(TRIGGER_NETDEV_LINK_1000) | + BIT(TRIGGER_NETDEV_LINK_2500) | + BIT(TRIGGER_NETDEV_RX) | + BIT(TRIGGER_NETDEV_TX); + +static int air_phy_read_page(struct phy_device *phydev) +{ + return __phy_read(phydev, AIR_EXT_PAGE_ACCESS); +} + +static int air_phy_write_page(struct phy_device *phydev, int page) +{ + return __phy_write(phydev, AIR_EXT_PAGE_ACCESS, page); +} + +static int __air_buckpbus_reg_write(struct phy_device *phydev, + u32 pbus_address, u32 pbus_data) +{ + int ret; + + ret = __phy_write(phydev, AIR_BPBUS_MODE, AIR_BPBUS_MODE_ADDR_FIXED); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_HIGH, + upper_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_LOW, + lower_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_HIGH, + upper_16_bits(pbus_data)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_LOW, + lower_16_bits(pbus_data)); + if (ret < 0) + return ret; + + return 0; +} + +static int air_buckpbus_reg_write(struct phy_device *phydev, + u32 pbus_address, u32 pbus_data) +{ + int saved_page; + int ret = 0; + + saved_page = phy_select_page(phydev, AIR_PHY_PAGE_EXTENDED_4); + + if (saved_page >= 0) { + ret = __air_buckpbus_reg_write(phydev, pbus_address, + pbus_data); + if (ret < 0) + phydev_err(phydev, "%s 0x%08x failed: %d\n", __func__, + pbus_address, ret); + } + + return phy_restore_page(phydev, saved_page, ret); +} + +static int __air_buckpbus_reg_read(struct phy_device *phydev, + u32 pbus_address, u32 *pbus_data) +{ + int pbus_data_low, pbus_data_high; + int ret; + + ret = __phy_write(phydev, AIR_BPBUS_MODE, AIR_BPBUS_MODE_ADDR_FIXED); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_RD_ADDR_HIGH, + upper_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_RD_ADDR_LOW, + lower_16_bits(pbus_address)); + if (ret < 0) + return ret; + + pbus_data_high = __phy_read(phydev, AIR_BPBUS_RD_DATA_HIGH); + if (pbus_data_high < 0) + return ret; + + pbus_data_low = __phy_read(phydev, AIR_BPBUS_RD_DATA_LOW); + if (pbus_data_low < 0) + return ret; + + *pbus_data = pbus_data_low | (pbus_data_high << 16); + return 0; +} + +static int air_buckpbus_reg_read(struct phy_device *phydev, + u32 pbus_address, u32 *pbus_data) +{ + int saved_page; + int ret = 0; + + saved_page = phy_select_page(phydev, AIR_PHY_PAGE_EXTENDED_4); + + if (saved_page >= 0) { + ret = __air_buckpbus_reg_read(phydev, pbus_address, pbus_data); + if (ret < 0) + phydev_err(phydev, "%s 0x%08x failed: %d\n", __func__, + pbus_address, ret); + } + + return phy_restore_page(phydev, saved_page, ret); +} + +static int __air_buckpbus_reg_modify(struct phy_device *phydev, + u32 pbus_address, u32 mask, u32 set) +{ + int pbus_data_low, pbus_data_high; + u32 pbus_data_old, pbus_data_new; + int ret; + + ret = __phy_write(phydev, AIR_BPBUS_MODE, AIR_BPBUS_MODE_ADDR_FIXED); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_RD_ADDR_HIGH, + upper_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_RD_ADDR_LOW, + lower_16_bits(pbus_address)); + if (ret < 0) + return ret; + + pbus_data_high = __phy_read(phydev, AIR_BPBUS_RD_DATA_HIGH); + if (pbus_data_high < 0) + return ret; + + pbus_data_low = __phy_read(phydev, AIR_BPBUS_RD_DATA_LOW); + if (pbus_data_low < 0) + return ret; + + pbus_data_old = pbus_data_low | (pbus_data_high << 16); + pbus_data_new = (pbus_data_old & ~mask) | set; + if (pbus_data_new == pbus_data_old) + return 0; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_HIGH, + upper_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_LOW, + lower_16_bits(pbus_address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_HIGH, + upper_16_bits(pbus_data_new)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_LOW, + lower_16_bits(pbus_data_new)); + if (ret < 0) + return ret; + + return 0; +} + +static int air_buckpbus_reg_modify(struct phy_device *phydev, + u32 pbus_address, u32 mask, u32 set) +{ + int saved_page; + int ret = 0; + + saved_page = phy_select_page(phydev, AIR_PHY_PAGE_EXTENDED_4); + + if (saved_page >= 0) { + ret = __air_buckpbus_reg_modify(phydev, pbus_address, mask, + set); + if (ret < 0) + phydev_err(phydev, "%s 0x%08x failed: %d\n", __func__, + pbus_address, ret); + } + + return phy_restore_page(phydev, saved_page, ret); +} + +static int __air_write_buf(struct phy_device *phydev, u32 address, + const struct firmware *fw) +{ + unsigned int offset; + int ret; + u16 val; + + ret = __phy_write(phydev, AIR_BPBUS_MODE, AIR_BPBUS_MODE_ADDR_INCR); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_HIGH, + upper_16_bits(address)); + if (ret < 0) + return ret; + + ret = __phy_write(phydev, AIR_BPBUS_WR_ADDR_LOW, + lower_16_bits(address)); + if (ret < 0) + return ret; + + for (offset = 0; offset < fw->size; offset += 4) { + val = get_unaligned_le16(&fw->data[offset + 2]); + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_HIGH, val); + if (ret < 0) + return ret; + + val = get_unaligned_le16(&fw->data[offset]); + ret = __phy_write(phydev, AIR_BPBUS_WR_DATA_LOW, val); + if (ret < 0) + return ret; + } + + return 0; +} + +static int air_write_buf(struct phy_device *phydev, u32 address, + const struct firmware *fw) +{ + int saved_page; + int ret = 0; + + saved_page = phy_select_page(phydev, AIR_PHY_PAGE_EXTENDED_4); + + if (saved_page >= 0) { + ret = __air_write_buf(phydev, address, fw); + if (ret < 0) + phydev_err(phydev, "%s 0x%08x failed: %d\n", __func__, + address, ret); + } + + return phy_restore_page(phydev, saved_page, ret); +} + +static int en8811h_wait_mcu_ready(struct phy_device *phydev) +{ + int ret, reg_value; + + /* Because of mdio-lock, may have to wait for multiple loads */ + ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, + EN8811H_PHY_FW_STATUS, reg_value, + reg_value == EN8811H_PHY_READY, + 20000, 7500000, true); + if (ret) { + phydev_err(phydev, "MCU not ready: 0x%x\n", reg_value); + return -ENODEV; + } + + return 0; +} + +static int en8811h_load_firmware(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + struct device *dev = &phydev->mdio.dev; + const struct firmware *fw1, *fw2; + int ret; + + ret = request_firmware_direct(&fw1, EN8811H_MD32_DM, dev); + if (ret < 0) + return ret; + + ret = request_firmware_direct(&fw2, EN8811H_MD32_DSP, dev); + if (ret < 0) + goto en8811h_load_firmware_rel1; + + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, + EN8811H_FW_CTRL_1_START); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = air_buckpbus_reg_modify(phydev, EN8811H_FW_CTRL_2, + EN8811H_FW_CTRL_2_LOADING, + EN8811H_FW_CTRL_2_LOADING); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = air_write_buf(phydev, AIR_FW_ADDR_DM, fw1); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = air_write_buf(phydev, AIR_FW_ADDR_DSP, fw2); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = air_buckpbus_reg_modify(phydev, EN8811H_FW_CTRL_2, + EN8811H_FW_CTRL_2_LOADING, 0); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, + EN8811H_FW_CTRL_1_FINISH); + if (ret < 0) + goto en8811h_load_firmware_out; + + ret = en8811h_wait_mcu_ready(phydev); + + air_buckpbus_reg_read(phydev, EN8811H_FW_VERSION, + &priv->firmware_version); + phydev_info(phydev, "MD32 firmware version: %08x\n", + priv->firmware_version); + +en8811h_load_firmware_out: + release_firmware(fw2); + +en8811h_load_firmware_rel1: + release_firmware(fw1); + + if (ret < 0) + phydev_err(phydev, "Load firmware failed: %d\n", ret); + + return ret; +} + +static int en8811h_restart_mcu(struct phy_device *phydev) +{ + int ret; + + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, + EN8811H_FW_CTRL_1_START); + if (ret < 0) + return ret; + + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, + EN8811H_FW_CTRL_1_FINISH); + if (ret < 0) + return ret; + + return en8811h_wait_mcu_ready(phydev); +} + +static int air_hw_led_on_set(struct phy_device *phydev, u8 index, bool on) +{ + struct en8811h_priv *priv = phydev->priv; + bool changed; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + if (on) + changed = !test_and_set_bit(AIR_PHY_LED_STATE_FORCE_ON, + &priv->led[index].state); + else + changed = !!test_and_clear_bit(AIR_PHY_LED_STATE_FORCE_ON, + &priv->led[index].state); + + changed |= (priv->led[index].rules != 0); + + if (changed) + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, + AIR_PHY_LED_ON(index), + AIR_PHY_LED_ON_MASK, + on ? AIR_PHY_LED_ON_FORCE_ON : 0); + + return 0; +} + +static int air_hw_led_blink_set(struct phy_device *phydev, u8 index, + bool blinking) +{ + struct en8811h_priv *priv = phydev->priv; + bool changed; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + if (blinking) + changed = !test_and_set_bit(AIR_PHY_LED_STATE_FORCE_BLINK, + &priv->led[index].state); + else + changed = !!test_and_clear_bit(AIR_PHY_LED_STATE_FORCE_BLINK, + &priv->led[index].state); + + changed |= (priv->led[index].rules != 0); + + if (changed) + return phy_write_mmd(phydev, MDIO_MMD_VEND2, + AIR_PHY_LED_BLINK(index), + blinking ? + AIR_PHY_LED_BLINK_FORCE_BLINK : 0); + else + return 0; +} + +static int air_led_blink_set(struct phy_device *phydev, u8 index, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct en8811h_priv *priv = phydev->priv; + bool blinking = false; + int err; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + if (delay_on && delay_off && (*delay_on > 0) && (*delay_off > 0)) { + blinking = true; + *delay_on = 50; + *delay_off = 50; + } + + err = air_hw_led_blink_set(phydev, index, blinking); + if (err) + return err; + + /* led-blink set, so switch led-on off */ + err = air_hw_led_on_set(phydev, index, false); + if (err) + return err; + + /* hw-control is off*/ + if (!!test_bit(AIR_PHY_LED_STATE_FORCE_BLINK, &priv->led[index].state)) + priv->led[index].rules = 0; + + return 0; +} + +static int air_led_brightness_set(struct phy_device *phydev, u8 index, + enum led_brightness value) +{ + struct en8811h_priv *priv = phydev->priv; + int err; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + /* led-on set, so switch led-blink off */ + err = air_hw_led_blink_set(phydev, index, false); + if (err) + return err; + + err = air_hw_led_on_set(phydev, index, (value != LED_OFF)); + if (err) + return err; + + /* hw-control is off */ + if (!!test_bit(AIR_PHY_LED_STATE_FORCE_ON, &priv->led[index].state)) + priv->led[index].rules = 0; + + return 0; +} + +static int air_led_hw_control_get(struct phy_device *phydev, u8 index, + unsigned long *rules) +{ + struct en8811h_priv *priv = phydev->priv; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + *rules = priv->led[index].rules; + + return 0; +}; + +static int air_led_hw_control_set(struct phy_device *phydev, u8 index, + unsigned long rules) +{ + struct en8811h_priv *priv = phydev->priv; + u16 on = 0, blink = 0; + int ret; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + priv->led[index].rules = rules; + + if (rules & BIT(TRIGGER_NETDEV_FULL_DUPLEX)) + on |= AIR_PHY_LED_ON_FDX; + + if (rules & (BIT(TRIGGER_NETDEV_LINK_10) | BIT(TRIGGER_NETDEV_LINK))) + on |= AIR_PHY_LED_ON_LINK10; + + if (rules & (BIT(TRIGGER_NETDEV_LINK_100) | BIT(TRIGGER_NETDEV_LINK))) + on |= AIR_PHY_LED_ON_LINK100; + + if (rules & (BIT(TRIGGER_NETDEV_LINK_1000) | BIT(TRIGGER_NETDEV_LINK))) + on |= AIR_PHY_LED_ON_LINK1000; + + if (rules & (BIT(TRIGGER_NETDEV_LINK_2500) | BIT(TRIGGER_NETDEV_LINK))) + on |= AIR_PHY_LED_ON_LINK2500; + + if (rules & BIT(TRIGGER_NETDEV_RX)) { + blink |= AIR_PHY_LED_BLINK_10RX | + AIR_PHY_LED_BLINK_100RX | + AIR_PHY_LED_BLINK_1000RX | + AIR_PHY_LED_BLINK_2500RX; + } + + if (rules & BIT(TRIGGER_NETDEV_TX)) { + blink |= AIR_PHY_LED_BLINK_10TX | + AIR_PHY_LED_BLINK_100TX | + AIR_PHY_LED_BLINK_1000TX | + AIR_PHY_LED_BLINK_2500TX; + } + + if (blink || on) { + /* switch hw-control on, so led-on and led-blink are off */ + clear_bit(AIR_PHY_LED_STATE_FORCE_ON, + &priv->led[index].state); + clear_bit(AIR_PHY_LED_STATE_FORCE_BLINK, + &priv->led[index].state); + } else { + priv->led[index].rules = 0; + } + + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_ON(index), + AIR_PHY_LED_ON_MASK, on); + + if (ret < 0) + return ret; + + return phy_write_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_BLINK(index), + blink); +}; + +static int air_led_init(struct phy_device *phydev, u8 index, u8 state, u8 pol) +{ + int val = 0; + int err; + + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + if (state == AIR_LED_ENABLE) + val |= AIR_PHY_LED_ON_ENABLE; + else + val &= ~AIR_PHY_LED_ON_ENABLE; + + if (pol == AIR_ACTIVE_HIGH) + val |= AIR_PHY_LED_ON_POLARITY; + else + val &= ~AIR_PHY_LED_ON_POLARITY; + + err = phy_modify_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_ON(index), + AIR_PHY_LED_ON_ENABLE | + AIR_PHY_LED_ON_POLARITY, val); + + if (err < 0) + return err; + + return 0; +} + +static int air_leds_init(struct phy_device *phydev, int num, int dur, int mode) +{ + struct en8811h_priv *priv = phydev->priv; + int ret, i; + + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_DUR_BLINK, + dur); + if (ret < 0) + return ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_DUR_ON, + dur >> 1); + if (ret < 0) + return ret; + + switch (mode) { + case AIR_LED_MODE_DISABLE: + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_BCR, + AIR_PHY_LED_BCR_EXT_CTRL | + AIR_PHY_LED_BCR_MODE_MASK, 0); + if (ret < 0) + return ret; + break; + case AIR_LED_MODE_USER_DEFINE: + ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2, AIR_PHY_LED_BCR, + AIR_PHY_LED_BCR_EXT_CTRL | + AIR_PHY_LED_BCR_CLK_EN, + AIR_PHY_LED_BCR_EXT_CTRL | + AIR_PHY_LED_BCR_CLK_EN); + if (ret < 0) + return ret; + break; + default: + phydev_err(phydev, "LED mode %d is not supported\n", mode); + return -EINVAL; + } + + for (i = 0; i < num; ++i) { + ret = air_led_init(phydev, i, AIR_LED_ENABLE, AIR_ACTIVE_HIGH); + if (ret < 0) { + phydev_err(phydev, "LED%d init failed: %d\n", i, ret); + return ret; + } + air_led_hw_control_set(phydev, i, priv->led[i].rules); + } + + return 0; +} + +static int en8811h_led_hw_is_supported(struct phy_device *phydev, u8 index, + unsigned long rules) +{ + if (index >= EN8811H_LED_COUNT) + return -EINVAL; + + /* All combinations of the supported triggers are allowed */ + if (rules & ~en8811h_led_trig) + return -EOPNOTSUPP; + + return 0; +}; + +static int en8811h_probe(struct phy_device *phydev) +{ + struct en8811h_priv *priv; + int ret; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(struct en8811h_priv), + GFP_KERNEL); + if (!priv) + return -ENOMEM; + phydev->priv = priv; + + ret = en8811h_load_firmware(phydev); + if (ret < 0) + return ret; + + /* mcu has just restarted after firmware load */ + priv->mcu_needs_restart = false; + + priv->led[0].rules = AIR_DEFAULT_TRIGGER_LED0; + priv->led[1].rules = AIR_DEFAULT_TRIGGER_LED1; + priv->led[2].rules = AIR_DEFAULT_TRIGGER_LED2; + + /* MDIO_DEVS1/2 empty, so set mmds_present bits here */ + phydev->c45_ids.mmds_present |= MDIO_DEVS_PMAPMD | MDIO_DEVS_AN; + + ret = air_leds_init(phydev, EN8811H_LED_COUNT, AIR_PHY_LED_DUR, + AIR_LED_MODE_DISABLE); + if (ret < 0) { + phydev_err(phydev, "Failed to disable leds: %d\n", ret); + return ret; + } + + /* Configure led gpio pins as output */ + ret = air_buckpbus_reg_modify(phydev, EN8811H_GPIO_OUTPUT, + EN8811H_GPIO_OUTPUT_345, + EN8811H_GPIO_OUTPUT_345); + if (ret < 0) + return ret; + + return 0; +} + +static int en8811h_config_init(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + struct device *dev = &phydev->mdio.dev; + u32 pbus_value; + int ret; + + /* If restart happened in .probe(), no need to restart now */ + if (priv->mcu_needs_restart) { + ret = en8811h_restart_mcu(phydev); + if (ret < 0) + return ret; + } else { + /* Next calls to .config_init() mcu needs to restart */ + priv->mcu_needs_restart = true; + } + + /* Select mode 1, the only mode supported. + * Configures the SerDes for 2500Base-X with rate adaptation + */ + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_1, + AIR_PHY_MCU_CMD_1_MODE1); + if (ret < 0) + return ret; + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_2, + AIR_PHY_MCU_CMD_2_MODE1); + if (ret < 0) + return ret; + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_3, + AIR_PHY_MCU_CMD_3_MODE1); + if (ret < 0) + return ret; + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_4, + AIR_PHY_MCU_CMD_4_MODE1); + if (ret < 0) + return ret; + + /* Serdes polarity */ + pbus_value = 0; + if (device_property_read_bool(dev, "airoha,pnswap-rx")) + pbus_value |= EN8811H_POLARITY_RX_REVERSE; + else + pbus_value &= ~EN8811H_POLARITY_RX_REVERSE; + if (device_property_read_bool(dev, "airoha,pnswap-tx")) + pbus_value &= ~EN8811H_POLARITY_TX_NORMAL; + else + pbus_value |= EN8811H_POLARITY_TX_NORMAL; + ret = air_buckpbus_reg_modify(phydev, EN8811H_POLARITY, + EN8811H_POLARITY_RX_REVERSE | + EN8811H_POLARITY_TX_NORMAL, pbus_value); + if (ret < 0) + return ret; + + ret = air_leds_init(phydev, EN8811H_LED_COUNT, AIR_PHY_LED_DUR, + AIR_LED_MODE_USER_DEFINE); + if (ret < 0) { + phydev_err(phydev, "Failed to initialize leds: %d\n", ret); + return ret; + } + + return 0; +} + +static int en8811h_get_features(struct phy_device *phydev) +{ + linkmode_set_bit_array(phy_basic_ports_array, + ARRAY_SIZE(phy_basic_ports_array), + phydev->supported); + + return genphy_c45_pma_read_abilities(phydev); +} + +static int en8811h_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface) +{ + return RATE_MATCH_PAUSE; +} + +static int en8811h_config_aneg(struct phy_device *phydev) +{ + bool changed = false; + int ret; + u32 adv; + + if (phydev->autoneg == AUTONEG_DISABLE) { + phydev_warn(phydev, "Disabling autoneg is not supported\n"); + return -EINVAL; + } + + adv = linkmode_adv_to_mii_10gbt_adv_t(phydev->advertising); + + ret = phy_modify_mmd_changed(phydev, MDIO_MMD_AN, MDIO_AN_10GBT_CTRL, + MDIO_AN_10GBT_CTRL_ADV2_5G, adv); + if (ret < 0) + return ret; + if (ret > 0) + changed = true; + + return __genphy_config_aneg(phydev, changed); +} + +static int en8811h_read_status(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + u32 pbus_value; + int ret, val; + + ret = genphy_update_link(phydev); + if (ret) + return ret; + + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + phydev->pause = 0; + phydev->asym_pause = 0; + phydev->rate_matching = RATE_MATCH_PAUSE; + + ret = genphy_read_master_slave(phydev); + if (ret < 0) + return ret; + + ret = genphy_read_lpa(phydev); + if (ret < 0) + return ret; + + /* Get link partner 2.5GBASE-T ability from vendor register */ + ret = air_buckpbus_reg_read(phydev, EN8811H_2P5G_LPA, &pbus_value); + if (ret < 0) + return ret; + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->lp_advertising, + pbus_value & EN8811H_2P5G_LPA_2P5G); + + if (phydev->autoneg_complete) + phy_resolve_aneg_pause(phydev); + + if (!phydev->link) + return 0; + + /* Get real speed from vendor register */ + val = phy_read(phydev, AIR_AUX_CTRL_STATUS); + if (val < 0) + return val; + switch (val & AIR_AUX_CTRL_STATUS_SPEED_MASK) { + case AIR_AUX_CTRL_STATUS_SPEED_2500: + phydev->speed = SPEED_2500; + break; + case AIR_AUX_CTRL_STATUS_SPEED_1000: + phydev->speed = SPEED_1000; + break; + case AIR_AUX_CTRL_STATUS_SPEED_100: + phydev->speed = SPEED_100; + break; + } + + /* Firmware before version 24011202 has no vendor register 2P5G_LPA. + * Assume link partner advertised it if connected at 2500Mbps. + */ + if (priv->firmware_version < 0x24011202) { + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->lp_advertising, + phydev->speed == SPEED_2500); + } + + /* Only supports full duplex */ + phydev->duplex = DUPLEX_FULL; + + return 0; +} + +static int en8811h_clear_intr(struct phy_device *phydev) +{ + int ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_3, + AIR_PHY_MCU_CMD_3_DOCMD); + if (ret < 0) + return ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_VEND1, AIR_PHY_MCU_CMD_4, + AIR_PHY_MCU_CMD_4_INTCLR); + if (ret < 0) + return ret; + + return 0; +} + +static irqreturn_t en8811h_handle_interrupt(struct phy_device *phydev) +{ + int ret; + + ret = en8811h_clear_intr(phydev); + if (ret < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + phy_trigger_machine(phydev); + + return IRQ_HANDLED; +} + +static struct phy_driver en8811h_driver[] = { +{ + PHY_ID_MATCH_MODEL(EN8811H_PHY_ID), + .name = "Airoha EN8811H", + .probe = en8811h_probe, + .get_features = en8811h_get_features, + .config_init = en8811h_config_init, + .get_rate_matching = en8811h_get_rate_matching, + .config_aneg = en8811h_config_aneg, + .read_status = en8811h_read_status, + .config_intr = en8811h_clear_intr, + .handle_interrupt = en8811h_handle_interrupt, + .led_hw_is_supported = en8811h_led_hw_is_supported, + .read_page = air_phy_read_page, + .write_page = air_phy_write_page, + .led_blink_set = air_led_blink_set, + .led_brightness_set = air_led_brightness_set, + .led_hw_control_set = air_led_hw_control_set, + .led_hw_control_get = air_led_hw_control_get, +} }; + +module_phy_driver(en8811h_driver); + +static struct mdio_device_id __maybe_unused en8811h_tbl[] = { + { PHY_ID_MATCH_MODEL(EN8811H_PHY_ID) }, + { } +}; + +MODULE_DEVICE_TABLE(mdio, en8811h_tbl); +MODULE_FIRMWARE(EN8811H_MD32_DM); +MODULE_FIRMWARE(EN8811H_MD32_DSP); + +MODULE_DESCRIPTION("Airoha EN8811H PHY drivers"); +MODULE_AUTHOR("Airoha"); +MODULE_AUTHOR("Eric Woudstra "); +MODULE_LICENSE("GPL"); From b7b1c59069123368a939327d49009e8771e88d9c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 26 Mar 2024 19:58:36 +0200 Subject: [PATCH 083/167] nfc: st95hf: Switch to using gpiod API This updates the driver to gpiod API, and removes yet another use of of_get_named_gpio(). Signed-off-by: Andy Shevchenko Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240326175836.1418718-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jakub Kicinski --- drivers/nfc/st95hf/core.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index ed704bb772264..067e0ec31d2dc 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -7,14 +7,13 @@ */ #include -#include +#include #include #include #include #include #include #include -#include #include #include #include @@ -196,7 +195,7 @@ struct st95_digital_cmd_complete_arg { * for spi communication between st95hf and host. * @ddev: nfc digital device object. * @nfcdev: nfc device object. - * @enable_gpio: gpio used to enable st95hf transceiver. + * @enable_gpiod: gpio used to enable st95hf transceiver. * @complete_cb_arg: structure to store various context information * that is passed from nfc requesting thread to the threaded ISR. * @st95hf_supply: regulator "consumer" for NFC device. @@ -219,7 +218,7 @@ struct st95hf_context { struct st95hf_spi_context spicontext; struct nfc_digital_dev *ddev; struct nfc_dev *nfcdev; - unsigned int enable_gpio; + struct gpio_desc *enable_gpiod; struct st95_digital_cmd_complete_arg complete_cb_arg; struct regulator *st95hf_supply; unsigned char sendrcv_trflag; @@ -451,19 +450,19 @@ static int st95hf_select_protocol(struct st95hf_context *stcontext, int type) static void st95hf_send_st95enable_negativepulse(struct st95hf_context *st95con) { /* First make irq_in pin high */ - gpio_set_value(st95con->enable_gpio, HIGH); + gpiod_set_value(st95con->enable_gpiod, HIGH); /* wait for 1 milisecond */ usleep_range(1000, 2000); /* Make irq_in pin low */ - gpio_set_value(st95con->enable_gpio, LOW); + gpiod_set_value(st95con->enable_gpiod, LOW); /* wait for minimum interrupt pulse to make st95 active */ usleep_range(1000, 2000); /* At end make it high */ - gpio_set_value(st95con->enable_gpio, HIGH); + gpiod_set_value(st95con->enable_gpiod, HIGH); } /* @@ -1063,6 +1062,7 @@ MODULE_DEVICE_TABLE(of, st95hf_spi_of_match); static int st95hf_probe(struct spi_device *nfc_spi_dev) { + struct device *dev = &nfc_spi_dev->dev; int ret; struct st95hf_context *st95context; @@ -1108,19 +1108,14 @@ static int st95hf_probe(struct spi_device *nfc_spi_dev) */ dev_set_drvdata(&nfc_spi_dev->dev, spicontext); - st95context->enable_gpio = - of_get_named_gpio(nfc_spi_dev->dev.of_node, - "enable-gpio", - 0); - if (!gpio_is_valid(st95context->enable_gpio)) { + st95context->enable_gpiod = devm_gpiod_get(dev, "enable", GPIOD_OUT_HIGH); + if (IS_ERR(st95context->enable_gpiod)) { + ret = PTR_ERR(st95context->enable_gpiod); dev_err(&nfc_spi_dev->dev, "No valid enable gpio\n"); - ret = st95context->enable_gpio; goto err_disable_regulator; } - ret = devm_gpio_request_one(&nfc_spi_dev->dev, st95context->enable_gpio, - GPIOF_DIR_OUT | GPIOF_INIT_HIGH, - "enable_gpio"); + ret = gpiod_set_consumer_name(st95context->enable_gpiod, "enable_gpio"); if (ret) goto err_disable_regulator; From a0ad11fc2632903ec0fe9cd16b07d7dd2ade8341 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Tue, 26 Mar 2024 19:05:46 +0100 Subject: [PATCH 084/167] net: port TP_STORE_ADDR_PORTS_SKB macro to be tcp/udp independent This patch moves TP_STORE_ADDR_PORTS_SKB() to a common header and removes the TCP specific implementation details. Previously the macro assumed the skb passed as an argument is a TCP packet, the implementation now uses an argument to the L4 header and uses that to extract the source/destination ports, which happen to be named the same in "struct tcphdr" and "struct udphdr" Reviewed-by: Jason Xing Signed-off-by: Balazs Scheidler Link: https://lore.kernel.org/r/9e306f78260dfbbdc7353ba5f864cc027a409540.1711475011.git.balazs.scheidler@axoflow.com Signed-off-by: Jakub Kicinski --- include/trace/events/net_probe_common.h | 40 ++++++++++++++++++++++ include/trace/events/tcp.h | 45 ++----------------------- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h index b1f9a4d3ee136..5e33f91bdea37 100644 --- a/include/trace/events/net_probe_common.h +++ b/include/trace/events/net_probe_common.h @@ -70,4 +70,44 @@ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif +#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh) \ + do { \ + struct sockaddr_in *v4 = (void *)__entry->saddr; \ + \ + v4->sin_family = AF_INET; \ + v4->sin_port = protoh->source; \ + v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ + v4 = (void *)__entry->daddr; \ + v4->sin_family = AF_INET; \ + v4->sin_port = protoh->dest; \ + v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) + +#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh) \ + do { \ + const struct iphdr *iph = ip_hdr(skb); \ + \ + if (iph->version == 6) { \ + struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ + \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = protoh->source; \ + v6->sin6_addr = ipv6_hdr(skb)->saddr; \ + v6 = (void *)__entry->daddr; \ + v6->sin6_family = AF_INET6; \ + v6->sin6_port = protoh->dest; \ + v6->sin6_addr = ipv6_hdr(skb)->daddr; \ + } else \ + TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh); \ + } while (0) + +#else + +#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh) \ + TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh) + +#endif + #endif diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3c08a0846c47d..1db95175c1e52 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -273,48 +273,6 @@ TRACE_EVENT(tcp_probe, __entry->skbaddr, __entry->skaddr) ); -#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ - do { \ - const struct tcphdr *th = (const struct tcphdr *)skb->data; \ - struct sockaddr_in *v4 = (void *)__entry->saddr; \ - \ - v4->sin_family = AF_INET; \ - v4->sin_port = th->source; \ - v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ - v4 = (void *)__entry->daddr; \ - v4->sin_family = AF_INET; \ - v4->sin_port = th->dest; \ - v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ - } while (0) - -#if IS_ENABLED(CONFIG_IPV6) - -#define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ - do { \ - const struct iphdr *iph = ip_hdr(skb); \ - \ - if (iph->version == 6) { \ - const struct tcphdr *th = (const struct tcphdr *)skb->data; \ - struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ - \ - v6->sin6_family = AF_INET6; \ - v6->sin6_port = th->source; \ - v6->sin6_addr = ipv6_hdr(skb)->saddr; \ - v6 = (void *)__entry->daddr; \ - v6->sin6_family = AF_INET6; \ - v6->sin6_port = th->dest; \ - v6->sin6_addr = ipv6_hdr(skb)->daddr; \ - } else \ - TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ - } while (0) - -#else - -#define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ - TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) - -#endif - /* * tcp event with only skb */ @@ -331,12 +289,13 @@ DECLARE_EVENT_CLASS(tcp_event_skb, ), TP_fast_assign( + const struct tcphdr *th = (const struct tcphdr *)skb->data; __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - TP_STORE_ADDR_PORTS_SKB(__entry, skb); + TP_STORE_ADDR_PORTS_SKB(__entry, skb, th); ), TP_printk("skbaddr=%p src=%pISpc dest=%pISpc", From e9669a00bba79442dd4862c57761333d6a020c24 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Tue, 26 Mar 2024 19:05:47 +0100 Subject: [PATCH 085/167] net: udp: add IP/port data to the tracepoint udp/udp_fail_queue_rcv_skb The udp_fail_queue_rcv_skb() tracepoint lacks any details on the source and destination IP/port whereas this information can be critical in case of UDP/syslog. Signed-off-by: Balazs Scheidler Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/0c8b3e33dbf679e190be6f4c6736603a76988a20.1711475011.git.balazs.scheidler@axoflow.com Signed-off-by: Jakub Kicinski --- include/trace/events/udp.h | 29 ++++++++++++++++++++++++----- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 3 ++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h index 336fe272889f5..62bebe2a6eceb 100644 --- a/include/trace/events/udp.h +++ b/include/trace/events/udp.h @@ -7,24 +7,43 @@ #include #include +#include TRACE_EVENT(udp_fail_queue_rcv_skb, - TP_PROTO(int rc, struct sock *sk), + TP_PROTO(int rc, struct sock *sk, struct sk_buff *skb), - TP_ARGS(rc, sk), + TP_ARGS(rc, sk, skb), TP_STRUCT__entry( __field(int, rc) - __field(__u16, lport) + + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( + const struct udphdr *uh = (const struct udphdr *)udp_hdr(skb); + __entry->rc = rc; - __entry->lport = inet_sk(sk)->inet_num; + + /* for filtering use */ + __entry->sport = ntohs(uh->source); + __entry->dport = ntohs(uh->dest); + __entry->family = sk->sk_family; + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS_SKB(__entry, skb, uh); ), - TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport) + TP_printk("rc=%d family=%s src=%pISpc dest=%pISpc", __entry->rc, + show_family_name(__entry->family), + __entry->saddr, __entry->daddr) ); #endif /* _TRACE_UDP_H */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 661d0e0d273f6..531882f321f2d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2049,8 +2049,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c1e6469d091d..2e4dc5e6137bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -658,8 +659,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + trace_udp_fail_queue_rcv_skb(rc, sk, skb); kfree_skb_reason(skb, drop_reason); - trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } From b6694abcf5dffd2dac83a7fea067d27a49df0079 Mon Sep 17 00:00:00 2001 From: Radha Mohan Chintakuntla Date: Tue, 26 Mar 2024 11:45:14 -0700 Subject: [PATCH 086/167] octeontx2-af: Increase maximum BPID channels Any NIX interface type can have maximum 256 channels. So increased the backpressure ID count to 256 so that it can cover cn9k and cn10k SoCs that have different NIX interface types with varied maximum channels. Signed-off-by: Radha Mohan Chintakuntla Link: https://lore.kernel.org/r/20240326184514.1628284-1-radhac@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index eb2a20b5a0d0c..3d801a1a4f701 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -1213,10 +1213,8 @@ struct nix_bp_cfg_req { /* bpid_per_chan = 1 assigns separate bp id for each channel */ }; -/* PF can be mapped to either CGX or LBK interface, - * so maximum 64 channels are possible. - */ -#define NIX_MAX_BPID_CHAN 64 +/* Maximum channels any single NIX interface can have */ +#define NIX_MAX_BPID_CHAN 256 struct nix_bp_cfg_rsp { struct mbox_msghdr hdr; u16 chan_bpid[NIX_MAX_BPID_CHAN]; /* Channel and bpid mapping */ From 9046d581ed586f3c715357638ca12c0e84402002 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 23:38:01 +0100 Subject: [PATCH 087/167] enetc: avoid truncating error message As clang points out, the error message in enetc_setup_xdp_prog() still does not fit in the buffer and will be truncated: drivers/net/ethernet/freescale/enetc/enetc.c:2771:3: error: 'snprintf' will always be truncated; specified size is 80, but format string expands to at least 87 [-Werror,-Wformat-truncation] Replace it with an even shorter message that should fit. Fixes: f968c56417f0 ("net: enetc: shorten enetc_setup_xdp_prog() error message to fit NETLINK_MAX_FMTMSG_LEN") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240326223825.4084412-3-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 9f07f4947b631..5c45f42232d32 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2769,7 +2769,7 @@ static int enetc_setup_xdp_prog(struct net_device *ndev, struct bpf_prog *prog, if (priv->min_num_stack_tx_queues + num_xdp_tx_queues > priv->num_tx_rings) { NL_SET_ERR_MSG_FMT_MOD(extack, - "Reserving %d XDP TXQs does not leave a minimum of %d for stack (total %d)", + "Reserving %d XDP TXQs leaves under %d for stack (total %d)", num_xdp_tx_queues, priv->min_num_stack_tx_queues, priv->num_tx_rings); From 954fd908f177604d4cce77e2a88cc50b29bad5ff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 23:38:02 +0100 Subject: [PATCH 088/167] qed: avoid truncating work queue length clang complains that the temporary string for the name passed into alloc_workqueue() is too short for its contents: drivers/net/ethernet/qlogic/qed/qed_main.c:1218:3: error: 'snprintf' will always be truncated; specified size is 16, but format string expands to at least 18 [-Werror,-Wformat-truncation] There is no need for a temporary buffer, and the actual name of a workqueue is 32 bytes (WQ_NAME_LEN), so just use the interface as intended to avoid the truncation. Fixes: 59ccf86fe69a ("qed: Add driver infrastucture for handling mfw requests.") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240326223825.4084412-4-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 408da4312e018..17f284e9f06d3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1205,7 +1205,6 @@ static void qed_slowpath_task(struct work_struct *work) static int qed_slowpath_wq_start(struct qed_dev *cdev) { struct qed_hwfn *hwfn; - char name[NAME_SIZE]; int i; if (IS_VF(cdev)) @@ -1214,11 +1213,11 @@ static int qed_slowpath_wq_start(struct qed_dev *cdev) for_each_hwfn(cdev, i) { hwfn = &cdev->hwfns[i]; - snprintf(name, NAME_SIZE, "slowpath-%02x:%02x.%02x", - cdev->pdev->bus->number, - PCI_SLOT(cdev->pdev->devfn), hwfn->abs_pf_id); + hwfn->slowpath_wq = alloc_workqueue("slowpath-%02x:%02x.%02x", + 0, 0, cdev->pdev->bus->number, + PCI_SLOT(cdev->pdev->devfn), + hwfn->abs_pf_id); - hwfn->slowpath_wq = alloc_workqueue(name, 0, 0); if (!hwfn->slowpath_wq) { DP_NOTICE(hwfn, "Cannot create slowpath workqueue\n"); return -ENOMEM; From b324a960354b872431d25959ad384ab66a7116ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Mar 2024 23:38:03 +0100 Subject: [PATCH 089/167] mlx5: avoid truncating error message clang warns that one error message is too long for its destination buffer: drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c:1876:4: error: 'snprintf' will always be truncated; specified size is 80, but format string expands to at least 94 [-Werror,-Wformat-truncation-non-kprintf] Reword it to be a bit shorter so it always fits. Fixes: 70f0302b3f20 ("net/mlx5: Bridge, implement mdb offload") Signed-off-by: Arnd Bergmann Reviewed-by: Subbaraya Sundeep Link: https://lore.kernel.org/r/20240326223825.4084412-5-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index 1b9bc32efd6fa..c5ea1d1d2b035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -1874,7 +1874,7 @@ int mlx5_esw_bridge_port_mdb_add(struct net_device *dev, u16 vport_num, u16 esw_ "Failed to lookup bridge port vlan metadata to create MDB (MAC=%pM,vid=%u,vport=%u)\n", addr, vid, vport_num); NL_SET_ERR_MSG_FMT_MOD(extack, - "Failed to lookup bridge port vlan metadata to create MDB (MAC=%pM,vid=%u,vport=%u)\n", + "Failed to lookup vlan metadata for MDB (MAC=%pM,vid=%u,vport=%u)\n", addr, vid, vport_num); return -EINVAL; } From 730fffce4fd2eb7a0be2d0b6cd7e55e9194d76d5 Mon Sep 17 00:00:00 2001 From: Jian Wen Date: Wed, 27 Mar 2024 16:21:28 +0800 Subject: [PATCH 090/167] devlink: use kvzalloc() to allocate devlink instance resources During live migration of a virtual machine, the SR-IOV VF need to be re-registered. It may fail when the memory is badly fragmented. The related log is as follows. kernel: hv_netvsc 6045bdaa-c0d1-6045-bdaa-c0d16045bdaa eth0: VF slot 1 added ... kernel: kworker/0:0: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=/,mems_allowed=0 kernel: CPU: 0 PID: 24006 Comm: kworker/0:0 Tainted: G E 5.4...x86_64 #1 kernel: Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 12/07/2018 kernel: Workqueue: events work_for_cpu_fn kernel: Call Trace: kernel: dump_stack+0x8b/0xc8 kernel: warn_alloc+0xff/0x170 kernel: __alloc_pages_slowpath+0x92c/0xb2b kernel: ? get_page_from_freelist+0x1d4/0x1140 kernel: __alloc_pages_nodemask+0x2f9/0x320 kernel: alloc_pages_current+0x6a/0xb0 kernel: kmalloc_order+0x1e/0x70 kernel: kmalloc_order_trace+0x26/0xb0 kernel: ? __switch_to_asm+0x34/0x70 kernel: __kmalloc+0x276/0x280 kernel: ? _raw_spin_unlock_irqrestore+0x1e/0x40 kernel: devlink_alloc+0x29/0x110 kernel: mlx5_devlink_alloc+0x1a/0x20 [mlx5_core] kernel: init_one+0x1d/0x650 [mlx5_core] kernel: local_pci_probe+0x46/0x90 kernel: work_for_cpu_fn+0x1a/0x30 kernel: process_one_work+0x16d/0x390 kernel: worker_thread+0x1d3/0x3f0 kernel: kthread+0x105/0x140 kernel: ? max_active_store+0x80/0x80 kernel: ? kthread_bind+0x20/0x20 kernel: ret_from_fork+0x3a/0x50 Signed-off-by: Jian Wen Link: https://lore.kernel.org/r/20240327082128.942818-1-wenjian1@xiaomi.com Signed-off-by: Jakub Kicinski --- net/devlink/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/devlink/core.c b/net/devlink/core.c index 7f0b093208d75..f49cd83f1955f 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -314,7 +314,7 @@ static void devlink_release(struct work_struct *work) mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); - kfree(devlink); + kvfree(devlink); } void devlink_put(struct devlink *devlink) @@ -420,7 +420,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, if (!devlink_reload_actions_valid(ops)) return NULL; - devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); + devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL); if (!devlink) return NULL; @@ -455,7 +455,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, return devlink; err_xa_alloc: - kfree(devlink); + kvfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); From bd3ce405fecc6f2f9ce09c789386e326346899a0 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Thu, 28 Mar 2024 15:56:36 +0000 Subject: [PATCH 091/167] tools/net/ynl: Add extack policy attribute decoding The NLMSGERR_ATTR_POLICY extack attribute has been ignored by ynl up to now. Extend extack decoding to include _POLICY and the nested NL_POLICY_TYPE_ATTR_* attributes. For example: ./tools/net/ynl/cli.py \ --spec Documentation/netlink/specs/rt_link.yaml \ --create --do newlink --json '{ "ifname": "12345678901234567890", "linkinfo": {"kind": "bridge"} }' Netlink error: Numerical result out of range nl_len = 104 (88) nl_flags = 0x300 nl_type = 2 error: -34 extack: {'msg': 'Attribute failed policy validation', 'policy': {'max-length': 15, 'type': 'string'}, 'bad-attr': '.ifname'} Signed-off-by: Donald Hunter Link: https://lore.kernel.org/r/20240328155636.64688-1-donald.hunter@gmail.com Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/net/ynl/lib/ynl.py b/tools/net/ynl/lib/ynl.py index e73b027c5624f..82d3c98067aa0 100644 --- a/tools/net/ynl/lib/ynl.py +++ b/tools/net/ynl/lib/ynl.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause from collections import namedtuple +from enum import Enum import functools import os import random @@ -76,6 +77,25 @@ class Netlink: NLMSGERR_ATTR_MISS_TYPE = 5 NLMSGERR_ATTR_MISS_NEST = 6 + # Policy types + NL_POLICY_TYPE_ATTR_TYPE = 1 + NL_POLICY_TYPE_ATTR_MIN_VALUE_S = 2 + NL_POLICY_TYPE_ATTR_MAX_VALUE_S = 3 + NL_POLICY_TYPE_ATTR_MIN_VALUE_U = 4 + NL_POLICY_TYPE_ATTR_MAX_VALUE_U = 5 + NL_POLICY_TYPE_ATTR_MIN_LENGTH = 6 + NL_POLICY_TYPE_ATTR_MAX_LENGTH = 7 + NL_POLICY_TYPE_ATTR_POLICY_IDX = 8 + NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE = 9 + NL_POLICY_TYPE_ATTR_BITFIELD32_MASK = 10 + NL_POLICY_TYPE_ATTR_PAD = 11 + NL_POLICY_TYPE_ATTR_MASK = 12 + + AttrType = Enum('AttrType', ['flag', 'u8', 'u16', 'u32', 'u64', + 's8', 's16', 's32', 's64', + 'binary', 'string', 'nul-string', + 'nested', 'nested-array', + 'bitfield32', 'sint', 'uint']) class NlError(Exception): def __init__(self, nl_msg): @@ -198,6 +218,8 @@ def __init__(self, msg, offset, attr_space=None): self.extack['miss-nest'] = extack.as_scalar('u32') elif extack.type == Netlink.NLMSGERR_ATTR_OFFS: self.extack['bad-attr-offs'] = extack.as_scalar('u32') + elif extack.type == Netlink.NLMSGERR_ATTR_POLICY: + self.extack['policy'] = self._decode_policy(extack.raw) else: if 'unknown' not in self.extack: self.extack['unknown'] = [] @@ -214,6 +236,30 @@ def __init__(self, msg, offset, attr_space=None): desc += f" ({spec['doc']})" self.extack['miss-type'] = desc + def _decode_policy(self, raw): + policy = {} + for attr in NlAttrs(raw): + if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE: + type = attr.as_scalar('u32') + policy['type'] = Netlink.AttrType(type).name + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S: + policy['min-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S: + policy['max-value'] = attr.as_scalar('s64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_U: + policy['min-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_U: + policy['max-value'] = attr.as_scalar('u64') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_LENGTH: + policy['min-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_LENGTH: + policy['max-length'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: + policy['bitfield32-mask'] = attr.as_scalar('u32') + elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MASK: + policy['mask'] = attr.as_scalar('u64') + return policy + def cmd(self): return self.nl_type From 5086f0fe46dcf8687c8c2e41e1f07826affebbba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 17:34:48 +0000 Subject: [PATCH 092/167] net: do not consume a cacheline for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no reason to consume a full cacheline to store system_page_pool. We can eventually move it to softnet_data later for full locality control. Fixes: 2b0cfa6e4956 ("net: add generic percpu page_pool allocator") Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Cc: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20240328173448.2262593-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9a67003e49db8..984ff8b9d0e1a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -429,7 +429,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); +static DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* From 648bb2bf444fd557192d6647945d6d8c8f7062e1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:48:07 +0100 Subject: [PATCH 093/167] net: microchip: encx24j600: drop driver owner assignment Core in spi_register_driver() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327174810.519676-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/encx24j600.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index d7c8aa77ec75c..cdc2872ace1b4 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -1112,7 +1112,6 @@ MODULE_DEVICE_TABLE(spi, encx24j600_spi_id_table); static struct spi_driver encx24j600_spi_net_driver = { .driver = { .name = DRV_NAME, - .owner = THIS_MODULE, .bus = &spi_bus_type, }, .probe = encx24j600_spi_probe, From 343941206138ed5b29e4e17834dc80f9f1bc9aaa Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:48:08 +0100 Subject: [PATCH 094/167] net: wwan: mhi: drop driver owner assignment Core in mhi_driver_register() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Acked-by: Sergey Ryazanov Link: https://lore.kernel.org/r/20240327174810.519676-2-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/wwan/mhi_wwan_mbim.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c index 3f72ae943b294..f2aef84fc08d2 100644 --- a/drivers/net/wwan/mhi_wwan_mbim.c +++ b/drivers/net/wwan/mhi_wwan_mbim.c @@ -648,7 +648,6 @@ static struct mhi_driver mhi_mbim_driver = { .id_table = mhi_mbim_id_table, .driver = { .name = "mhi_wwan_mbim", - .owner = THIS_MODULE, }, }; From e3c95d56190c97a214115881947d1a8f97927479 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:48:09 +0100 Subject: [PATCH 095/167] nfc: mrvl: spi: drop driver owner assignment Core in spi_register_driver() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327174810.519676-3-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/nfc/nfcmrvl/spi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nfc/nfcmrvl/spi.c b/drivers/nfc/nfcmrvl/spi.c index ad3359a4942c7..9c8cde1250fb5 100644 --- a/drivers/nfc/nfcmrvl/spi.c +++ b/drivers/nfc/nfcmrvl/spi.c @@ -199,7 +199,6 @@ static struct spi_driver nfcmrvl_spi_driver = { .id_table = nfcmrvl_spi_id_table, .driver = { .name = "nfcmrvl_spi", - .owner = THIS_MODULE, .of_match_table = of_match_ptr(of_nfcmrvl_spi_match), }, }; From e93af72286054e76d71aa5b7d721f33667ce0385 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 27 Mar 2024 18:48:10 +0100 Subject: [PATCH 096/167] nfc: st95hf: drop driver owner assignment Core in spi_register_driver() already sets the .owner, so driver does not need to. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240327174810.519676-4-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/nfc/st95hf/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index 067e0ec31d2dc..ffe5b4eab457f 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -1237,7 +1237,6 @@ static void st95hf_remove(struct spi_device *nfc_spi_dev) static struct spi_driver st95hf_driver = { .driver = { .name = "st95hf", - .owner = THIS_MODULE, .of_match_table = of_match_ptr(st95hf_spi_of_match), }, .id_table = st95hf_id, From e709acbd84fb6ef32736331b0147f027a3ef4c20 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 28 Mar 2024 10:06:21 +0800 Subject: [PATCH 097/167] octeontx2-pf: check negative error code in otx2_open() otx2_rxtx_enable() return negative error code such as -EIO, check -EIO rather than EIO to fix this problem. Fixes: c926252205c4 ("octeontx2-pf: Disable packet I/O for graceful exit") Signed-off-by: Su Hui Reviewed-by: Subbaraya Sundeep Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Link: https://lore.kernel.org/r/20240328020620.4054692-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index b40bd0e467514..3f46d5e0fb2ec 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1933,7 +1933,7 @@ int otx2_open(struct net_device *netdev) * mcam entries are enabled to receive the packets. Hence disable the * packet I/O. */ - if (err == EIO) + if (err == -EIO) goto err_disable_rxtx; else if (err) goto err_tx_stop_queues; From 1ab6fe64d220eaeb42a1130b3c31ea24c84cf5ad Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 28 Mar 2024 10:07:24 +0800 Subject: [PATCH 098/167] octeontx2-pf: remove unused variables req_hdr and rsp_hdr Clang static checker(scan-buid): drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c:503:2: warning: Value stored to 'rsp_hdr' is never read [deadcode.DeadStores] Remove these unused variables to save some space. Signed-off-by: Su Hui Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240328020723.4071539-1-suhui@nfschina.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index b40bd0e467514..5d8359b540985 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -450,7 +450,6 @@ static void otx2_pfvf_mbox_handler(struct work_struct *work) struct mbox_msghdr *msg = NULL; int offset, vf_idx, id, err; struct otx2_mbox_dev *mdev; - struct mbox_hdr *req_hdr; struct otx2_mbox *mbox; struct mbox *vf_mbox; struct otx2_nic *pf; @@ -461,9 +460,8 @@ static void otx2_pfvf_mbox_handler(struct work_struct *work) mbox = &pf->mbox_pfvf[0].mbox; mdev = &mbox->dev[vf_idx]; - req_hdr = (struct mbox_hdr *)(mdev->mbase + mbox->rx_start); - offset = ALIGN(sizeof(*req_hdr), MBOX_MSG_ALIGN); + offset = ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN); for (id = 0; id < vf_mbox->num_msgs; id++) { msg = (struct mbox_msghdr *)(mdev->mbase + mbox->rx_start + @@ -494,7 +492,6 @@ static void otx2_pfvf_mbox_up_handler(struct work_struct *work) struct otx2_nic *pf = vf_mbox->pfvf; struct otx2_mbox_dev *mdev; int offset, id, vf_idx = 0; - struct mbox_hdr *rsp_hdr; struct mbox_msghdr *msg; struct otx2_mbox *mbox; @@ -502,8 +499,7 @@ static void otx2_pfvf_mbox_up_handler(struct work_struct *work) mbox = &pf->mbox_pfvf[0].mbox_up; mdev = &mbox->dev[vf_idx]; - rsp_hdr = (struct mbox_hdr *)(mdev->mbase + mbox->rx_start); - offset = mbox->rx_start + ALIGN(sizeof(*rsp_hdr), MBOX_MSG_ALIGN); + offset = mbox->rx_start + ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN); for (id = 0; id < vf_mbox->up_num_msgs; id++) { msg = mdev->mbase + offset; From 5e864d90b20803edf6bd44a99fb9afa7171785f2 Mon Sep 17 00:00:00 2001 From: Atlas Yu Date: Thu, 28 Mar 2024 13:51:52 +0800 Subject: [PATCH 099/167] r8169: skip DASH fw status checks when DASH is disabled On devices that support DASH, the current code in the "rtl_loop_wait" function raises false alarms when DASH is disabled. This occurs because the function attempts to wait for the DASH firmware to be ready, even though it's not relevant in this case. r8169 0000:0c:00.0 eth0: RTL8168ep/8111ep, 38:7c:76:49:08:d9, XID 502, IRQ 86 r8169 0000:0c:00.0 eth0: jumbo features [frames: 9194 bytes, tx checksumming: ko] r8169 0000:0c:00.0 eth0: DASH disabled ... r8169 0000:0c:00.0 eth0: rtl_ep_ocp_read_cond == 0 (loop: 30, delay: 10000). This patch modifies the driver start/stop functions to skip checking the DASH firmware status when DASH is explicitly disabled. This prevents unnecessary delays and false alarms. The patch has been tested on several ThinkStation P8/PX workstations. Fixes: 0ab0c45d8aae ("r8169: add handling DASH when DASH is disabled") Signed-off-by: Atlas Yu Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20240328055152.18443-1-atlas.yu@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 31 ++++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 5c879a5c86d70..4ac444eb269ff 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1314,17 +1314,40 @@ static void rtl8168ep_stop_cmac(struct rtl8169_private *tp) RTL_W8(tp, IBCR0, RTL_R8(tp, IBCR0) & ~0x01); } +static void rtl_dash_loop_wait(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long usecs, int n, bool high) +{ + if (!tp->dash_enabled) + return; + rtl_loop_wait(tp, c, usecs, n, high); +} + +static void rtl_dash_loop_wait_high(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, true); +} + +static void rtl_dash_loop_wait_low(struct rtl8169_private *tp, + const struct rtl_cond *c, + unsigned long d, int n) +{ + rtl_dash_loop_wait(tp, c, d, n, false); +} + static void rtl8168dp_driver_start(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_START); - rtl_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_start(struct rtl8169_private *tp) { r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); + rtl_dash_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); } static void rtl8168_driver_start(struct rtl8169_private *tp) @@ -1338,7 +1361,7 @@ static void rtl8168_driver_start(struct rtl8169_private *tp) static void rtl8168dp_driver_stop(struct rtl8169_private *tp) { r8168dp_oob_notify(tp, OOB_CMD_DRIVER_STOP); - rtl_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); } static void rtl8168ep_driver_stop(struct rtl8169_private *tp) @@ -1346,7 +1369,7 @@ static void rtl8168ep_driver_stop(struct rtl8169_private *tp) rtl8168ep_stop_cmac(tp); r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_STOP); r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); - rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); + rtl_dash_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); } static void rtl8168_driver_stop(struct rtl8169_private *tp) From 06c2a5cd48fe50f24f8801dd10fcd2b6fd526566 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Thu, 28 Mar 2024 16:37:13 +0530 Subject: [PATCH 100/167] net: axienet: Fix kernel doc warnings Add description of mdio enable, mdio disable and mdio wait functions. Add description of skb pointer in axidma_bd data structure. Remove 'phy_node' description in axienet local data structure since it is not a valid struct member. Correct description of struct axienet_option. Fix below kernel-doc warnings in drivers/net/ethernet/xilinx/: 1) xilinx_axienet_mdio.c:1: warning: no structured comments found 2) xilinx_axienet.h:379: warning: Function parameter or struct member 'skb' not described in 'axidma_bd' 3) xilinx_axienet.h:538: warning: Excess struct member 'phy_node' description in 'axienet_local' 4) xilinx_axienet.h:1002: warning: expecting prototype for struct axiethernet_option. Prototype was for struct axienet_option instead Signed-off-by: Suraj Gupta Reviewed-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/20240328110713.12885-1-suraj.gupta2@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet.h | 4 ++-- .../net/ethernet/xilinx/xilinx_axienet_mdio.c | 23 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h index 807ead6785511..fa5500decc960 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet.h +++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h @@ -359,6 +359,7 @@ * @app2: MM2S/S2MM User Application Field 2. * @app3: MM2S/S2MM User Application Field 3. * @app4: MM2S/S2MM User Application Field 4. + * @skb: Pointer to SKB transferred using DMA */ struct axidma_bd { u32 next; /* Physical address of next buffer descriptor */ @@ -399,7 +400,6 @@ struct skbuf_dma_descriptor { * struct axienet_local - axienet private per device data * @ndev: Pointer for net_device to which it will be attached. * @dev: Pointer to device structure - * @phy_node: Pointer to device node structure * @phylink: Pointer to phylink instance * @phylink_config: phylink configuration settings * @pcs_phy: Reference to PCS/PMA PHY if used @@ -537,7 +537,7 @@ struct axienet_local { }; /** - * struct axiethernet_option - Used to set axi ethernet hardware options + * struct axienet_option - Used to set axi ethernet hardware options * @opt: Option to be set. * @reg: Register offset to be written for setting the option * @m_or: Mask to be ORed for setting the option in the register diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c b/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c index 2f07fde361aaf..9ca2643c921e5 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_mdio.c @@ -20,7 +20,14 @@ #define DEFAULT_MDIO_FREQ 2500000 /* 2.5 MHz */ #define DEFAULT_HOST_CLOCK 150000000 /* 150 MHz */ -/* Wait till MDIO interface is ready to accept a new transaction.*/ +/** + * axienet_mdio_wait_until_ready - MDIO wait function + * @lp: Pointer to axienet local data structure. + * + * Return : 0 on success, Negative value on errors + * + * Wait till MDIO interface is ready to accept a new transaction. + */ static int axienet_mdio_wait_until_ready(struct axienet_local *lp) { u32 val; @@ -30,14 +37,24 @@ static int axienet_mdio_wait_until_ready(struct axienet_local *lp) 1, 20000); } -/* Enable the MDIO MDC. Called prior to a read/write operation */ +/** + * axienet_mdio_mdc_enable - MDIO MDC enable function + * @lp: Pointer to axienet local data structure. + * + * Enable the MDIO MDC. Called prior to a read/write operation + */ static void axienet_mdio_mdc_enable(struct axienet_local *lp) { axienet_iow(lp, XAE_MDIO_MC_OFFSET, ((u32)lp->mii_clk_div | XAE_MDIO_MC_MDIOEN_MASK)); } -/* Disable the MDIO MDC. Called after a read/write operation*/ +/** + * axienet_mdio_mdc_disable - MDIO MDC disable function + * @lp: Pointer to axienet local data structure. + * + * Disable the MDIO MDC. Called after a read/write operation + */ static void axienet_mdio_mdc_disable(struct axienet_local *lp) { u32 mc_reg; From 17af420545a750f763025149fa7b833a4fc8b8f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 11:22:48 +0000 Subject: [PATCH 101/167] erspan: make sure erspan_base_hdr is present in skb->head syzbot reported a problem in ip6erspan_rcv() [1] Issue is that ip6erspan_rcv() (and erspan_rcv()) no longer make sure erspan_base_hdr is present in skb linear part (skb->head) before getting @ver field from it. Add the missing pskb_may_pull() calls. v2: Reload iph pointer in erspan_rcv() after pskb_may_pull() because skb->head might have changed. [1] BUG: KMSAN: uninit-value in pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2756 [inline] BUG: KMSAN: uninit-value in ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] BUG: KMSAN: uninit-value in gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] pskb_may_pull include/linux/skbuff.h:2756 [inline] ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 ip6_protocol_deliver_rcu+0x1d4c/0x2ca0 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 ip6_mc_input+0xa7e/0xc80 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:460 [inline] ip6_rcv_finish+0x955/0x970 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:314 [inline] ipv6_rcv+0xde/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5538 [inline] __netif_receive_skb+0x1da/0xa00 net/core/dev.c:5652 netif_receive_skb_internal net/core/dev.c:5738 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5798 tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1549 tun_get_user+0x5566/0x69e0 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 tun_alloc_skb drivers/net/tun.c:1525 [inline] tun_get_user+0x209a/0x69e0 drivers/net/tun.c:1846 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2108 [inline] new_sync_write fs/read_write.c:497 [inline] vfs_write+0xb63/0x1520 fs/read_write.c:590 ksys_write+0x20f/0x4c0 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x93/0xe0 fs/read_write.c:652 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 1 PID: 5045 Comm: syz-executor114 Not tainted 6.9.0-rc1-syzkaller-00021-g962490525cff #0 Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") Reported-by: syzbot+1c1cf138518bf0c53d68@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000772f2c0614b66ef7@google.com/ Signed-off-by: Eric Dumazet Cc: Lorenzo Bianconi Link: https://lore.kernel.org/r/20240328112248.1101491-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 5 +++++ net/ipv6/ip6_gre.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7b16c211b9044..57ddcd8c62f67 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -280,8 +280,13 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, tpi->flags | TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); } else { + if (unlikely(!pskb_may_pull(skb, + gre_hdr_len + sizeof(*ershdr)))) + return PACKET_REJECT; + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); ver = ershdr->ver; + iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, iph->saddr, iph->daddr, tpi->key); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ca7e77e842835..c89aef524df9a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -528,6 +528,9 @@ static int ip6erspan_rcv(struct sk_buff *skb, struct ip6_tnl *tunnel; u8 ver; + if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) + return PACKET_REJECT; + ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; From a5535e5336943b33689f558199366102387b7bbf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Mar 2024 15:30:46 +0100 Subject: [PATCH 102/167] mlx5: stop warning for 64KB pages When building with 64KB pages, clang points out that xsk->chunk_size can never be PAGE_SIZE: drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c:19:22: error: result of comparison of constant 65536 with expression of type 'u16' (aka 'unsigned short') is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (xsk->chunk_size > PAGE_SIZE || ~~~~~~~~~~~~~~~ ^ ~~~~~~~~~ In older versions of this code, using PAGE_SIZE was the only possibility, so this would have never worked on 64KB page kernels, but the patch apparently did not address this case completely. As Maxim Mikityanskiy suggested, 64KB chunks are really not all that useful, so just shut up the warning by adding a cast. Fixes: 282c0c798f8e ("net/mlx5e: Allow XSK frames smaller than a page") Link: https://lore.kernel.org/netdev/20211013150232.2942146-1-arnd@kernel.org/ Link: https://lore.kernel.org/lkml/a7b27541-0ebb-4f2d-bd06-270a4d404613@app.fastmail.com/ Signed-off-by: Arnd Bergmann Acked-by: Maxim Mikityanskiy Reviewed-by: Justin Stitt Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20240328143051.1069575-9-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 06592b9f04242..9240cfe25d102 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -28,8 +28,10 @@ bool mlx5e_validate_xsk_param(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, struct mlx5_core_dev *mdev) { - /* AF_XDP doesn't support frames larger than PAGE_SIZE. */ - if (xsk->chunk_size > PAGE_SIZE || xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE) { + /* AF_XDP doesn't support frames larger than PAGE_SIZE, + * and xsk->chunk_size is limited to 65535 bytes. + */ + if ((size_t)xsk->chunk_size > PAGE_SIZE || xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE) { mlx5_core_err(mdev, "XSK chunk size %u out of bounds [%u, %lu]\n", xsk->chunk_size, MLX5E_MIN_XSK_CHUNK_SIZE, PAGE_SIZE); return false; From ea111449501ea32bf6da82750de860243691efc7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:44 -0700 Subject: [PATCH 103/167] tcp: Fix bind() regression for v6-only wildcard and v4-mapped-v6 non-wildcard addresses. Commit 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") introduced bind() regression for v4-mapped-v6 address. When we bind() the following two addresses on the same port, the 2nd bind() should succeed but fails now. 1. [::] w/ IPV6_ONLY 2. ::ffff:127.0.0.1 After the chagne, v4-mapped-v6 uses bhash2 instead of bhash to detect conflict faster, but I forgot to add a necessary change. During the 2nd bind(), inet_bind2_bucket_match_addr_any() returns the tb2 bucket of [::], and inet_bhash2_conflict() finally calls inet_bind_conflict(), which returns true, meaning conflict. inet_bhash2_addr_any_conflict |- inet_bind2_bucket_match_addr_any <-- return [::] bucket `- inet_bhash2_conflict `- __inet_bhash2_conflict <-- checks IPV6_ONLY for AF_INET | but not for v4-mapped-v6 address `- inet_bind_conflict <-- does not check address inet_bind_conflict() does not check socket addresses because __inet_bhash2_conflict() is expected to do so. However, it checks IPV6_V6ONLY attribute only against AF_INET socket, and not for v4-mapped-v6 address. As a result, v4-mapped-v6 address conflicts with v6-only wildcard address. To avoid that, let's add the missing test to use bhash2 for v4-mapped-v6 address. Fixes: 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index c038e28e2f1e6..4184d45f890c8 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -203,8 +203,15 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) - return false; + if (ipv6_only_sock(sk2)) { + if (sk->sk_family == AF_INET) + return false; + +#if IS_ENABLED(CONFIG_IPV6) + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return false; +#endif + } return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); From d91ef1e1b55f730bee8ce286b02b7bdccbc42973 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:45 -0700 Subject: [PATCH 104/167] tcp: Fix bind() regression for v6-only wildcard and v4(-mapped-v6) non-wildcard addresses. Jianguo Wu reported another bind() regression introduced by bhash2. Calling bind() for the following 3 addresses on the same port, the 3rd one should fail but now succeeds. 1. 0.0.0.0 or ::ffff:0.0.0.0 2. [::] w/ IPV6_V6ONLY 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first two bind() create tb2 like this: bhash2 -> tb2(:: w/ IPV6_V6ONLY) -> tb2(0.0.0.0) The 3rd bind() will match with the IPv6 only wildcard address bucket in inet_bind2_bucket_match_addr_any(), however, no conflicting socket exists in the bucket. So, inet_bhash2_conflict() will returns false, and thus, inet_bhash2_addr_any_conflict() returns false consequently. As a result, the 3rd bind() bypasses conflict check, which should be done against the IPv4 wildcard address bucket. So, in inet_bhash2_addr_any_conflict(), we must iterate over all buckets. Note that we cannot add ipv6_only flag for inet_bind2_bucket as it would confuse the following patetrn. 1. [::] w/ SO_REUSE{ADDR,PORT} and IPV6_V6ONLY 2. [::] w/ SO_REUSE{ADDR,PORT} 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address The first bind() would create a bucket with ipv6_only flag true, the second bind() would add the [::] socket into the same bucket, and the third bind() could succeed based on the wrong assumption that ipv6_only bucket would not conflict with v4(-mapped-v6) address. Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Diagnosed-by: Jianguo Wu Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4184d45f890c8..3b38610958ee4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -294,6 +294,7 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + bool conflict = false; bool reuseport_cb_ok; rcu_read_lock(); @@ -306,18 +307,20 @@ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l spin_lock(&head2->lock); - inet_bind_bucket_for_each(tb2, &head2->chain) - if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) - break; + inet_bind_bucket_for_each(tb2, &head2->chain) { + if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) + continue; - if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok)) { - spin_unlock(&head2->lock); - return true; + if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + conflict = true; + break; } spin_unlock(&head2->lock); - return false; + + return conflict; } /* From c48baf567dedbba731d66f5a2cd46f1b6def50aa Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:46 -0700 Subject: [PATCH 105/167] selftest: tcp: Make bind() selftest flexible. Currently, bind_wildcard.c tests only (IPv4, IPv6) pairs, but we will add more tests for the same protocol pairs. This patch makes it possible by changing the address pointer to void. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 92 +++++++++++++-------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index a2662348cdb1a..d65c3bb6ba138 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -6,7 +6,9 @@ #include "../kselftest_harness.h" -struct in6_addr in6addr_v4mapped_any = { +static const __u32 in4addr_any = INADDR_ANY; +static const __u32 in4addr_loopback = INADDR_LOOPBACK; +static const struct in6_addr in6addr_v4mapped_any = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -14,8 +16,7 @@ struct in6_addr in6addr_v4mapped_any = { 0, 0, 0, 0 } }; - -struct in6_addr in6addr_v4mapped_loopback = { +static const struct in6_addr in6addr_v4mapped_loopback = { .s6_addr = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,82 +27,105 @@ struct in6_addr in6addr_v4mapped_loopback = { FIXTURE(bind_wildcard) { - struct sockaddr_in addr4; - struct sockaddr_in6 addr6; + socklen_t addrlen[2]; + union { + struct sockaddr addr; + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + } addr[2]; }; FIXTURE_VARIANT(bind_wildcard) { - const __u32 addr4_const; - const struct in6_addr *addr6_const; + sa_family_t family[2]; + const void *addr[2]; int expected_errno; }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { - .addr4_const = INADDR_ANY, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_loopback}, .expected_errno = 0, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_any, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, .expected_errno = EADDRINUSE, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { - .addr4_const = INADDR_LOOPBACK, - .addr6_const = &in6addr_v4mapped_loopback, + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, .expected_errno = EADDRINUSE, }; -FIXTURE_SETUP(bind_wildcard) +static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, + int family, const void *addr_const) { - self->addr4.sin_family = AF_INET; - self->addr4.sin_port = htons(0); - self->addr4.sin_addr.s_addr = htonl(variant->addr4_const); + if (family == AF_INET) { + struct sockaddr_in *addr4 = &self->addr[i].addr4; + const __u32 *addr4_const = addr_const; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(0); + addr4->sin_addr.s_addr = htonl(*addr4_const); + + self->addrlen[i] = sizeof(struct sockaddr_in); + } else { + struct sockaddr_in6 *addr6 = &self->addr[i].addr6; + const struct in6_addr *addr6_const = addr_const; + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(0); + addr6->sin6_addr = *addr6_const; - self->addr6.sin6_family = AF_INET6; - self->addr6.sin6_port = htons(0); - self->addr6.sin6_addr = *variant->addr6_const; + self->addrlen[i] = sizeof(struct sockaddr_in6); + } +} + +FIXTURE_SETUP(bind_wildcard) +{ + setup_addr(self, 0, variant->family[0], variant->addr[0]); + setup_addr(self, 1, variant->family[1], variant->addr[1]); } FIXTURE_TEARDOWN(bind_wildcard) @@ -146,15 +170,15 @@ void bind_sockets(struct __test_metadata *_metadata, TEST_F(bind_wildcard, v4_v6) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr4, sizeof(self->addr4), - (struct sockaddr *)&self->addr6, sizeof(self->addr6)); + &self->addr[0].addr, self->addrlen[0], + &self->addr[1].addr, self->addrlen[1]); } TEST_F(bind_wildcard, v6_v4) { bind_sockets(_metadata, self, variant->expected_errno, - (struct sockaddr *)&self->addr6, sizeof(self->addr6), - (struct sockaddr *)&self->addr4, sizeof(self->addr4)); + &self->addr[1].addr, self->addrlen[1], + &self->addr[0].addr, self->addrlen[0]); } TEST_HARNESS_MAIN From 6f9bc755c0215501c45897aa5c8b8b56fb65724e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:47 -0700 Subject: [PATCH 106/167] selftest: tcp: Define the reverse order bind() tests explicitly. Currently, bind_wildcard.c calls bind() twice for two addresses and checks the pre-defined errno against the 2nd call. Also, the two bind() calls are swapped to cover various patterns how bind buckets are created. However, only testing two addresses is insufficient to detect regression. So, we will add more bind() calls, and then, we need to define different errno for each bind() per test case. As a prepartion, let's define the reverse order bind() test cases as fixtures. No functional changes are intended. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 67 ++++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index d65c3bb6ba138..143aae383da39 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, @@ -98,6 +99,63 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_loopback, &in4addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { @@ -167,18 +225,11 @@ void bind_sockets(struct __test_metadata *_metadata, close(fd[0]); } -TEST_F(bind_wildcard, v4_v6) +TEST_F(bind_wildcard, plain) { bind_sockets(_metadata, self, variant->expected_errno, &self->addr[0].addr, self->addrlen[0], &self->addr[1].addr, self->addrlen[1]); } -TEST_F(bind_wildcard, v6_v4) -{ - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[1].addr, self->addrlen[1], - &self->addr[0].addr, self->addrlen[0]); -} - TEST_HARNESS_MAIN From 5e9e9afdb50449f35d3e65dd6b1cdf87e8ce185e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:48 -0700 Subject: [PATCH 107/167] selftest: tcp: Add v4-v4 and v6-v6 bind() conflict tests. We don't have bind() conflict tests for the same protocol pairs. Let's add them except for the same address pair, which will be covered by the following patch adding 6 more bind() calls for each test case. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 100 ++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 143aae383da39..5100dd713a494 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,21 @@ FIXTURE_VARIANT(bind_wildcard) int expected_errno; }; +/* (IPv4, IPv4) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_any, &in4addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) +{ + .family = {AF_INET, AF_INET}, + .addr = {&in4addr_loopback, &in4addr_any}, + .expected_errno = EADDRINUSE, +}; + /* (IPv4, IPv6) */ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { @@ -156,6 +171,91 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) .expected_errno = EADDRINUSE, }; +/* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .expected_errno = EADDRINUSE, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, + .expected_errno = 0, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, + .expected_errno = EADDRINUSE, +}; + static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, int family, const void *addr_const) { From f40742c22a6e9ffb53bf02f22ea5eda55fbcfcc5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:49 -0700 Subject: [PATCH 108/167] selftest: tcp: Add more bind() calls. In addtition to the two addresses defined in the fixtures, this patch add 6 more bind calls(): * 0.0.0.0 * 127.0.0.1 * :: * ::1 * ::ffff:0.0.0.0 * ::ffff:127.0.0.1 The first two per-fixture bind() calls control how inet_bind2_bucket is created, and the rest 6 bind() calls cover as many conflicting patterns as possible. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 225 +++++++++++++++----- 1 file changed, 166 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 5100dd713a494..4ecd3835f33ca 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -25,21 +25,34 @@ static const struct in6_addr in6addr_v4mapped_loopback = { } }; +#define NR_SOCKETS 8 + FIXTURE(bind_wildcard) { - socklen_t addrlen[2]; + int fd[NR_SOCKETS]; + socklen_t addrlen[NR_SOCKETS]; union { struct sockaddr addr; struct sockaddr_in addr4; struct sockaddr_in6 addr6; - } addr[2]; + } addr[NR_SOCKETS]; }; FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; - int expected_errno; + + /* 6 bind() calls below follow two bind() for the defined 2 addresses: + * + * 0.0.0.0 + * 127.0.0.1 + * :: + * ::1 + * ::ffff:0.0.0.0 + * ::ffff:127.0.0.1 + */ + int expected_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -47,14 +60,20 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) { .family = {AF_INET, AF_INET}, .addr = {&in4addr_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -62,56 +81,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) { .family = {AF_INET, AF_INET6}, .addr = {&in4addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -119,56 +162,80 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_loopback, &in4addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_any, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) { .family = {AF_INET6, AF_INET}, .addr = {&in6addr_v4mapped_loopback, &in4addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ @@ -176,84 +243,120 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_any}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_loopback, &in6addr_v4mapped_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_any, &in6addr_v4mapped_loopback}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_loopback}, - .expected_errno = 0, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, .addr = {&in6addr_v4mapped_loopback, &in6addr_v4mapped_any}, - .expected_errno = EADDRINUSE, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -284,52 +387,56 @@ FIXTURE_SETUP(bind_wildcard) { setup_addr(self, 0, variant->family[0], variant->addr[0]); setup_addr(self, 1, variant->family[1], variant->addr[1]); + + setup_addr(self, 2, AF_INET, &in4addr_any); + setup_addr(self, 3, AF_INET, &in4addr_loopback); + + setup_addr(self, 4, AF_INET6, &in6addr_any); + setup_addr(self, 5, AF_INET6, &in6addr_loopback); + setup_addr(self, 6, AF_INET6, &in6addr_v4mapped_any); + setup_addr(self, 7, AF_INET6, &in6addr_v4mapped_loopback); } FIXTURE_TEARDOWN(bind_wildcard) { + int i; + + for (i = 0; i < NR_SOCKETS; i++) + close(self->fd[i]); } -void bind_sockets(struct __test_metadata *_metadata, - FIXTURE_DATA(bind_wildcard) *self, - int expected_errno, - struct sockaddr *addr1, socklen_t addrlen1, - struct sockaddr *addr2, socklen_t addrlen2) +void bind_socket(struct __test_metadata *_metadata, + FIXTURE_DATA(bind_wildcard) *self, + const FIXTURE_VARIANT(bind_wildcard) *variant, + int i) { - int fd[2]; int ret; - fd[0] = socket(addr1->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[0], 0); - - ret = bind(fd[0], addr1, addrlen1); - ASSERT_EQ(ret, 0); + self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); + ASSERT_GT(self->fd[i], 0); - ret = getsockname(fd[0], addr1, &addrlen1); - ASSERT_EQ(ret, 0); + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; - ((struct sockaddr_in *)addr2)->sin_port = ((struct sockaddr_in *)addr1)->sin_port; - - fd[1] = socket(addr2->sa_family, SOCK_STREAM, 0); - ASSERT_GT(fd[1], 0); - - ret = bind(fd[1], addr2, addrlen2); - if (expected_errno) { + ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); + if (variant->expected_errno[i]) { ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, expected_errno); + ASSERT_EQ(errno, variant->expected_errno[i]); } else { ASSERT_EQ(ret, 0); } - close(fd[1]); - close(fd[0]); + if (i == 0) { + ret = getsockname(self->fd[0], &self->addr[0].addr, &self->addrlen[0]); + ASSERT_EQ(ret, 0); + } } TEST_F(bind_wildcard, plain) { - bind_sockets(_metadata, self, variant->expected_errno, - &self->addr[0].addr, self->addrlen[0], - &self->addr[1].addr, self->addrlen[1]); + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i); } TEST_HARNESS_MAIN From d37f2f72c91f2c5b61db7e6685c8b4bfdff85cb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:50 -0700 Subject: [PATCH 109/167] selftest: tcp: Add bind() tests for IPV6_V6ONLY. bhash2 was not well tested for IPv6-only sockets. This patch adds test cases where we set IPV6_V6ONLY for per-fixture bind() calls if variant->ipv6_only[i] is true. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 116 ++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 4ecd3835f33ca..6d7f02441b9d7 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -42,6 +42,7 @@ FIXTURE_VARIANT(bind_wildcard) { sa_family_t family[2]; const void *addr[2]; + bool ipv6_only[2]; /* 6 bind() calls below follow two bind() for the defined 2 addresses: * @@ -87,6 +88,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) { .family = {AF_INET, AF_INET6}, @@ -127,6 +139,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) +{ + .family = {AF_INET, AF_INET6}, + .addr = {&in4addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) { .family = {AF_INET, AF_INET6}, @@ -168,6 +191,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) { .family = {AF_INET6, AF_INET}, @@ -178,6 +212,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) +{ + .family = {AF_INET6, AF_INET}, + .addr = {&in6addr_any, &in4addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) { .family = {AF_INET6, AF_INET}, @@ -249,6 +294,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -259,6 +315,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) { .family = {AF_INET6, AF_INET6}, @@ -269,6 +336,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_v4mapped_loopback}, + .ipv6_only = {true, false}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) { .family = {AF_INET6, AF_INET6}, @@ -279,6 +357,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) { .family = {AF_INET6, AF_INET6}, @@ -309,6 +398,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -339,6 +439,17 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE}, }; +FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_v4mapped_loopback, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -415,6 +526,11 @@ void bind_socket(struct __test_metadata *_metadata, self->fd[i] = socket(self->addr[i].addr.sa_family, SOCK_STREAM, 0); ASSERT_GT(self->fd[i], 0); + if (i < 2 && variant->ipv6_only[i]) { + ret = setsockopt(self->fd[i], SOL_IPV6, IPV6_V6ONLY, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); From 7679f0968d01878b8da80c5078eebe23231a19e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 26 Mar 2024 13:42:51 -0700 Subject: [PATCH 110/167] selftest: tcp: Add bind() tests for SO_REUSEADDR/SO_REUSEPORT. This patch adds two tests using SO_REUSEADDR and SO_REUSEPORT and defines errno for each test case. SO_REUSEADDR/SO_REUSEPORT is set for the per-fixture two bind() calls. The notable pattern is the pair of v6only [::] and plain [::]. The two sockets are put into the same tb2, where per-bucket v6only flag would be useless to detect bind() conflict. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240326204251.51301-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bind_wildcard.c | 263 +++++++++++++++++++- 1 file changed, 257 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/bind_wildcard.c b/tools/testing/selftests/net/bind_wildcard.c index 6d7f02441b9d7..b7b54d646b937 100644 --- a/tools/testing/selftests/net/bind_wildcard.c +++ b/tools/testing/selftests/net/bind_wildcard.c @@ -54,6 +54,7 @@ FIXTURE_VARIANT(bind_wildcard) * ::ffff:127.0.0.1 */ int expected_errno[NR_SOCKETS]; + int expected_reuse_errno[NR_SOCKETS]; }; /* (IPv4, IPv4) */ @@ -65,6 +66,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) @@ -75,6 +80,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv4, IPv6) */ @@ -86,6 +95,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) @@ -97,6 +110,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) @@ -107,6 +124,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) @@ -117,6 +138,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) @@ -127,6 +152,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) @@ -137,6 +166,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) @@ -148,6 +181,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) @@ -158,6 +195,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) @@ -168,6 +209,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) @@ -178,6 +223,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v4_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv4) */ @@ -189,6 +238,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) @@ -200,6 +253,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) @@ -210,6 +267,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) @@ -221,6 +282,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) @@ -231,6 +296,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) @@ -241,6 +310,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) @@ -251,6 +324,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) @@ -261,6 +338,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) @@ -271,6 +352,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) @@ -281,9 +366,72 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_local_v4_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; /* (IPv6, IPv6) */ +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, false}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {false, true}, + .expected_errno = {0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + +FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_any_only) +{ + .family = {AF_INET6, AF_INET6}, + .addr = {&in6addr_any, &in6addr_any}, + .ipv6_only = {true, true}, + .expected_errno = {0, EADDRINUSE, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, +}; + FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) { .family = {AF_INET6, AF_INET6}, @@ -292,6 +440,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) @@ -303,6 +455,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_local) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) @@ -313,6 +469,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) @@ -324,6 +484,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) @@ -334,6 +498,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) @@ -345,6 +513,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_any_only_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) @@ -355,6 +527,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) @@ -366,6 +542,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_any_only) 0, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + 0, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) @@ -376,6 +556,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) @@ -386,6 +570,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_local_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) @@ -396,6 +584,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) @@ -407,6 +599,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) @@ -417,6 +613,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) @@ -427,6 +627,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_any_v6_v4mapped_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) @@ -437,6 +641,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) @@ -448,6 +656,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_any_only) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) @@ -458,6 +670,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_local) EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, EADDRINUSE}, }; FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) @@ -468,6 +684,10 @@ FIXTURE_VARIANT_ADD(bind_wildcard, v6_v4mapped_loopback_v6_v4mapped_any) EADDRINUSE, EADDRINUSE, EADDRINUSE, 0, EADDRINUSE, EADDRINUSE}, + .expected_reuse_errno = {0, 0, + EADDRINUSE, EADDRINUSE, + EADDRINUSE, 0, + EADDRINUSE, EADDRINUSE}, }; static void setup_addr(FIXTURE_DATA(bind_wildcard) *self, int i, @@ -519,7 +739,7 @@ FIXTURE_TEARDOWN(bind_wildcard) void bind_socket(struct __test_metadata *_metadata, FIXTURE_DATA(bind_wildcard) *self, const FIXTURE_VARIANT(bind_wildcard) *variant, - int i) + int i, int reuse) { int ret; @@ -531,14 +751,29 @@ void bind_socket(struct __test_metadata *_metadata, ASSERT_EQ(ret, 0); } + if (i < 2 && reuse) { + ret = setsockopt(self->fd[i], SOL_SOCKET, reuse, &(int){1}, sizeof(int)); + ASSERT_EQ(ret, 0); + } + self->addr[i].addr4.sin_port = self->addr[0].addr4.sin_port; ret = bind(self->fd[i], &self->addr[i].addr, self->addrlen[i]); - if (variant->expected_errno[i]) { - ASSERT_EQ(ret, -1); - ASSERT_EQ(errno, variant->expected_errno[i]); + + if (reuse) { + if (variant->expected_reuse_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_reuse_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } else { - ASSERT_EQ(ret, 0); + if (variant->expected_errno[i]) { + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, variant->expected_errno[i]); + } else { + ASSERT_EQ(ret, 0); + } } if (i == 0) { @@ -552,7 +787,23 @@ TEST_F(bind_wildcard, plain) int i; for (i = 0; i < NR_SOCKETS; i++) - bind_socket(_metadata, self, variant, i); + bind_socket(_metadata, self, variant, i, 0); +} + +TEST_F(bind_wildcard, reuseaddr) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEADDR); +} + +TEST_F(bind_wildcard, reuseport) +{ + int i; + + for (i = 0; i < NR_SOCKETS; i++) + bind_socket(_metadata, self, variant, i, SO_REUSEPORT); } TEST_HARNESS_MAIN From 60557969951304dad829f2829019907dfb43ecb3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:29 +0000 Subject: [PATCH 111/167] udp: annotate data-race in __udp_enqueue_schedule_skb() sk->sk_rcvbuf is read locklessly twice, while other threads could change its value. Use a READ_ONCE() to annotate the race. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 531882f321f2d..6a39e7fa06167 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1492,13 +1492,14 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; - int size; + int size, rcvbuf; - /* try to avoid the costly atomic add/sub pair when the receive - * queue is full; always allow at least a packet + /* Immediately drop when the receive queue is full. + * Always allow at least one packet. */ rmem = atomic_read(&sk->sk_rmem_alloc); - if (rmem > sk->sk_rcvbuf) + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + if (rmem > rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() @@ -1507,7 +1508,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ - if (rmem > (sk->sk_rcvbuf >> 1)) { + if (rmem > (rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); From 6a1f12dd85a8b24f871dfcf467378660af9c064d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:30 +0000 Subject: [PATCH 112/167] udp: relax atomic operation on sk->sk_rmem_alloc atomic_add_return() is more expensive than atomic_add() and seems overkill in UDP rx fast path. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6a39e7fa06167..19d7db4563acd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1516,12 +1516,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) size = skb->truesize; udp_set_dev_scratch(skb); - /* we drop only if the receive buf is full and the receive - * queue contains some other skb - */ - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) - goto uncharge_drop; + atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); From 612b1c0dec5bc7367f90fc508448b8d0d7c05414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:31 +0000 Subject: [PATCH 113/167] udp: avoid calling sock_def_readable() if possible sock_def_readable() is quite expensive (particularly when ep_poll_callback() is in the picture). We must call sk->sk_data_ready() when : - receive queue was empty, or - SO_PEEK_OFF is enabled on the socket, or - sk->sk_data_ready is not sock_def_readable. We still need to call sk_wake_async(). Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240328144032.1864988-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 19d7db4563acd..143043cd2dcbd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1492,6 +1492,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; + bool becomes_readable; int size, rcvbuf; /* Immediately drop when the receive queue is full. @@ -1532,12 +1533,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ sock_skb_set_dropcount(sk, skb); + becomes_readable = skb_queue_empty(list); __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); - + if (!sock_flag(sk, SOCK_DEAD)) { + if (becomes_readable || + sk->sk_data_ready != sock_def_readable || + READ_ONCE(sk->sk_peek_off) >= 0) + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } busylock_release(busy); return 0; From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:32 +0000 Subject: [PATCH 114/167] net: add sk_wake_async_rcu() helper While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- crypto/af_alg.c | 4 ++-- include/net/sock.h | 6 ++++++ net/atm/common.c | 2 +- net/core/sock.c | 8 ++++---- net/dccp/output.c | 2 +- net/ipv4/udp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/socket.c | 2 +- net/smc/smc_rx.c | 4 ++-- net/unix/af_unix.c | 2 +- 11 files changed, 21 insertions(+), 15 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 68cc9290cabe9..5bc6d0fa7498d 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); @@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/include/net/sock.h b/include/net/sock.h index f57bfd8a2ad2d..2253eefe28488 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at diff --git a/net/atm/common.c b/net/atm/common.c index 2a1ec014e901d..9b75699992ff9 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index 0963689a59506..5ed411231fc7b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLERR); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); rcu_read_unlock(); } @@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } @@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk) EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } } diff --git a/net/dccp/output.c b/net/dccp/output.c index fd2eb148d24de..5c2e24f3c39b7 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk) wake_up_interruptible(&wq->wait); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 143043cd2dcbd..11460d751e731 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1544,7 +1544,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); } busylock_release(busy); return 0; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7c8c3adcac6e9..c951bb9cc2e04 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk) wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5222bc97d192e..f4844683e1203 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c67679a41044f..e416b6d3d2705 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d161d..f0cbe77a80b44 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 27ca50ab1cd18..533fb682c9547 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } From e8058a49e67fe7bc7e4a0308851a3ca3a6d2e45d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 Mar 2024 20:31:45 +0100 Subject: [PATCH 115/167] netlink: introduce type-checking attribute iteration There are, especially with multi-attr arrays, many cases of needing to iterate all attributes of a specific type in a netlink message or a nested attribute. Add specific macros to support that case. Also convert many instances using this spatch: @@ iterator nla_for_each_attr; iterator name nla_for_each_attr_type; identifier nla; expression head, len, rem; expression ATTR; type T; identifier x; @@ -nla_for_each_attr(nla, head, len, rem) +nla_for_each_attr_type(nla, ATTR, head, len, rem) { <... T x; ...> -if (nla_type(nla) == ATTR) { ... -} } @@ identifier nla; iterator nla_for_each_nested; iterator name nla_for_each_nested_type; expression attr, rem; expression ATTR; type T; identifier x; @@ -nla_for_each_nested(nla, attr, rem) +nla_for_each_nested_type(nla, ATTR, attr, rem) { <... T x; ...> -if (nla_type(nla) == ATTR) { ... -} } @@ iterator nla_for_each_attr; iterator name nla_for_each_attr_type; identifier nla; expression head, len, rem; expression ATTR; type T; identifier x; @@ -nla_for_each_attr(nla, head, len, rem) +nla_for_each_attr_type(nla, ATTR, head, len, rem) { <... T x; ...> -if (nla_type(nla) != ATTR) continue; ... } @@ identifier nla; iterator nla_for_each_nested; iterator name nla_for_each_nested_type; expression attr, rem; expression ATTR; type T; identifier x; @@ -nla_for_each_nested(nla, attr, rem) +nla_for_each_nested_type(nla, ATTR, attr, rem) { <... T x; ...> -if (nla_type(nla) != ATTR) continue; ... } Although I had to undo one bad change this made, and I also adjusted some other code for whitespace and to use direct variable initialization now. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20240328203144.b5a6c895fb80.I1869b44767379f204998ff44dd239803f39c23e0@changeid Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +--- drivers/net/ethernet/emulex/benet/be_main.c | 5 +--- drivers/net/ethernet/intel/i40e/i40e_main.c | 8 ++---- drivers/net/ethernet/intel/ice/ice_main.c | 7 ++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 11 +++----- .../net/ethernet/mellanox/mlx5/core/en_main.c | 5 +--- .../ethernet/netronome/nfp/nfp_net_common.c | 5 +--- include/net/netlink.h | 27 +++++++++++++++++++ net/8021q/vlan_netlink.c | 10 +++---- net/core/bpf_sk_storage.c | 23 +++++++--------- net/core/rtnetlink.c | 15 +++++------ net/devlink/dev.c | 12 +++------ net/sched/sch_mqprio.c | 6 ++--- net/sched/sch_taprio.c | 5 +--- 14 files changed, 65 insertions(+), 79 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 388e80bf91f5e..b4db4b1aaffbf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -14581,12 +14581,9 @@ static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { u16 mode; - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - mode = nla_get_u16(attr); if (mode == bp->br_mode) break; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index ad862ed7888ac..a8596ebcdfd60 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4982,10 +4982,7 @@ static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { mode = nla_get_u16(attr); if (BE3_chip(adapter) && mode == BRIDGE_MODE_VEPA) return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f86578857e8ae..e427e65af205b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13108,13 +13108,9 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - __u16 mode; + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { + __u16 mode = nla_get_u16(attr); - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - - mode = nla_get_u16(attr); if ((mode != BRIDGE_MODE_VEPA) && (mode != BRIDGE_MODE_VEB)) return -EINVAL; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f2b7d6ca8805e..618570f235804 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7993,12 +7993,9 @@ ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - __u16 mode; + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { + __u16 mode = nla_get_u16(attr); - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - mode = nla_get_u16(attr); if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB) return -EINVAL; /* Continue if bridge mode is not being flipped */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f985252c8c8d6..ed05af665466d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10061,15 +10061,10 @@ static int ixgbe_ndo_bridge_setlink(struct net_device *dev, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - int status; - __u16 mode; + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { + __u16 mode = nla_get_u16(attr); + int status = ixgbe_configure_bridge_mode(adapter, mode); - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - - mode = nla_get_u16(attr); - status = ixgbe_configure_bridge_mode(adapter, mode); if (status) return status; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 91848eae45655..81e1c1e401f9f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4950,10 +4950,7 @@ static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { mode = nla_get_u16(attr); if (mode > BRIDGE_MODE_VEPA) return -EINVAL; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index f28e769e6fdad..997cc4fcffdbf 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2289,10 +2289,7 @@ static int nfp_net_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, if (!br_spec) return -EINVAL; - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) != IFLA_BRIDGE_MODE) - continue; - + nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) { new_ctrl = nn->dp.ctrl; mode = nla_get_u16(attr); if (mode == BRIDGE_MODE_VEPA) diff --git a/include/net/netlink.h b/include/net/netlink.h index c19ff921b661a..1d2bbcc50212d 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -157,7 +157,11 @@ * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes + * nla_for_each_attr_type() loop over all attributes with the + * given type * nla_for_each_nested() loop over the nested attributes + * nla_for_each_nested_type() loop over the nested attributes with + * the given type *========================================================================= */ @@ -2070,6 +2074,18 @@ static inline int nla_total_size_64bit(int payload) nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) +/** + * nla_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @head: head of attribute stream + * @len: length of attribute stream + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_attr_type(pos, type, head, len, rem) \ + nla_for_each_attr(pos, head, len, rem) \ + if (nla_type(pos) == type) + /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute @@ -2079,6 +2095,17 @@ static inline int nla_total_size_64bit(int payload) #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) +/** + * nla_for_each_nested_type - iterate over nested attributes + * @pos: loop counter, set to current attribute + * @type: required attribute type for @pos + * @nla: attribute containing the nested attributes + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nla_for_each_nested_type(pos, type, nla, rem) \ + nla_for_each_nested(pos, nla, rem) \ + if (nla_type(pos) == type) + /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index a3b68243fd4b1..cf5219df7903c 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -117,17 +117,15 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { - nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { - if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) - continue; + nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, + data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6c4d90b24d467..bc01b3aa6b0fa 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) if (!bpf_capable()) return ERR_PTR(-EPERM); - nla_for_each_nested(nla, nla_stgs, rem) { - if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) { - if (nla_len(nla) != sizeof(u32)) - return ERR_PTR(-EINVAL); - nr_maps++; - } + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + if (nla_len(nla) != sizeof(u32)) + return ERR_PTR(-EINVAL); + nr_maps++; } diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL); if (!diag) return ERR_PTR(-ENOMEM); - nla_for_each_nested(nla, nla_stgs, rem) { - struct bpf_map *map; - int map_fd; - - if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) - continue; + nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + nla_stgs, rem) { + int map_fd = nla_get_u32(nla); + struct bpf_map *map = bpf_map_get(map_fd); - map_fd = nla_get_u32(nla); - map = bpf_map_get(map_fd); if (IS_ERR(map)) { err = PTR_ERR(map); goto err_free; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a3d7847ce69d3..283e42f48af68 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5245,15 +5245,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { - nla_for_each_nested(attr, br_spec, rem) { - if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { - if (nla_len(attr) < sizeof(flags)) - return -EINVAL; + nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec, + rem) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; - have_flags = true; - flags = nla_get_u16(attr); - break; - } + have_flags = true; + flags = nla_get_u16(attr); + break; } } diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 19dbf540748ab..c609deb42e889 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -1202,17 +1202,13 @@ static void __devlink_compat_running_version(struct devlink *devlink, if (err) goto free_msg; - nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { + nla_for_each_attr_type(nlattr, DEVLINK_ATTR_INFO_VERSION_RUNNING, + (void *)msg->data, msg->len, rem) { const struct nlattr *kv; int rem_kv; - if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) - continue; - - nla_for_each_nested(kv, nlattr, rem_kv) { - if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) - continue; - + nla_for_each_nested_type(kv, DEVLINK_ATTR_INFO_VERSION_VALUE, + nlattr, rem_kv) { strlcat(buf, nla_data(kv), len); strlcat(buf, " ", len); } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 225353fbb3f15..51d4013b61219 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -215,10 +215,8 @@ static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt, for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) fp[tc] = priv->fp[tc]; - nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) { - if (nla_type(n) != TCA_MQPRIO_TC_ENTRY) - continue; - + nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt, + nlattr_opt_len, rem) { err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack); if (err) goto out; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a0d54b422186f..1ab17e8a72605 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1752,10 +1752,7 @@ static int taprio_parse_tc_entries(struct Qdisc *sch, fp[tc] = q->fp[tc]; } - nla_for_each_nested(n, opt, rem) { - if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) - continue; - + nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) From 929107d3d2a3a363302c4cece3c73742863ef94b Mon Sep 17 00:00:00 2001 From: Christophe Roullier Date: Thu, 28 Mar 2024 19:53:37 +0100 Subject: [PATCH 116/167] dt-bindings: net: dwmac: Document STM32 property st,ext-phyclk The Linux kernel dwmac-stm32 driver currently supports three DT properties used to configure whether PHY clock are generated by the MAC or supplied to the MAC from the PHY. Originally there were two properties, st,eth-clk-sel and st,eth-ref-clk-sel, each used to configure MAC clocking in different bus mode and for different MAC clock frequency. Since it is possible to determine the MAC 'eth-ck' clock frequency from the clock subsystem and PHY bus mode from the 'phy-mode' property, two disparate DT properties are no longer required to configure MAC clocking. Linux kernel commit 1bb694e20839 ("net: ethernet: stmmac: simplify phy modes management for stm32") introduced a third, unified, property st,ext-phyclk. This property covers both use cases of st,eth-clk-sel and st,eth-ref-clk-sel DT properties, as well as a new use case for 25 MHz clock generated by the MAC. The third property st,ext-phyclk is so far undocumented, document it. Below table summarizes the clock requirement and clock sources for supported PHY interface modes. __________________________________________________________________________ |PHY_MODE | Normal | PHY wo crystal| PHY wo crystal |No 125Mhz from PHY| | | | 25MHz | 50MHz | | --------------------------------------------------------------------------- | MII | - | eth-ck | n/a | n/a | | | | st,ext-phyclk | | | --------------------------------------------------------------------------- | GMII | - | eth-ck | n/a | n/a | | | | st,ext-phyclk | | | --------------------------------------------------------------------------- | RGMII | - | eth-ck | n/a | eth-ck | | | | st,ext-phyclk | | st,eth-clk-sel or| | | | | | st,ext-phyclk | --------------------------------------------------------------------------- | RMII | - | eth-ck | eth-ck | n/a | | | | st,ext-phyclk | st,eth-ref-clk-sel | | | | | | or st,ext-phyclk | | --------------------------------------------------------------------------- Reviewed-by: Krzysztof Kozlowski Reviewed-by: Marek Vasut Signed-off-by: Christophe Roullier Link: https://lore.kernel.org/r/20240328185337.332703-2-christophe.roullier@foss.st.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/stm32-dwmac.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml index fc8c96b08d7dc..857d58949b029 100644 --- a/Documentation/devicetree/bindings/net/stm32-dwmac.yaml +++ b/Documentation/devicetree/bindings/net/stm32-dwmac.yaml @@ -82,6 +82,13 @@ properties: Should be phandle/offset pair. The phandle to the syscon node which encompases the glue register, and the offset of the control register + st,ext-phyclk: + description: + set this property in RMII mode when you have PHY without crystal 50MHz and want to + select RCC clock instead of ETH_REF_CLK. OR in RGMII mode when you want to select + RCC clock instead of ETH_CLK125. + type: boolean + st,eth-clk-sel: description: set this property in RGMII PHY when you want to select RCC clock instead of ETH_CLK125. From 20b73732d6fd9d9ba882a5b87fdc463222c2612c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 Mar 2024 08:27:49 +0100 Subject: [PATCH 117/167] rtnetlink: add guard for RTNL The new guard/scoped_gard can be useful for the RTNL as well, so add a guard definition for it. It gets used like { guard(rtnl)(); // RTNL held until end of block } or scoped_guard(rtnl) { // RTNL held in this block } as with any other guard/scoped_guard. Signed-off-by: Johannes Berg Signed-off-by: NipaLocal --- include/linux/rtnetlink.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index cdfc897f1e3c6..a7da7dfc06a2a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -7,6 +7,7 @@ #include #include #include +#include #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); @@ -46,6 +47,8 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); +DEFINE_LOCK_GUARD_0(rtnl, rtnl_lock(), rtnl_unlock()) + extern wait_queue_head_t netdev_unregistering_wq; extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; From 5593205f8d49360cf93d640c7f2ac3926d94fe9f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 28 Mar 2024 08:27:50 +0100 Subject: [PATCH 118/167] netdevice: add DEFINE_FREE() for dev_put For short netdev holds within a function there are still a lot of users of dev_put() rather than netdev_put(). Add DEFINE_FREE() to allow making those safer. Signed-off-by: Johannes Berg Signed-off-by: NipaLocal --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e41d30ebaca61..533293435e7bb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4128,6 +4128,8 @@ static inline void dev_put(struct net_device *dev) netdev_put(dev, NULL); } +DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T)) + static inline void netdev_ref_replace(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, From 7532bf8e5d6aba6066713b267161d34e373094d7 Mon Sep 17 00:00:00 2001 From: Catalin Popescu Date: Thu, 28 Mar 2024 14:33:58 +0100 Subject: [PATCH 119/167] net: phy: dp8382x: keep WOL settings across suspends Unlike other ethernet PHYs from TI, PHY dp8382x has WOL enabled at reset. The driver explicitly disables WOL in config_init callback which is called during init and during resume from suspend. Hence, WOL is unconditionally disabled during resume, even if it was enabled before the suspend. We make sure that WOL configuration is persistent across suspends. Signed-off-by: Catalin Popescu Signed-off-by: NipaLocal --- drivers/net/phy/dp83822.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index c3426a17e6d06..683ebb13bd4f5 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -140,10 +140,11 @@ struct dp83822_private { u16 fx_sd_enable; u8 cfg_dac_minus; u8 cfg_dac_plus; + struct ethtool_wolinfo wol; }; -static int dp83822_set_wol(struct phy_device *phydev, - struct ethtool_wolinfo *wol) +static int _dp83822_set_wol(struct phy_device *phydev, + struct ethtool_wolinfo *wol) { struct net_device *ndev = phydev->attached_dev; u16 value; @@ -197,10 +198,25 @@ static int dp83822_set_wol(struct phy_device *phydev, MII_DP83822_WOL_CFG, value); } else { return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, - MII_DP83822_WOL_CFG, DP83822_WOL_EN); + MII_DP83822_WOL_CFG, + DP83822_WOL_EN | + DP83822_WOL_MAGIC_EN | + DP83822_WOL_SECURE_ON); } } +static int dp83822_set_wol(struct phy_device *phydev, + struct ethtool_wolinfo *wol) +{ + struct dp83822_private *dp83822 = phydev->priv; + int ret; + + ret = _dp83822_set_wol(phydev, wol); + if (!ret) + memcpy(&dp83822->wol, wol, sizeof(*wol)); + return ret; +} + static void dp83822_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) { @@ -346,13 +362,6 @@ static irqreturn_t dp83822_handle_interrupt(struct phy_device *phydev) return IRQ_HANDLED; } -static int dp8382x_disable_wol(struct phy_device *phydev) -{ - return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, - DP83822_WOL_EN | DP83822_WOL_MAGIC_EN | - DP83822_WOL_SECURE_ON); -} - static int dp83822_read_status(struct phy_device *phydev) { struct dp83822_private *dp83822 = phydev->priv; @@ -496,7 +505,7 @@ static int dp83822_config_init(struct phy_device *phydev) return err; } } - return dp8382x_disable_wol(phydev); + return _dp83822_set_wol(phydev, &dp83822->wol); } static int dp83826_config_rmii_mode(struct phy_device *phydev) @@ -575,12 +584,14 @@ static int dp83826_config_init(struct phy_device *phydev) return ret; } - return dp8382x_disable_wol(phydev); + return _dp83822_set_wol(phydev, &dp83822->wol); } static int dp8382x_config_init(struct phy_device *phydev) { - return dp8382x_disable_wol(phydev); + struct dp83822_private *dp83822 = phydev->priv; + + return _dp83822_set_wol(phydev, &dp83822->wol); } static int dp83822_phy_reset(struct phy_device *phydev) From f590bf666b7ab5af82b80bd5e33c7835aca70d88 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:40:02 +0100 Subject: [PATCH 120/167] networking: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) * Remove sentinel element from ctl_table structs. * Remove extra element in ctl_table arrays declarations * Remove instances where an array element is zeroed out to make it look like a sentinel. This is not longer needed and is safe after commit c899710fe7f9 ("networking: Update to register_net_sysctl_sz") added the array size to the ctl_table registration * Replace the for loop stop condition that tests for procname == NULL with one that depends on array size * Removed the "-1" that adjusted for having an extra empty element when looping over ctl_table arrays * Removing the unprivileged user check in ipv6_route_sysctl_init is safe as it is replaced by calling ipv6_route_sysctl_table_size; introduced in commit c899710fe7f9 ("networking: Update to register_net_sysctl_sz") * Replace empty array registration with the register_net_sysctl_sz call. Signed-off-by: Joel Granados Signed-off-by: NipaLocal --- net/core/neighbour.c | 5 +---- net/core/sysctl_net_core.c | 9 ++++----- net/dccp/sysctl.c | 2 -- net/ieee802154/6lowpan/reassembly.c | 6 +----- net/ipv4/devinet.c | 5 ++--- net/ipv4/ip_fragment.c | 2 -- net/ipv4/route.c | 8 ++------ net/ipv4/sysctl_net_ipv4.c | 7 +++---- net/ipv4/xfrm4_policy.c | 1 - net/ipv6/addrconf.c | 5 +---- net/ipv6/icmp.c | 1 - net/ipv6/reassembly.c | 2 -- net/ipv6/route.c | 5 ----- net/ipv6/sysctl_net_ipv6.c | 4 +--- net/ipv6/xfrm6_policy.c | 1 - net/llc/sysctl_net_llc.c | 8 ++------ net/mpls/af_mpls.c | 3 +-- net/mptcp/ctrl.c | 1 - net/netrom/sysctl_net_netrom.c | 1 - net/phonet/sysctl.c | 1 - net/rds/ib_sysctl.c | 1 - net/rds/sysctl.c | 1 - net/rds/tcp.c | 1 - net/rose/sysctl_net_rose.c | 1 - net/rxrpc/sysctl.c | 1 - net/sctp/sysctl.c | 6 +----- net/smc/smc_sysctl.c | 1 - net/sunrpc/sysctl.c | 1 - net/sunrpc/xprtrdma/svc_rdma.c | 1 - net/sunrpc/xprtrdma/transport.c | 1 - net/sunrpc/xprtsock.c | 1 - net/tipc/sysctl.c | 1 - net/unix/sysctl_net_unix.c | 1 - net/x25/sysctl_net_x25.c | 1 - net/xfrm/xfrm_sysctl.c | 5 +---- 35 files changed, 20 insertions(+), 81 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 552719c3bbc3d..b0327402b3e66 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3728,7 +3728,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), @@ -3779,7 +3779,6 @@ static struct neigh_sysctl_table { .extra2 = SYSCTL_INT_MAX, .proc_handler = proc_dointvec_minmax, }, - {}, }, }; @@ -3807,8 +3806,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, - sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1; } else { struct neigh_table *tbl = p->tbl; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6973dda3abda6..46f5143e86be1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -660,7 +660,6 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { @@ -697,7 +696,6 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -715,7 +713,8 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + struct ctl_table *tbl; + size_t table_size = ARRAY_SIZE(netns_core_table); tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -723,8 +722,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (int i = 0; i < table_size; ++i) + (tbl + i)->data += (char *)net - (char *)&init_net; } net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index ee8d4f5afa722..3fc474d6e57dd 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -90,8 +90,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - - { } }; static struct ctl_table_header *dccp_table_header; diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6dd960ec558cf..09b18ee6df003 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -338,7 +338,6 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -351,7 +350,6 @@ static struct ctl_table lowpan_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) @@ -370,10 +368,8 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) goto err_alloc; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7a437f0d41905..6195cc5be1fcc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2515,7 +2515,7 @@ static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; + struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -2578,7 +2578,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, if (!t) goto out; - for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; @@ -2652,7 +2652,6 @@ static struct ctl_table ctl_forward_entry[] = { .extra1 = &ipv4_devconf, .extra2 = &init_net, }, - { }, }; #endif diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb947d1613fe2..cb1be83748c38 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -580,7 +580,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, - { } }; /* secret interval has been deprecated */ @@ -593,7 +592,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c8f76f56dc165..deecfc0e5a91c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3509,7 +3509,6 @@ static struct ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static const char ipv4_route_flush_procname[] = "flush"; @@ -3543,7 +3542,6 @@ static struct ctl_table ipv4_route_netns_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; static __net_init int sysctl_route_net_init(struct net *net) @@ -3561,16 +3559,14 @@ static __net_init int sysctl_route_net_init(struct net *net) /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { - if (tbl[0].procname != ipv4_route_flush_procname) { - tbl[0].procname = NULL; + if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; - } } /* Update the variables to point into the current struct net * except for the first element flush */ - for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++) + for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 7e4f16a7dcc17..c8dae8fc682a0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -575,7 +575,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, - { } }; static struct ctl_table ipv4_net_table[] = { @@ -1502,12 +1501,12 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, - { } }; static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; + size_t table_size = ARRAY_SIZE(ipv4_net_table); table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -1517,7 +1516,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + for (i = 0; i < table_size; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net @@ -1533,7 +1532,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table, - ARRAY_SIZE(ipv4_net_table)); + table_size); if (!net->ipv4.ipv4_hdr) goto err_reg; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c33bca2c38415..4c74fec034c5d 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -152,7 +152,6 @@ static struct ctl_table xfrm4_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __net_init int xfrm4_net_sysctl_init(struct net *net) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 92db9b474f2bd..841315de5cf25 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7182,9 +7182,6 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { - /* sentinel */ - } }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, @@ -7198,7 +7195,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, if (!table) goto out; - for (i = 0; table[i].data; i++) { + for (i = 0; i < ARRAY_SIZE(addrconf_sysctl); i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; /* If one of these is already set, then it is not safe to * overwrite either of them: this makes proc_dointvec_minmax diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 1635da07285f2..91cbf8e8009fe 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1206,7 +1206,6 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index acb4f119e11f0..afb343cb77ac6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -436,7 +436,6 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; /* secret interval has been deprecated */ @@ -449,7 +448,6 @@ static struct ctl_table ip6_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; static int __net_init ip6_frags_ns_sysctl_register(struct net *net) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1f4b935a0e57a..b49137c3031b2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6428,7 +6428,6 @@ static struct ctl_table ipv6_route_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) @@ -6452,10 +6451,6 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[1].procname = NULL; } return table; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 888676163e900..9f40bb12fbdd5 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -213,7 +213,6 @@ static struct ctl_table ipv6_table_template[] = { .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, - { } }; static struct ctl_table ipv6_rotable[] = { @@ -248,7 +247,6 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ - { } }; static int __net_init ipv6_sysctl_net_init(struct net *net) @@ -264,7 +262,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ - for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++) + for (i = 0; i < ARRAY_SIZE(ipv6_table_template); i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 42fb6996b0777..499b5f5c19fc3 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -184,7 +184,6 @@ static struct ctl_table xfrm6_policy_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int __net_init xfrm6_net_sysctl_init(struct net *net) diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 8443a6d841b06..72e101135f8cb 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -44,11 +44,6 @@ static struct ctl_table llc2_timeout_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, -}; - -static struct ctl_table llc_station_table[] = { - { }, }; static struct ctl_table_header *llc2_timeout_header; @@ -56,8 +51,9 @@ static struct ctl_table_header *llc_station_header; int __init llc_sysctl_init(void) { + struct ctl_table empty[1] = {}; llc2_timeout_header = register_net_sysctl(&init_net, "net/llc/llc2/timeout", llc2_timeout_table); - llc_station_header = register_net_sysctl(&init_net, "net/llc/station", llc_station_table); + llc_station_header = register_net_sysctl_sz(&init_net, "net/llc/station", empty, 0); if (!llc2_timeout_header || !llc_station_header) { llc_sysctl_exit(); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 6dab883a08dda..e163fac55ffad 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1393,7 +1393,6 @@ static const struct ctl_table mpls_dev_table[] = { .proc_handler = mpls_conf_proc, .data = MPLS_PERDEV_SYSCTL_OFFSET(input_enabled), }, - { } }; static int mpls_dev_sysctl_register(struct net_device *dev, @@ -2689,7 +2688,7 @@ static int mpls_net_init(struct net *net) /* Table data contains only offsets relative to the base of * the mdev at this point, so make them absolute. */ - for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + for (i = 0; i < ARRAY_SIZE(mpls_table); i++) table[i].data = (char *)net + (uintptr_t)table[i].data; net->mpls.ctl = register_net_sysctl_sz(net, "net/mpls", table, diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 13fe0748dde83..8bf7c26a08780 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -156,7 +156,6 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index 79fb2d3f477b5..7dc0fa628f2e7 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -140,7 +140,6 @@ static struct ctl_table nr_table[] = { .extra1 = &min_reset, .extra2 = &max_reset }, - { } }; int __init nr_register_sysctl(void) diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 0d0bf41381c22..82fc22467a095 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -81,7 +81,6 @@ static struct ctl_table phonet_table[] = { .mode = 0644, .proc_handler = proc_local_port_range, }, - { } }; int __init phonet_sysctl_init(void) diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index e4e41b3afce71..2af678e71e3c6 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -103,7 +103,6 @@ static struct ctl_table rds_ib_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_ib_sysctl_exit(void) diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index e381bbcd9cc1c..025f518a43495 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -89,7 +89,6 @@ static struct ctl_table rds_sysctl_rds_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; void rds_sysctl_exit(void) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 2dba7505b4148..d8111ac83bb6a 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -86,7 +86,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { .proc_handler = rds_tcp_skbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, - { } }; u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c index d391d7758f52a..d801315b70839 100644 --- a/net/rose/sysctl_net_rose.c +++ b/net/rose/sysctl_net_rose.c @@ -112,7 +112,6 @@ static struct ctl_table rose_table[] = { .extra1 = &min_window, .extra2 = &max_window }, - { } }; void __init rose_register_sysctl(void) diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index c9bedd0e2d867..9bf9a1f6e4cb2 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -127,7 +127,6 @@ static struct ctl_table rxrpc_sysctl_table[] = { .extra1 = (void *)SYSCTL_ONE, .extra2 = (void *)&four, }, - { } }; int __init rxrpc_sysctl_init(void) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index f65d6f92afcbc..6894072af210c 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -80,8 +80,6 @@ static struct ctl_table sctp_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - - { /* sentinel */ } }; /* The following index defines are used in sctp_sysctl_net_register(). @@ -384,8 +382,6 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &pf_expose_max, }, - - { /* sentinel */ } }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, @@ -604,7 +600,7 @@ int sctp_sysctl_net_register(struct net *net) if (!table) return -ENOMEM; - for (i = 0; table[i].data; i++) + for (i = 0; i < ARRAY_SIZE(sctp_net_table); i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a5946d1b9d60c..bd0b7e2f88245 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -90,7 +90,6 @@ static struct ctl_table smc_table[] = { .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, - { } }; int __net_init smc_sysctl_net_init(struct net *net) diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 93941ab12549f..5f3170a1c9bbe 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -160,7 +160,6 @@ static struct ctl_table debug_table[] = { .mode = 0444, .proc_handler = proc_do_xprt, }, - { } }; void diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index f86970733eb0a..474f7a98fe9ee 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -209,7 +209,6 @@ static struct ctl_table svcrdma_parm_table[] = { .extra1 = &zero, .extra2 = &zero, }, - { }, }; static void svc_rdma_proc_cleanup(void) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 29b0562d62e75..9a8ce5df83cad 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -137,7 +137,6 @@ static struct ctl_table xr_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; #endif diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bb9b747d58a1a..f62f7b65455b8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -160,7 +160,6 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; /* diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 9fb65c988f7f3..30d2e06e3d8c7 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -91,7 +91,6 @@ static struct ctl_table tipc_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - {} }; int tipc_register_sysctl(void) diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 3e84b31c355a2..ae45d4cfac394 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -19,7 +19,6 @@ static struct ctl_table unix_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { } }; int __net_init unix_sysctl_register(struct net *net) diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index e9802afa43d0d..643f50874dfe6 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -71,7 +71,6 @@ static struct ctl_table x25_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; int __init x25_register_sysctl(void) diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c index 7fdeafc838a7c..b0f542805e6eb 100644 --- a/net/xfrm/xfrm_sysctl.c +++ b/net/xfrm/xfrm_sysctl.c @@ -38,7 +38,6 @@ static struct ctl_table xfrm_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - {} }; int __net_init xfrm_sysctl_init(struct net *net) @@ -57,10 +56,8 @@ int __net_init xfrm_sysctl_init(struct net *net) table[3].data = &net->xfrm.sysctl_acq_expires; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - table[0].procname = NULL; + if (net->user_ns != &init_user_ns) table_size = 0; - } net->xfrm.sysctl_hdr = register_net_sysctl_sz(net, "net/core", table, table_size); From ad62c4b9a42a58172c01aa32addbd26f3082b63e Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:40:03 +0100 Subject: [PATCH 121/167] netfilter: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) * Remove sentinel elements from ctl_table structs * Remove instances where an array element is zeroed out to make it look like a sentinel. This is not longer needed and is safe after commit c899710fe7f9 ("networking: Update to register_net_sysctl_sz") added the array size to the ctl_table registration * Remove the need for having __NF_SYSCTL_CT_LAST_SYSCTL as the sysctl array size is now in NF_SYSCTL_CT_LAST_SYSCTL * Remove extra element in ctl_table arrays declarations Signed-off-by: Joel Granados Signed-off-by: NipaLocal --- net/bridge/br_netfilter_hooks.c | 1 - net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/netfilter/ipvs/ip_vs_ctl.c | 5 +---- net/netfilter/ipvs/ip_vs_lblc.c | 5 +---- net/netfilter/ipvs/ip_vs_lblcr.c | 5 +---- net/netfilter/nf_conntrack_standalone.c | 6 +----- net/netfilter/nf_log.c | 3 +-- 7 files changed, 5 insertions(+), 21 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 35e10c5a766d5..d31f57ffe9859 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1219,7 +1219,6 @@ static struct ctl_table brnf_table[] = { .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, - { } }; static inline void br_netfilter_sysctl_default(struct brnf_net *brnf) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d0dcbaca19943..abf6e24ae4ecf 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -62,7 +62,6 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { } }; static int nf_ct_frag6_sysctl_register(struct net *net) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 143a341bbc0a4..50b5dbe40eb85 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2263,7 +2263,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec, }, #endif - { } }; #endif @@ -4286,10 +4285,8 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; + if (net->user_ns != &init_user_ns) ctl_table_size = 0; - } } else tbl = vs_vars; /* Initialize sysctl defaults */ diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 8ceec7a2fa8f3..2423513d701d4 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -123,7 +123,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -563,10 +562,8 @@ static int __net_init __ip_vs_lblc_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblc_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblc_ctl_table = vs_vars_table; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0fb64707213f8..cdb1d4bf6761c 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -294,7 +294,6 @@ static struct ctl_table vs_vars_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { } }; #endif @@ -749,10 +748,8 @@ static int __net_init __ip_vs_lblcr_init(struct net *net) return -ENOMEM; /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - ipvs->lblcr_ctl_table[0].procname = NULL; + if (net->user_ns != &init_user_ns) vars_table_size = 0; - } } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee98ce5b8165..2f226cfb32d04 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -616,11 +616,9 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LWTUNNEL, #endif - __NF_SYSCTL_CT_LAST_SYSCTL, + NF_SYSCTL_CT_LAST_SYSCTL, }; -#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1) - static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_MAX] = { .procname = "nf_conntrack_max", @@ -957,7 +955,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, #endif - {} }; static struct ctl_table nf_ct_netfilter_table[] = { @@ -968,7 +965,6 @@ static struct ctl_table nf_ct_netfilter_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 370f8231385ca..d42ba733496b2 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -395,7 +395,7 @@ static const struct seq_operations nflog_seq_ops = { #ifdef CONFIG_SYSCTL static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; -static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO]; static struct ctl_table_header *nf_log_sysctl_fhdr; static struct ctl_table nf_log_sysctl_ftable[] = { @@ -406,7 +406,6 @@ static struct ctl_table nf_log_sysctl_ftable[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static int nf_log_proc_dostring(struct ctl_table *table, int write, From fb1273ce3a8ac444e4bac324e73a3dcf521496fb Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:40:04 +0100 Subject: [PATCH 122/167] appletalk: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel from atalk_table ctl_table array. Signed-off-by: Joel Granados Signed-off-by: NipaLocal --- net/appletalk/sysctl_net_atalk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c index d945b7c0176db..7aebfe9032425 100644 --- a/net/appletalk/sysctl_net_atalk.c +++ b/net/appletalk/sysctl_net_atalk.c @@ -40,7 +40,6 @@ static struct ctl_table atalk_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { }, }; static struct ctl_table_header *atalk_table_header; From 3058ccb7efc7ac5a5109e3a138bd3ba849d0f1cf Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:40:05 +0100 Subject: [PATCH 123/167] ax.25: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) When we remove the sentinel from ax25_param_table a buffer overflow shows its ugly head. The sentinel's data element used to be changed when CONFIG_AX25_DAMA_SLAVE was not defined. This did not have any adverse effects as we still stopped on the sentinel because of its null procname. But now that we do not have the sentinel element, we are careful to check ax25_param_table's size. Signed-off-by: Joel Granados Signed-off-by: NipaLocal --- net/ax25/sysctl_net_ax25.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index db66e11e7fe8b..e55be8817a1e4 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -141,8 +141,6 @@ static const struct ctl_table ax25_param_table[] = { .extra2 = &max_ds_timeout }, #endif - - { } /* that's all, folks! */ }; int ax25_register_dev_sysctl(ax25_dev *ax25_dev) @@ -155,7 +153,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev) if (!table) return -ENOMEM; - for (k = 0; k < AX25_MAX_VALUES; k++) + for (k = 0; k < AX25_MAX_VALUES && k < ARRAY_SIZE(ax25_param_table); k++) table[k].data = &ax25_dev->values[k]; snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name); From 6071815984fc72eb76702055a98a0a8c627620b1 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Thu, 28 Mar 2024 16:41:44 +0100 Subject: [PATCH 124/167] s390/ism: fix receive message buffer allocation Since [1], dma_alloc_coherent() does not accept requests for GFP_COMP anymore, even on archs that may be able to fulfill this. Functionality that relied on the receive buffer being a compound page broke at that point: The SMC-D protocol, that utilizes the ism device driver, passes receive buffers to the splice processor in a struct splice_pipe_desc with a single entry list of struct pages. As the buffer is no longer a compound page, the splice processor now rejects requests to handle more than a page worth of data. Replace dma_alloc_coherent() and allocate a buffer with kmalloc() then create a DMA map for it with dma_map_page(). Since only receive buffers on ISM devices use DMA, qualify the mapping as FROM_DEVICE. Since ISM devices are available on arch s390, only and on that arch all DMA is coherent, there is no need to introduce and export some kind of dma_sync_to_cpu() method to be called by the SMC-D protocol layer. Analogously, replace dma_free_coherent by a two step dma_unmap_page, then kfree to free the receive buffer. [1] https://lore.kernel.org/all/20221113163535.884299-1-hch@lst.de/ Fixes: c08004eede4b ("s390/ism: don't pass bogus GFP_ flags to dma_alloc_coherent") Signed-off-by: Gerd Bayer Signed-off-by: NipaLocal --- drivers/s390/net/ism_drv.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2c8e964425dc3..25911b887e5ee 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "ism.h" @@ -292,13 +294,15 @@ static int ism_read_local_gid(struct ism_dev *ism) static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); - dma_free_coherent(&ism->pdev->dev, dmb->dmb_len, - dmb->cpu_addr, dmb->dma_addr); + dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, + DMA_FROM_DEVICE); + kfree(dmb->cpu_addr); } static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { unsigned long bit; + int rc; if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; @@ -315,14 +319,27 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) return -EINVAL; - dmb->cpu_addr = dma_alloc_coherent(&ism->pdev->dev, dmb->dmb_len, - &dmb->dma_addr, - GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | __GFP_NORETRY); - if (!dmb->cpu_addr) - clear_bit(dmb->sba_idx, ism->sba_bitmap); + dmb->cpu_addr = kmalloc(dmb->dmb_len, GFP_KERNEL | __GFP_NOWARN | + __GFP_COMP | __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!dmb->cpu_addr) { + rc = -ENOMEM; + goto out_bit; + } + dmb->dma_addr = dma_map_page(&ism->pdev->dev, + virt_to_page(dmb->cpu_addr), 0, + dmb->dmb_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&ism->pdev->dev, dmb->dma_addr)) { + rc = -ENOMEM; + goto out_free; + } + + return 0; - return dmb->cpu_addr ? 0 : -ENOMEM; +out_free: + kfree(dmb->cpu_addr); +out_bit: + clear_bit(dmb->sba_idx, ism->sba_bitmap); + return rc; } int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, From a15c34d8aceb64ba1573e65f72266e71b513cfd0 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Thu, 28 Mar 2024 15:59:29 +0000 Subject: [PATCH 125/167] net: fec: Set mac_managed_pm during probe Setting mac_managed_pm during interface up is too late. In situations where the link is not brought up yet and the system suspends the regular PHY power management will run. Since the FEC ETHEREN control bit is cleared (automatically) on suspend the controller is off in resume. When the regular PHY power management resume path runs in this context it will write to the MII_DATA register but nothing will be transmitted on the MDIO bus. This can be observed by the following log: fec 5b040000.ethernet eth0: MDIO read timeout Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: dpm_run_callback(): mdio_bus_phy_resume+0x0/0xc8 returns -110 Microchip LAN87xx T1 5b040000.ethernet-1:04: PM: failed to resume: error -110 The data written will however remain in the MII_DATA register. When the link later is set to administrative up it will trigger a call to fec_restart() which will restore the MII_SPEED register. This triggers the quirk explained in f166f890c8f0 ("net: ethernet: fec: Replace interrupt driven MDIO with polled IO") causing an extra MII_EVENT. This extra event desynchronizes all the MDIO register reads, causing them to complete too early. Leading all reads to read as 0 because fec_enet_mdio_wait() returns too early. When a Microchip LAN8700R PHY is connected to the FEC, the 0 reads causes the PHY to be initialized incorrectly and the PHY will not transmit any ethernet signal in this state. It cannot be brought out of this state without a power cycle of the PHY. Fixes: 557d5dc83f68 ("net: fec: use mac-managed PHY PM") Closes: https://lore.kernel.org/netdev/1f45bdbe-eab1-4e59-8f24-add177590d27@actia.se/ Signed-off-by: Wei Fang [jernberg: commit message] Signed-off-by: John Ernberg Signed-off-by: NipaLocal --- drivers/net/ethernet/freescale/fec_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index d7693fdf640d5..8bd213da8fb6f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2454,8 +2454,6 @@ static int fec_enet_mii_probe(struct net_device *ndev) fep->link = 0; fep->full_duplex = 0; - phy_dev->mac_managed_pm = true; - phy_attached_info(phy_dev); return 0; @@ -2467,10 +2465,12 @@ static int fec_enet_mii_init(struct platform_device *pdev) struct net_device *ndev = platform_get_drvdata(pdev); struct fec_enet_private *fep = netdev_priv(ndev); bool suppress_preamble = false; + struct phy_device *phydev; struct device_node *node; int err = -ENXIO; u32 mii_speed, holdtime; u32 bus_freq; + int addr; /* * The i.MX28 dual fec interfaces are not equal. @@ -2584,6 +2584,13 @@ static int fec_enet_mii_init(struct platform_device *pdev) goto err_out_free_mdiobus; of_node_put(node); + /* find all the PHY devices on the bus and set mac_managed_pm to true */ + for (addr = 0; addr < PHY_MAX_ADDR; addr++) { + phydev = mdiobus_get_phy(fep->mii_bus, addr); + if (phydev) + phydev->mac_managed_pm = true; + } + mii_cnt++; /* save fec0 mii_bus */ From 5110b594fb2d4621b914a6a86443e8c32c1bee57 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Thu, 28 Mar 2024 19:55:05 +0300 Subject: [PATCH 126/167] octeontx2-af: Add array index check In rvu_map_cgx_lmac_pf() the 'iter', which is used as an array index, can reach value (up to 14) that exceed the size (MAX_LMAC_COUNT = 8) of the array. Fix this bug by adding 'iter' value check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") Signed-off-by: Aleksandr Mishin Signed-off-by: NipaLocal --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 72e060cf6b618..e9bf9231b0185 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -160,6 +160,8 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) continue; lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { + if (iter >= MAX_LMAC_COUNT) + continue; lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), iter); rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); From 6a96948db8266596afd1e41f1501c980ad4efea2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 28 Mar 2024 15:52:49 -0600 Subject: [PATCH 127/167] net/smc: Avoid -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end is coming in GCC-14, and we are getting ready to enable it globally. There are currently a couple of objects in `struct smc_clc_msg_proposal_area` that contain a couple of flexible structures: struct smc_clc_msg_proposal_area { ... struct smc_clc_v2_extension pclc_v2_ext; ... struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; ... }; So, in order to avoid ending up with a couple of flexible-array members in the middle of a struct, we use the `struct_group_tagged()` helper to separate the flexible array from the rest of the members in the flexible structure: struct smc_clc_smcd_v2_extension { struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, u8 system_eid[SMC_MAX_EID_LEN]; u8 reserved[16]; ); struct smc_clc_smcd_gid_chid gidchid[]; }; With the change described above, we now declare objects of the type of the tagged struct without embedding flexible arrays in the middle of another struct: struct smc_clc_msg_proposal_area { ... struct smc_clc_v2_extension_fixed pclc_v2_ext; ... struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; ... }; We also use `container_of()` when we need to retrieve a pointer to the flexible structures. So, with these changes, fix the following warnings: In file included from net/smc/af_smc.c:42: net/smc/smc_clc.h:186:49: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] 186 | struct smc_clc_v2_extension pclc_v2_ext; | ^~~~~~~~~~~ net/smc/smc_clc.h:188:49: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] 188 | struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; | ^~~~~~~~~~~~~~~~ Reviewed-by: Kees Cook Signed-off-by: Gustavo A. R. Silva Signed-off-by: NipaLocal --- net/smc/smc_clc.c | 6 ++++-- net/smc/smc_clc.h | 26 ++++++++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e55026c7529cf..33fa787c28ebb 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -853,8 +853,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; - v2_ext = &pclc->pclc_v2_ext; - smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + v2_ext = container_of(&pclc->pclc_v2_ext, + struct smc_clc_v2_extension, fixed); + smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, + struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 7cc7070b9772d..467effb50cd65 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -134,12 +134,15 @@ struct smc_clc_smcd_gid_chid { */ struct smc_clc_v2_extension { - struct smc_clnt_opts_area_hdr hdr; - u8 roce[16]; /* RoCEv2 GID */ - u8 max_conns; - u8 max_links; - __be16 feature_mask; - u8 reserved[12]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_v2_extension_fixed, fixed, + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + __be16 feature_mask; + u8 reserved[12]; + ); u8 user_eids[][SMC_MAX_EID_LEN]; }; @@ -159,8 +162,11 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ }; struct smc_clc_smcd_v2_extension { - u8 system_eid[SMC_MAX_EID_LEN]; - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + ); struct smc_clc_smcd_gid_chid gidchid[]; }; @@ -183,9 +189,9 @@ struct smc_clc_msg_proposal_area { struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_v2_extension pclc_v2_ext; + struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; - struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; From 8796dcdbe9ff481d75a671e932f683ddda60de47 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 29 Mar 2024 09:50:23 +0800 Subject: [PATCH 128/167] ax25: fix use-after-free bugs caused by ax25_ds_del_timer When the ax25 device is detaching, the ax25_dev_device_down() calls ax25_ds_del_timer() to cleanup the slave_timer. When the timer handler is running, the ax25_ds_del_timer() that calls del_timer() in it will return directly. As a result, the use-after-free bugs could happen, one of the scenarios is shown below: (Thread 1) | (Thread 2) | ax25_ds_timeout() ax25_dev_device_down() | ax25_ds_del_timer() | del_timer() | ax25_dev_put() //FREE | | ax25_dev-> //USE In order to mitigate bugs, when the device is detaching, use timer_shutdown_sync() to stop the timer. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Signed-off-by: NipaLocal --- net/ax25/ax25_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index c5462486dbca1..282ec581c0720 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -105,7 +105,7 @@ void ax25_dev_device_down(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE - ax25_ds_del_timer(ax25_dev); + timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* From 91c008266d008a6ac340cc57da413f35f1a41f18 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Fri, 29 Mar 2024 09:16:31 +0300 Subject: [PATCH 129/167] net: phy: micrel: Fix potential null pointer dereference In lan8814_get_sig_rx() and lan8814_get_sig_tx() ptp_parse_header() may return NULL as ptp_header due to abnormal packet type or corrupted packet. Fix this bug by adding ptp_header check. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") Signed-off-by: Aleksandr Mishin Signed-off-by: NipaLocal --- drivers/net/phy/micrel.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 8b8634600c519..0f8a8ad7ea0bd 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -2537,7 +2537,7 @@ static void lan8814_txtstamp(struct mii_timestamper *mii_ts, } } -static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2547,7 +2547,11 @@ static void lan8814_get_sig_rx(struct sk_buff *skb, u16 *sig) ptp_header = ptp_parse_header(skb, type); skb_pull_inline(skb, ETH_HLEN); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2559,7 +2563,8 @@ static bool lan8814_match_rx_skb(struct kszphy_ptp_priv *ptp_priv, bool ret = false; u16 skb_sig; - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + return ret; /* Iterate over all RX timestamps and match it with the received skbs */ spin_lock_irqsave(&ptp_priv->rx_ts_lock, flags); @@ -2834,7 +2839,7 @@ static int lan8814_ptpci_adjfine(struct ptp_clock_info *ptpci, long scaled_ppm) return 0; } -static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) +static bool lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) { struct ptp_header *ptp_header; u32 type; @@ -2842,7 +2847,11 @@ static void lan8814_get_sig_tx(struct sk_buff *skb, u16 *sig) type = ptp_classify_raw(skb); ptp_header = ptp_parse_header(skb, type); + if (!ptp_header) + return false; + *sig = (__force u16)(ntohs(ptp_header->sequence_id)); + return true; } static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, @@ -2856,7 +2865,8 @@ static void lan8814_match_tx_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->tx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->tx_queue, skb, skb_tmp) { - lan8814_get_sig_tx(skb, &skb_sig); + if (!lan8814_get_sig_tx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &seq_id, sizeof(seq_id))) continue; @@ -2910,7 +2920,8 @@ static bool lan8814_match_skb(struct kszphy_ptp_priv *ptp_priv, spin_lock_irqsave(&ptp_priv->rx_queue.lock, flags); skb_queue_walk_safe(&ptp_priv->rx_queue, skb, skb_tmp) { - lan8814_get_sig_rx(skb, &skb_sig); + if (!lan8814_get_sig_rx(skb, &skb_sig)) + continue; if (memcmp(&skb_sig, &rx_ts->seq_id, sizeof(rx_ts->seq_id))) continue; From 321647bd32968161a5e956d47b9e26159c5aa3ca Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Fri, 29 Mar 2024 18:56:09 +0800 Subject: [PATCH 130/167] net/iucv: Avoid explicit cpumask var allocation on stack For CONFIG_CPUMASK_OFFSTACK=y kernel, explicit allocation of cpumask variable on stack is not recommended since it can cause potential stack overflow. Instead, kernel code should always use *cpumask_var API(s) to allocate cpumask var in config-neutral way, leaving allocation strategy to CONFIG_CPUMASK_OFFSTACK. Use *cpumask_var API(s) to address it. Signed-off-by: Dawei Li Signed-off-by: NipaLocal --- net/iucv/iucv.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index a4ab615ca3e3e..b51f46ec32f95 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -520,14 +520,19 @@ static void iucv_setmask_mp(void) */ static void iucv_setmask_up(void) { - cpumask_t cpumask; + cpumask_var_t cpumask; int cpu; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return; + /* Disable all cpu but the first in cpu_irq_cpumask. */ - cpumask_copy(&cpumask, &iucv_irq_cpumask); - cpumask_clear_cpu(cpumask_first(&iucv_irq_cpumask), &cpumask); - for_each_cpu(cpu, &cpumask) + cpumask_copy(cpumask, &iucv_irq_cpumask); + cpumask_clear_cpu(cpumask_first(&iucv_irq_cpumask), cpumask); + for_each_cpu(cpu, cpumask) smp_call_function_single(cpu, iucv_block_cpu, NULL, 1); + + free_cpumask_var(cpumask); } /* @@ -628,23 +633,33 @@ static int iucv_cpu_online(unsigned int cpu) static int iucv_cpu_down_prep(unsigned int cpu) { - cpumask_t cpumask; + cpumask_var_t cpumask; + int ret = 0; if (!iucv_path_table) return 0; - cpumask_copy(&cpumask, &iucv_buffer_cpumask); - cpumask_clear_cpu(cpu, &cpumask); - if (cpumask_empty(&cpumask)) + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(cpumask, &iucv_buffer_cpumask); + cpumask_clear_cpu(cpu, cpumask); + if (cpumask_empty(cpumask)) { /* Can't offline last IUCV enabled cpu. */ - return -EINVAL; + ret = -EINVAL; + goto __free_cpumask; + } iucv_retrieve_cpu(NULL); if (!cpumask_empty(&iucv_irq_cpumask)) - return 0; + goto __free_cpumask; + smp_call_function_single(cpumask_first(&iucv_buffer_cpumask), iucv_allow_cpu, NULL, 1); - return 0; + +__free_cpumask: + free_cpumask_var(cpumask); + return ret; } /** From 96ee19125f20d040942000f4d9a35845afccef8a Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Fri, 29 Mar 2024 18:56:10 +0800 Subject: [PATCH 131/167] net/dpaa2: Avoid explicit cpumask var allocation on stack For CONFIG_CPUMASK_OFFSTACK=y kernel, explicit allocation of cpumask variable on stack is not recommended since it can cause potential stack overflow. Instead, kernel code should always use *cpumask_var API(s) to allocate cpumask var in config-neutral way, leaving allocation strategy to CONFIG_CPUMASK_OFFSTACK. Use *cpumask_var API(s) to address it. Signed-off-by: Dawei Li Signed-off-by: NipaLocal --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 888509cf1f210..40e8818295951 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2896,11 +2896,14 @@ static int dpaa2_eth_xdp_xmit(struct net_device *net_dev, int n, static int update_xps(struct dpaa2_eth_priv *priv) { struct net_device *net_dev = priv->net_dev; - struct cpumask xps_mask; - struct dpaa2_eth_fq *fq; int i, num_queues, netdev_queues; + struct dpaa2_eth_fq *fq; + cpumask_var_t xps_mask; int err = 0; + if (!alloc_cpumask_var(&xps_mask, GFP_KERNEL)) + return -ENOMEM; + num_queues = dpaa2_eth_queue_count(priv); netdev_queues = (net_dev->num_tc ? : 1) * num_queues; @@ -2910,16 +2913,17 @@ static int update_xps(struct dpaa2_eth_priv *priv) for (i = 0; i < netdev_queues; i++) { fq = &priv->fq[i % num_queues]; - cpumask_clear(&xps_mask); - cpumask_set_cpu(fq->target_cpu, &xps_mask); + cpumask_clear(xps_mask); + cpumask_set_cpu(fq->target_cpu, xps_mask); - err = netif_set_xps_queue(net_dev, &xps_mask, i); + err = netif_set_xps_queue(net_dev, xps_mask, i); if (err) { netdev_warn_once(net_dev, "Error setting XPS queue\n"); break; } } + free_cpumask_var(xps_mask); return err; } From b7884653f803ea0c688d35cb5db95a7a3975d64f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 29 Mar 2024 13:08:52 +0100 Subject: [PATCH 132/167] mptcp: don't account accept() of non-MPC client as fallback to TCP Current MPTCP servers increment MPTcpExtMPCapableFallbackACK when they accept non-MPC connections. As reported by Christoph, this is "surprising" because the counter might become greater than MPTcpExtMPCapableSYNRX. MPTcpExtMPCapableFallbackACK counter's name suggests it should only be incremented when a connection was seen using MPTCP options, then a fallback to TCP has been done. Let's do that by incrementing it when the subflow context of an inbound MPC connection attempt is dropped. Also, update mptcp_connect.sh kselftest, to ensure that the above MIB does not increment in case a pure TCP client connects to a MPTCP server. Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/449 Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: NipaLocal --- net/mptcp/protocol.c | 2 -- net/mptcp/subflow.c | 2 ++ tools/testing/selftests/net/mptcp/mptcp_connect.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3a1967bc7bad6..7e74b812e366a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3937,8 +3937,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_set_state(newsk, TCP_CLOSE); } } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); tcpfallback: newsk->sk_kern_sock = kern; lock_sock(newsk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1626dd20c68f1..6042a47da61be 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -905,6 +905,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, return child; fallback: + if (fallback) + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 4c42485548262..4131f3263a482 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -383,12 +383,14 @@ do_transfer() local stat_cookierx_last local stat_csum_err_s local stat_csum_err_c + local stat_tcpfb_last_l stat_synrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_last=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_csum_err_s=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtDataCsumErr") stat_csum_err_c=$(mptcp_lib_get_counter "${connector_ns}" "MPTcpExtDataCsumErr") + stat_tcpfb_last_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") timeout ${timeout_test} \ ip netns exec ${listener_ns} \ @@ -457,11 +459,13 @@ do_transfer() local stat_cookietx_now local stat_cookierx_now local stat_ooo_now + local stat_tcpfb_now_l stat_synrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") stat_ackrx_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") stat_cookietx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesSent") stat_cookierx_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtSyncookiesRecv") stat_ooo_now=$(mptcp_lib_get_counter "${listener_ns}" "TcpExtTCPOFOQueue") + stat_tcpfb_now_l=$(mptcp_lib_get_counter "${listener_ns}" "MPTcpExtMPCapableFallbackACK") expect_synrx=$((stat_synrx_last_l)) expect_ackrx=$((stat_ackrx_last_l)) @@ -508,6 +512,11 @@ do_transfer() fi fi + if [ ${stat_ooo_now} -eq 0 ] && [ ${stat_tcpfb_last_l} -ne ${stat_tcpfb_now_l} ]; then + mptcp_lib_pr_fail "unexpected fallback to TCP" + rets=1 + fi + if [ $cookies -eq 2 ];then if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then extra+=" WARN: CookieSent: did not advance" From 2f9f25be096c576418430f3af2055a3e18624fa7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 29 Mar 2024 13:08:53 +0100 Subject: [PATCH 133/167] selftests: mptcp: join: fix dev in check_endpoint There's a bug in pm_nl_check_endpoint(), 'dev' didn't be parsed correctly. If calling it in the 2nd test of endpoint_tests() too, it fails with an error like this: creation [FAIL] expected '10.0.2.2 id 2 subflow dev dev' \ found '10.0.2.2 id 2 subflow dev ns2eth2' The reason is '$2' should be set to 'dev', not '$1'. This patch fixes it. Fixes: 69c6ce7b6eca ("selftests: mptcp: add implicit endpoint test case") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: NipaLocal --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 5e9211e898256..e4403236f6554 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -729,7 +729,7 @@ pm_nl_check_endpoint() [ -n "$_flags" ]; flags="flags $_flags" shift elif [ $1 = "dev" ]; then - [ -n "$2" ]; dev="dev $1" + [ -n "$2" ]; dev="dev $2" shift elif [ $1 = "id" ]; then _id=$2 @@ -3610,6 +3610,8 @@ endpoint_tests() local tests_pid=$! wait_mpj $ns2 + pm_nl_check_endpoint "creation" \ + $ns2 10.0.2.2 id 2 flags subflow dev ns2eth2 chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 From 05a779426d517e1459d41f06aff8014960802d75 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Fri, 29 Mar 2024 21:40:37 +0800 Subject: [PATCH 134/167] bpf, skmsg: fix NULL pointer dereference in sk_psock_skb_ingress_enqueue Fix NULL pointer data-races in sk_psock_skb_ingress_enqueue() which syzbot reported [1]. [1] BUG: KCSAN: data-race in sk_psock_drop / sk_psock_skb_ingress_enqueue write to 0xffff88814b3278b8 of 8 bytes by task 10724 on cpu 1: sk_psock_stop_verdict net/core/skmsg.c:1257 [inline] sk_psock_drop+0x13e/0x1f0 net/core/skmsg.c:843 sk_psock_put include/linux/skmsg.h:459 [inline] sock_map_close+0x1a7/0x260 net/core/sock_map.c:1648 unix_release+0x4b/0x80 net/unix/af_unix.c:1048 __sock_release net/socket.c:659 [inline] sock_close+0x68/0x150 net/socket.c:1421 __fput+0x2c1/0x660 fs/file_table.c:422 __fput_sync+0x44/0x60 fs/file_table.c:507 __do_sys_close fs/open.c:1556 [inline] __se_sys_close+0x101/0x1b0 fs/open.c:1541 __x64_sys_close+0x1f/0x30 fs/open.c:1541 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 read to 0xffff88814b3278b8 of 8 bytes by task 10713 on cpu 0: sk_psock_data_ready include/linux/skmsg.h:464 [inline] sk_psock_skb_ingress_enqueue+0x32d/0x390 net/core/skmsg.c:555 sk_psock_skb_ingress_self+0x185/0x1e0 net/core/skmsg.c:606 sk_psock_verdict_apply net/core/skmsg.c:1008 [inline] sk_psock_verdict_recv+0x3e4/0x4a0 net/core/skmsg.c:1202 unix_read_skb net/unix/af_unix.c:2546 [inline] unix_stream_read_skb+0x9e/0xf0 net/unix/af_unix.c:2682 sk_psock_verdict_data_ready+0x77/0x220 net/core/skmsg.c:1223 unix_stream_sendmsg+0x527/0x860 net/unix/af_unix.c:2339 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x140/0x180 net/socket.c:745 ____sys_sendmsg+0x312/0x410 net/socket.c:2584 ___sys_sendmsg net/socket.c:2638 [inline] __sys_sendmsg+0x1e9/0x280 net/socket.c:2667 __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x46/0x50 net/socket.c:2674 do_syscall_64+0xd3/0x1d0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 value changed: 0xffffffff83d7feb0 -> 0x0000000000000000 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 10713 Comm: syz-executor.4 Tainted: G W 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Prior to this, commit 4cd12c6065df ("bpf, sockmap: Fix NULL pointer dereference in sk_psock_verdict_data_ready()") fixed one NULL pointer similarly due to no protection of saved_data_ready. Here is another different caller causing the same issue because of the same reason. So we should protect it with sk_callback_lock read lock because the writer side in the sk_psock_drop() uses "write_lock_bh(&sk->sk_callback_lock);". Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Reported-by: syzbot+aa8c8ec2538929f18f2d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=aa8c8ec2538929f18f2d Signed-off-by: Jason Xing Signed-off-by: NipaLocal --- net/core/skmsg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4d75ef9d24bfa..67c4c01c52359 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -552,7 +552,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, msg->skb = skb; sk_psock_queue_msg(psock, msg); + read_lock_bh(&sk->sk_callback_lock); sk_psock_data_ready(sk, psock); + read_unlock_bh(&sk->sk_callback_lock); return copied; } From 2a881e933111f0b285e8f370cec3264ed45271d4 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 29 Mar 2024 13:50:19 +0000 Subject: [PATCH 135/167] doc: netlink: Change generated docs to limit TOC to depth 3 The tables of contents in the generated Netlink docs include individual attribute definitions. This can make the contents exceedingly long and repeats a lot of what is on the rest of the pages. See for example: https://docs.kernel.org/networking/netlink_spec/tc.html Add a depth limit to the contents directive in generated .rst files to limit the contents depth to 3 levels. This reduces the contents to: - Family - Summary - Operations - op-one - op-two - ... - Definitions - struct-one - struct-two - enum-one - ... - Attribute sets - attrs-one - attrs-two - ... Signed-off-by: Donald Hunter Reviewed-by: Breno Leitao Signed-off-by: NipaLocal --- tools/net/ynl/ynl-gen-rst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 927407b3efb3d..5825a8b3bfb46 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -291,7 +291,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: title = f"Family ``{obj['name']}`` netlink specification" lines.append(rst_title(title)) - lines.append(rst_paragraph(".. contents::\n")) + lines.append(rst_paragraph(".. contents:: :depth: 3\n")) if "doc" in obj: lines.append(rst_subtitle("Summary")) From 9e2e05652c282c476f7f95a925e7061f7c0cf86e Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 29 Mar 2024 13:50:20 +0000 Subject: [PATCH 136/167] doc: netlink: Add hyperlinks to generated Netlink docs Update ynl-gen-rst to generate hyperlinks to definitions, attribute sets and sub-messages from all the places that reference them. Note that there is a single label namespace for all of the kernel docs. Hyperlinks within a single netlink doc need to be qualified by the family name to avoid collisions. The label format is 'family-type-name' which gives, for example, 'rt-link-attribute-set-link-attrs' as the link id. Signed-off-by: Donald Hunter Signed-off-by: NipaLocal --- tools/net/ynl/ynl-gen-rst.py | 60 +++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/tools/net/ynl/ynl-gen-rst.py b/tools/net/ynl/ynl-gen-rst.py index 5825a8b3bfb46..657e881d2ea4a 100755 --- a/tools/net/ynl/ynl-gen-rst.py +++ b/tools/net/ynl/ynl-gen-rst.py @@ -82,9 +82,9 @@ def rst_subsubsection(title: str) -> str: return f"{title}\n" + "~" * len(title) -def rst_section(title: str) -> str: +def rst_section(namespace: str, prefix: str, title: str) -> str: """Add a section to the document""" - return f"\n{title}\n" + "=" * len(title) + return f".. _{namespace}-{prefix}-{title}:\n\n{title}\n" + "=" * len(title) def rst_subtitle(title: str) -> str: @@ -102,6 +102,17 @@ def rst_list_inline(list_: List[str], level: int = 0) -> str: return headroom(level) + "[" + ", ".join(inline(i) for i in list_) + "]" +def rst_ref(namespace: str, prefix: str, name: str) -> str: + """Add a hyperlink to the document""" + mappings = {'enum': 'definition', + 'fixed-header': 'definition', + 'nested-attributes': 'attribute-set', + 'struct': 'definition'} + if prefix in mappings: + prefix = mappings[prefix] + return f":ref:`{namespace}-{prefix}-{name}`" + + def rst_header() -> str: """The headers for all the auto generated RST files""" lines = [] @@ -159,20 +170,24 @@ def parse_do_attributes(attrs: Dict[str, Any], level: int = 0) -> str: return "\n".join(lines) -def parse_operations(operations: List[Dict[str, Any]]) -> str: +def parse_operations(operations: List[Dict[str, Any]], namespace: str) -> str: """Parse operations block""" preprocessed = ["name", "doc", "title", "do", "dump"] + linkable = ["fixed-header", "attribute-set"] lines = [] for operation in operations: - lines.append(rst_section(operation["name"])) + lines.append(rst_section(namespace, 'operation', operation["name"])) lines.append(rst_paragraph(sanitize(operation["doc"])) + "\n") for key in operation.keys(): if key in preprocessed: # Skip the special fields continue - lines.append(rst_fields(key, operation[key], 0)) + value = operation[key] + if key in linkable: + value = rst_ref(namespace, key, value) + lines.append(rst_fields(key, value, 0)) if "do" in operation: lines.append(rst_paragraph(":do:", 0)) @@ -212,14 +227,14 @@ def parse_entries(entries: List[Dict[str, Any]], level: int) -> str: return "\n".join(lines) -def parse_definitions(defs: Dict[str, Any]) -> str: +def parse_definitions(defs: Dict[str, Any], namespace: str) -> str: """Parse definitions section""" preprocessed = ["name", "entries", "members"] ignored = ["render-max"] # This is not printed lines = [] for definition in defs: - lines.append(rst_section(definition["name"])) + lines.append(rst_section(namespace, 'definition', definition["name"])) for k in definition.keys(): if k in preprocessed + ignored: continue @@ -237,14 +252,15 @@ def parse_definitions(defs: Dict[str, Any]) -> str: return "\n".join(lines) -def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: +def parse_attr_sets(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse attribute from attribute-set""" preprocessed = ["name", "type"] + linkable = ["enum", "nested-attributes", "struct", "sub-message"] ignored = ["checks"] lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'attribute-set', entry["name"])) for attr in entry["attributes"]: type_ = attr.get("type") attr_line = attr["name"] @@ -257,25 +273,31 @@ def parse_attr_sets(entries: List[Dict[str, Any]]) -> str: for k in attr.keys(): if k in preprocessed + ignored: continue - lines.append(rst_fields(k, sanitize(attr[k]), 0)) + if k in linkable: + value = rst_ref(namespace, k, attr[k]) + else: + value = sanitize(attr[k]) + lines.append(rst_fields(k, value, 0)) lines.append("\n") return "\n".join(lines) -def parse_sub_messages(entries: List[Dict[str, Any]]) -> str: +def parse_sub_messages(entries: List[Dict[str, Any]], namespace: str) -> str: """Parse sub-message definitions""" lines = [] for entry in entries: - lines.append(rst_section(entry["name"])) + lines.append(rst_section(namespace, 'sub-message', entry["name"])) for fmt in entry["formats"]: value = fmt["value"] lines.append(rst_bullet(bold(value))) for attr in ['fixed-header', 'attribute-set']: if attr in fmt: - lines.append(rst_fields(attr, fmt[attr], 1)) + lines.append(rst_fields(attr, + rst_ref(namespace, attr, fmt[attr]), + 1)) lines.append("\n") return "\n".join(lines) @@ -289,7 +311,9 @@ def parse_yaml(obj: Dict[str, Any]) -> str: lines.append(rst_header()) - title = f"Family ``{obj['name']}`` netlink specification" + family = obj['name'] + + title = f"Family ``{family}`` netlink specification" lines.append(rst_title(title)) lines.append(rst_paragraph(".. contents:: :depth: 3\n")) @@ -300,7 +324,7 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Operations if "operations" in obj: lines.append(rst_subtitle("Operations")) - lines.append(parse_operations(obj["operations"]["list"])) + lines.append(parse_operations(obj["operations"]["list"], family)) # Multicast groups if "mcast-groups" in obj: @@ -310,17 +334,17 @@ def parse_yaml(obj: Dict[str, Any]) -> str: # Definitions if "definitions" in obj: lines.append(rst_subtitle("Definitions")) - lines.append(parse_definitions(obj["definitions"])) + lines.append(parse_definitions(obj["definitions"], family)) # Attributes set if "attribute-sets" in obj: lines.append(rst_subtitle("Attribute sets")) - lines.append(parse_attr_sets(obj["attribute-sets"])) + lines.append(parse_attr_sets(obj["attribute-sets"], family)) # Sub-messages if "sub-messages" in obj: lines.append(rst_subtitle("Sub-messages")) - lines.append(parse_sub_messages(obj["sub-messages"])) + lines.append(parse_sub_messages(obj["sub-messages"], family)) return "\n".join(lines) From 06413c6046b8c04e1421a20d6f356d70f4571ac4 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 29 Mar 2024 13:50:21 +0000 Subject: [PATCH 137/167] doc: netlink: Update tc spec with missing definitions The tc spec referenced tc-u32-mark and tc-act-police-attrs but did not define them. The missing definitions were discovered when building the docs with generated hyperlinks because the hyperlink target labels were missing. Add definitions for tc-u32-mark and tc-act-police-attrs. Signed-off-by: Donald Hunter Signed-off-by: NipaLocal --- Documentation/netlink/specs/tc.yaml | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index 324fa182cd149..6068c105c5ee2 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -1099,6 +1099,19 @@ definitions: - name: offmask type: s32 + - + name: tc-u32-mark + type: struct + members: + - + name: val + type: u32 + - + name: mask + type: u32 + - + name: success + type: u32 - name: tc-u32-sel type: struct @@ -1774,6 +1787,44 @@ attribute-sets: - name: key-ex type: binary + - + name: tc-act-police-attrs + attributes: + - + name: tbf + type: binary + struct: tc-police + - + name: rate + type: binary # TODO + - + name: peakrate + type: binary # TODO + - + name: avrate + type: u32 + - + name: result + type: u32 + - + name: tm + type: binary + struct: tcf-t + - + name: pad + type: pad + - + name: rate64 + type: u64 + - + name: peakrate64 + type: u64 + - + name: pktrate64 + type: u64 + - + name: pktburst64 + type: u64 - name: tc-act-simple-attrs attributes: From 6ecc8d33d675d564d6733c7ce17a5fb5b33bd728 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 14:49:31 +0000 Subject: [PATCH 138/167] inet: preserve const qualifier in inet_csk() We can change inet_csk() to propagate its argument const qualifier, thanks to container_of_const(). We have to fix few places that had mistakes, like tcp_bound_rto(). Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/net/espintcp.h | 2 +- include/net/inet_connection_sock.h | 5 +---- include/net/tcp.h | 2 +- include/net/tls.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_timer.c | 4 ++-- net/mptcp/protocol.h | 2 +- 7 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/net/espintcp.h b/include/net/espintcp.h index 0335bbd76552a..c70efd704b6d5 100644 --- a/include/net/espintcp.h +++ b/include/net/espintcp.h @@ -32,7 +32,7 @@ struct espintcp_ctx { static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* RCU is only needed for diag */ return (__force void *)icsk->icsk_ulp_data; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ccf171f7eb60d..d96c871b0d581 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -147,10 +147,7 @@ struct inet_connection_sock { #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ -static inline struct inet_connection_sock *inet_csk(const struct sock *sk) -{ - return (struct inet_connection_sock *)sk; -} +#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ae35199d3b3c..6c27195619aba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -742,7 +742,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(const struct sock *sk) +static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; diff --git a/include/net/tls.h b/include/net/tls.h index 340ad43971e47..400f4857dc117 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -361,7 +361,7 @@ static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) static inline struct tls_context *tls_get_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d874817a78db..1b6cd38400120 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6999,7 +6999,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc); /* * Return true if a syncookie should be sent */ -static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) +static bool tcp_syn_flood_action(struct sock *sk, const char *proto) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d1ad20ce1c8c7..976db57b95d40 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -25,7 +25,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); u32 elapsed, user_timeout; s32 remaining; @@ -47,7 +47,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a10ebf3ee10a1..46f4655b71236 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -558,7 +558,7 @@ struct mptcp_subflow_context { static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; From 5258b75b4d09afaf02750ce7478b82558b1c21ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:32:03 +0000 Subject: [PATCH 139/167] tcp/dccp: do not care about families in inet_twsk_purge() We lost ability to unload ipv6 module a long time ago. Instead of calling expensive inet_twsk_purge() twice, we can handle all families in one round. Also remove an extra line added in my prior patch, per Kuniyuki Iwashima feedback. Signed-off-by: Eric Dumazet Cc: Kuniyuki Iwashima Link: https://lore.kernel.org/netdev/20240327192934.6843-1-kuniyu@amazon.com/ Signed-off-by: NipaLocal --- include/net/inet_timewait_sock.h | 2 +- include/net/tcp.h | 2 +- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 6 ------ net/ipv4/inet_timewait_sock.c | 9 +++------ net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 ------ 8 files changed, 10 insertions(+), 25 deletions(-) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index f28da08a37b4e..2a536eea9424e 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -111,7 +111,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); +void inet_twsk_purge(struct inet_hashinfo *hashinfo); static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6c27195619aba..9ab5b37e9d532 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -353,7 +353,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); -void tcp_twsk_purge(struct list_head *net_exit_list, int family); +void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 44b033fe1ef68..9fc9cea4c251b 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1039,7 +1039,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net) static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&dccp_hashinfo, AF_INET); + inet_twsk_purge(&dccp_hashinfo); } static struct pernet_operations dccp_v4_ops = { diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ded07e09f8135..c8ca703dc331a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1119,15 +1119,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } -static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo, AF_INET6); -} - static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, - .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index b0cc07d9a568c..e28075f0006e3 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -264,7 +264,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo) { struct inet_ehash_bucket *head = &hashinfo->ehash[0]; unsigned int ehash_mask = hashinfo->ehash_mask; @@ -273,7 +273,6 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) struct sock *sk; for (slot = 0; slot <= ehash_mask; slot++, head++) { - if (hlist_nulls_empty(&head->chain)) continue; @@ -288,15 +287,13 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) TCPF_NEW_SYN_RECV)) continue; - if (sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count)) + if (refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (unlikely(sk->sk_family != family || - refcount_read(&sock_net(sk)->ns.count))) { + if (refcount_read(&sock_net(sk)->ns.count)) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a22ee58387518..1e0a9762f92e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3501,7 +3501,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - tcp_twsk_purge(net_exit_list, AF_INET); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f0761f060a837..5b21a07ddf9aa 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -388,7 +388,7 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); -void tcp_twsk_purge(struct list_head *net_exit_list, int family) +void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; @@ -396,9 +396,9 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ - inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { - inet_twsk_purge(&tcp_hashinfo, family); + inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3f4cba49e9ee6..5ae74f661d25f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2389,15 +2389,9 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } -static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) -{ - tcp_twsk_purge(net_exit_list, AF_INET6); -} - static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, - .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) From d81685bf93c0ef8514f4463398d1f5db46b2a73a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:18 +0000 Subject: [PATCH 140/167] net: move kick_defer_list_purge() to net/core/dev.h kick_defer_list_purge() is defined in net/core/dev.c and used from net/core/skubff.c Because we need softnet_data, include from net/core/dev.h Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/linux/netdevice.h | 1 - net/core/dev.h | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 533293435e7bb..f6c0d731fa350 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3287,7 +3287,6 @@ static inline void dev_xmit_recursion_dec(void) __this_cpu_dec(softnet_data.xmit.recursion); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); diff --git a/net/core/dev.h b/net/core/dev.h index 2bcaf8eee50c1..9d0f8b4587f81 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -4,11 +4,9 @@ #include #include +#include struct net; -struct net_device; -struct netdev_bpf; -struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; @@ -150,4 +148,6 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif struct napi_struct *napi_by_id(unsigned int napi_id); +void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); + #endif From f1a92da3edb250b070f35c051ad6d2f967a9cc11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:19 +0000 Subject: [PATCH 141/167] net: move dev_xmit_recursion() helpers to net/core/dev.h Move dev_xmit_recursion() and friends to net/core/dev.h They are only used from net/core/dev.c and net/core/filter.c. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/linux/netdevice.h | 17 ----------------- net/core/dev.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f6c0d731fa350..d83e2f09273af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3270,23 +3270,6 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 8 -static inline bool dev_xmit_recursion(void) -{ - return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > - XMIT_RECURSION_LIMIT); -} - -static inline void dev_xmit_recursion_inc(void) -{ - __this_cpu_inc(softnet_data.xmit.recursion); -} - -static inline void dev_xmit_recursion_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.recursion); -} - void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); diff --git a/net/core/dev.h b/net/core/dev.h index 9d0f8b4587f81..8572d2c8dc4ad 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -150,4 +150,21 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { } struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +#define XMIT_RECURSION_LIMIT 8 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + #endif From 18bbbda7d0957215683894c9b0ce1a80b318d474 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:20 +0000 Subject: [PATCH 142/167] net: enqueue_to_backlog() change vs not running device If the device attached to the packet given to enqueue_to_backlog() is not running, we drop the packet. But we accidentally increase sd->dropped, giving false signals to admins: sd->dropped should be reserved to cpu backlog pressure, not to temporary glitches at device dismantles. While we are at it, perform the netif_running() test before we get the rps lock, and use REASON_DEV_READY drop reason instead of NOT_SPECIFIED. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- net/core/dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index bfd0eb4750996..5cbf7500a4a25 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4801,12 +4801,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned long flags; unsigned int qlen; - reason = SKB_DROP_REASON_NOT_SPECIFIED; + reason = SKB_DROP_REASON_DEV_READY; + if (!netif_running(skb->dev)) + goto bad_dev; + sd = &per_cpu(softnet_data, cpu); backlog_lock_irq_save(sd, &flags); - if (!netif_running(skb->dev)) - goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= READ_ONCE(net_hotdata.max_backlog) && !skb_flow_limit(skb, qlen)) { @@ -4827,10 +4828,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, } reason = SKB_DROP_REASON_CPU_BACKLOG; -drop: sd->dropped++; backlog_unlock_irq_restore(sd, &flags); +bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; From 4e4c48f791868afdbd27d3455df2505af8bf1f3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:21 +0000 Subject: [PATCH 143/167] net: make softnet_data.dropped an atomic_t If under extreme cpu backlog pressure enqueue_to_backlog() has to drop a packet, it could do this without dirtying a cache line and potentially slowing down the target cpu. Move sd->dropped into a separate cache line, and make it atomic. In non pressure mode, this field is not touched, no need to consume valuable space in a hot cache line. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 13 +++++++++---- net/core/net-procfs.c | 3 ++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d83e2f09273af..d5a2ae6498bdf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3236,10 +3236,11 @@ struct softnet_data { unsigned int input_queue_tail; #endif unsigned int received_rps; - unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + atomic_t dropped ____cacheline_aligned_in_smp; + /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; diff --git a/net/core/dev.c b/net/core/dev.c index 5cbf7500a4a25..093ba25d79ab2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4800,17 +4800,22 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, struct softnet_data *sd; unsigned long flags; unsigned int qlen; + int max_backlog; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) goto bad_dev; + reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); + qlen = skb_queue_len_lockless(&sd->input_pkt_queue); + max_backlog = READ_ONCE(net_hotdata.max_backlog); + if (unlikely(qlen > max_backlog)) + goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= READ_ONCE(net_hotdata.max_backlog) && - !skb_flow_limit(skb, qlen)) { + if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -4826,11 +4831,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, napi_schedule_rps(sd); goto enqueue; } - reason = SKB_DROP_REASON_CPU_BACKLOG; - sd->dropped++; backlog_unlock_irq_restore(sd, &flags); +cpu_backlog_drop: + atomic_inc(&sd->dropped); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index a97eceb84e61e..fa6d3969734a6 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, sd->dropped, sd->time_squeeze, 0, + sd->processed, atomic_read(&sd->dropped), + sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ sd->received_rps, flow_limit_count, From dc19769cecd7c26f3f2a4d23c95566f7b82102cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:22 +0000 Subject: [PATCH 144/167] net: enqueue_to_backlog() cleanup We can remove a goto and a label by reversing a condition. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- net/core/dev.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 093ba25d79ab2..11846c1e2807d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4816,20 +4816,18 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { - if (qlen) { -enqueue: - __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); - backlog_unlock_irq_restore(sd, &flags); - return NET_RX_SUCCESS; + if (!qlen) { + /* Schedule NAPI for backlog device. We can use + * non atomic operation as we own the queue lock. + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, + &sd->backlog.state)) + napi_schedule_rps(sd); } - - /* Schedule NAPI for backlog device - * We can use non atomic operation since we own the queue lock - */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) - napi_schedule_rps(sd); - goto enqueue; + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + backlog_unlock_irq_restore(sd, &flags); + return NET_RX_SUCCESS; } backlog_unlock_irq_restore(sd, &flags); From 1da4016abf185d680ccc7fa6fac66b831c3b79a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:23 +0000 Subject: [PATCH 145/167] net: rps: change input_queue_tail_incr_save() input_queue_tail_incr_save() is incrementing the sd queue_tail and save it in the flow last_qtail. Two issues here : - no lock protects the write on last_qtail, we should use appropriate annotations. - We can perform this write after releasing the per-cpu backlog lock, to decrease this lock hold duration (move away the cache line miss) Also move input_queue_head_incr() and rps helpers to include/net/rps.h, while adding rps_ prefix to better reflect their role. v2: Fixed a build issue (Jakub and kernel build bots) Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/linux/netdevice.h | 15 --------------- include/net/rps.h | 23 +++++++++++++++++++++++ net/core/dev.c | 20 ++++++++++++-------- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d5a2ae6498bdf..5a1c05c4f599b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3249,21 +3249,6 @@ struct softnet_data { call_single_data_t defer_csd; }; -static inline void input_queue_head_incr(struct softnet_data *sd) -{ -#ifdef CONFIG_RPS - sd->input_queue_head++; -#endif -} - -static inline void input_queue_tail_incr_save(struct softnet_data *sd, - unsigned int *qtail) -{ -#ifdef CONFIG_RPS - *qtail = ++sd->input_queue_tail; -#endif -} - DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); static inline int dev_recursion_level(void) diff --git a/include/net/rps.h b/include/net/rps.h index 7660243e905b9..10ca25731c1ef 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + return ++sd->input_queue_tail; +#else + return 0; +#endif +} + +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) +{ +#ifdef CONFIG_RPS + WRITE_ONCE(*dest, tail); +#endif +} + +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + sd->input_queue_head++; +#endif +} + #endif /* _NET_RPS_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 11846c1e2807d..786b5086a74db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) { + READ_ONCE(rflow->last_qtail))) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } @@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - - rflow->last_qtail) < + READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; } @@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned long flags; unsigned int qlen; int max_backlog; + u32 tail; reason = SKB_DROP_REASON_DEV_READY; if (!netif_running(skb->dev)) @@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, napi_schedule_rps(sd); } __skb_queue_tail(&sd->input_pkt_queue, skb); - input_queue_tail_incr_save(sd, qtail); + tail = rps_input_queue_tail_incr(sd); backlog_unlock_irq_restore(sd, &flags); + + /* save the tail outside of the critical section */ + rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } @@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work) if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } backlog_unlock_irq_enable(sd); @@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work) if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); } } local_bh_enable(); @@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - input_queue_head_incr(sd); + rps_input_queue_head_incr(sd); if (++work >= quota) return work; @@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu) /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx(skb); - input_queue_head_incr(oldsd); + rps_input_queue_head_incr(oldsd); } return 0; From eacbd4dbd3f1e08fab552ec20a6fe1a6b9a33b28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:24 +0000 Subject: [PATCH 146/167] net: rps: add rps_input_queue_head_add() helper process_backlog() can batch increments of sd->input_queue_head, saving some memory bandwidth. Also add READ_ONCE()/WRITE_ONCE() annotations around sd->input_queue_head accesses. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/net/rps.h | 9 +++++++-- net/core/dev.c | 13 ++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/rps.h b/include/net/rps.h index 10ca25731c1ef..a93401d23d66e 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -138,11 +138,16 @@ static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) #endif } -static inline void rps_input_queue_head_incr(struct softnet_data *sd) +static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) { #ifdef CONFIG_RPS - sd->input_queue_head++; + WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); #endif } +static inline void rps_input_queue_head_incr(struct softnet_data *sd) +{ + rps_input_queue_head_add(sd, 1); +} + #endif /* _NET_RPS_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 786b5086a74db..5a5b5683358a2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4528,7 +4528,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, out: #endif rflow->last_qtail = - per_cpu(softnet_data, next_cpu).input_queue_head; + READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head); } rflow->cpu = next_cpu; @@ -4610,7 +4610,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || - ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - READ_ONCE(rflow->last_qtail))) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); @@ -4665,7 +4665,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && - ((int)(per_cpu(softnet_data, cpu).input_queue_head - + ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < (int)(10 * flow_table->mask))) expire = false; @@ -6045,9 +6045,10 @@ static int process_backlog(struct napi_struct *napi, int quota) rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); - rps_input_queue_head_incr(sd); - if (++work >= quota) + if (++work >= quota) { + rps_input_queue_head_add(sd, work); return work; + } } @@ -6070,6 +6071,8 @@ static int process_backlog(struct napi_struct *napi, int quota) backlog_unlock_irq_enable(sd); } + if (work) + rps_input_queue_head_add(sd, work); return work; } From c43574ca8abad59e2d4c0162cb0b93aa79dd4a58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 15:42:25 +0000 Subject: [PATCH 147/167] net: rps: move received_rps field to a better location Commit 14d898f3c1b3 ("dev: Move received_rps counter next to RPS members in softnet data") was unfortunate: received_rps is dirtied by a cpu and never read by other cpus in fast path. Its presence in the hot RPS cache line (shared by many cpus) is hurting RPS/RFS performance. Signed-off-by: Eric Dumazet Signed-off-by: NipaLocal --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a1c05c4f599b..890dbbd3c11ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3203,6 +3203,7 @@ struct softnet_data { struct softnet_data *rps_ipi_list; #endif + unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; @@ -3235,7 +3236,6 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif - unsigned int received_rps; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; From 9294b6c6aa1a2eb869e0a4fa3d49791986d29855 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 09:05:59 -0700 Subject: [PATCH 148/167] selftests: reuseaddr_conflict: add missing new line at the end of the output The netdev CI runs in a VM and captures serial, so stdout and stderr get combined. Because there's a missing new line in stderr the test ends up corrupting KTAP: # Successok 1 selftests: net: reuseaddr_conflict which should have been: # Success ok 1 selftests: net: reuseaddr_conflict Fixes: 422d8dc6fd3a ("selftest: add a reuseaddr test") Signed-off-by: Jakub Kicinski Reviewed-by: Muhammad Usama Anjum Signed-off-by: NipaLocal --- tools/testing/selftests/net/reuseaddr_conflict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c index 7c5b12664b03b..bfb07dc495186 100644 --- a/tools/testing/selftests/net/reuseaddr_conflict.c +++ b/tools/testing/selftests/net/reuseaddr_conflict.c @@ -109,6 +109,6 @@ int main(void) fd1 = open_port(0, 1); if (fd1 >= 0) error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); - fprintf(stderr, "Success"); + fprintf(stderr, "Success\n"); return 0; } From 6cbf1a1107e9d712d708a9656762a77c2232a5cd Mon Sep 17 00:00:00 2001 From: Marco Pinna Date: Fri, 29 Mar 2024 17:12:59 +0100 Subject: [PATCH 149/167] vsock/virtio: fix packet delivery to tap device Commit 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") added virtio_transport_deliver_tap_pkt() for handing packets to the vsockmon device. However, in virtio_transport_send_pkt_work(), the function is called before actually sending the packet (i.e. before placing it in the virtqueue with virtqueue_add_sgs() and checking whether it returned successfully). Queuing the packet in the virtqueue can fail even multiple times. However, in virtio_transport_deliver_tap_pkt() we deliver the packet to the monitoring tap interface only the first time we call it. This certainly avoids seeing the same packet replicated multiple times in the monitoring interface, but it can show the packet sent with the wrong timestamp or even before we succeed to queue it in the virtqueue. Move virtio_transport_deliver_tap_pkt() after calling virtqueue_add_sgs() and making sure it returned successfully. Fixes: 82dfb540aeb2 ("VSOCK: Add virtio vsock vsockmon hooks") Cc: stable@vge.kernel.org Signed-off-by: Marco Pinna Reviewed-by: Stefano Garzarella Signed-off-by: NipaLocal --- net/vmw_vsock/virtio_transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 1748268e0694f..ee5d306a96d0f 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -120,7 +120,6 @@ virtio_transport_send_pkt_work(struct work_struct *work) if (!skb) break; - virtio_transport_deliver_tap_pkt(skb); reply = virtio_vsock_skb_reply(skb); sgs = vsock->out_sgs; sg_init_one(sgs[out_sg], virtio_vsock_hdr(skb), @@ -170,6 +169,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) break; } + virtio_transport_deliver_tap_pkt(skb); + if (reply) { struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; int val; From 31677c1dbb24923258c8e7c0600e607613d48cdd Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 29 Mar 2024 17:55:06 +0100 Subject: [PATCH 150/167] page_pool: check for PP direct cache locality later Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check whether it's safe to use direct recycling, we can use both globally for each page instead of relying solely on @allow_direct argument. Let's assume that @allow_direct means "I'm sure it's local, don't waste time rechecking this" and when it's false, try the mentioned params to still recycle the page directly. If neither is true, we'll lose some CPU cycles, but then it surely won't be hotpath. On the other hand, paths where it's possible to use direct cache, but not possible to safely set @allow_direct, will benefit from this move. The whole propagation of @napi_safe through a dozen of skb freeing functions can now go away, which saves us some stack space. Signed-off-by: Alexander Lobakin Signed-off-by: NipaLocal --- include/linux/skbuff.h | 12 ++++---- net/core/page_pool.c | 31 +++++++++++++++++-- net/core/skbuff.c | 70 +++++++++++++----------------------------- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 5 files changed, 58 insertions(+), 59 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b7f1ecdaec382..03ea36a82cdd7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3510,25 +3510,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); -bool napi_pp_put_page(struct page *page, bool napi_safe); +bool napi_pp_put_page(struct page *page); static inline void -skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +skb_page_unref(const struct sk_buff *skb, struct page *page) { #ifdef CONFIG_PAGE_POOL - if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + if (skb->pp_recycle && napi_pp_put_page(page)) return; #endif put_page(page); } static inline void -napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) +napi_frag_unref(skb_frag_t *frag, bool recycle) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page, napi_safe)) + if (recycle && napi_pp_put_page(page)) return; #endif put_page(page); @@ -3544,7 +3544,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - napi_frag_unref(frag, recycle, false); + napi_frag_unref(frag, recycle); } /** diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c006..9d56257e444be 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) + if (allow_direct && page_pool_recycle_in_cache(page, pool)) return NULL; /* Page found as candidate for recycling */ @@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } +static bool page_pool_napi_local(const struct page_pool *pool) +{ + const struct napi_struct *napi; + u32 cpuid; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ @@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), static void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. - * Paired with READ_ONCE() in napi_pp_put_page(). + * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a1be84be5d35a..2a5ce6667bbb9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1004,11 +1004,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) -bool napi_pp_put_page(struct page *page, bool napi_safe) +bool napi_pp_put_page(struct page *page) { - bool allow_direct = false; - struct page_pool *pp; - page = compound_head(page); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation @@ -1021,39 +1018,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) if (unlikely(!is_pp_page(page))) return false; - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - * __page_pool_put_page() makes sure we're not in hardirq context - * and interrupts are enabled prior to accessing the cache. - */ - if (napi_safe || in_softirq()) { - const struct napi_struct *napi = READ_ONCE(pp->p.napi); - unsigned int cpuid = smp_processor_id(); - - allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; - allow_direct |= READ_ONCE(pp->cpuid) == cpuid; - } - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); + page_pool_put_full_page(page->pp, page, false); return true; } EXPORT_SYMBOL(napi_pp_put_page); #endif -static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) +static bool skb_pp_recycle(struct sk_buff *skb, void *data) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return napi_pp_put_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data)); } /** @@ -1095,12 +1071,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset) kfree(head); } -static void skb_free_head(struct sk_buff *skb, bool napi_safe) +static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; if (skb->head_frag) { - if (skb_pp_recycle(skb, head, napi_safe)) + if (skb_pp_recycle(skb, head)) return; skb_free_frag(head); } else { @@ -1108,8 +1084,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe) } } -static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -1126,13 +1101,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, } for (i = 0; i < shinfo->nr_frags; i++) - napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); + napi_frag_unref(&shinfo->frags[i], skb->pp_recycle); free_head: if (shinfo->frag_list) kfree_skb_list_reason(shinfo->frag_list, reason); - skb_free_head(skb, napi_safe); + skb_free_head(skb); exit: /* When we clone an SKB we copy the reycling bit. The pp_recycle * bit is only set on the head though, so in order to avoid races @@ -1193,12 +1168,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, - bool napi_safe) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb, reason, napi_safe); + skb_release_data(skb, reason); } /** @@ -1212,7 +1186,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1269,7 +1243,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb, return; } - skb_release_all(skb, reason, false); + skb_release_all(skb, reason); sa->skb_array[sa->skb_count++] = skb; if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { @@ -1443,7 +1417,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb, __builtin_return_address(0)); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1470,7 +1444,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) { - skb_release_all(skb, reason, true); + skb_release_all(skb, reason); napi_skb_cache_put(skb); } @@ -1508,7 +1482,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb, SKB_CONSUMED, !!budget); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1639,7 +1613,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst, SKB_CONSUMED, false); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -2271,9 +2245,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { - skb_free_head(skb, false); + skb_free_head(skb); } off = (data + nhead) - skb->head; @@ -6574,12 +6548,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values */ - skb_free_head(skb, false); + skb_free_head(skb); } skb->head = data; @@ -6714,7 +6688,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data, size); return -ENOMEM; } - skb_release_data(skb, SKB_CONSUMED, false); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d33d124218140..3d647c9a7a21e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(skb, sg_page(sg)); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 7371886d4f9f4..fe8d53f5a5eec 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(skb, sg_page(sg), false); + skb_page_unref(skb, sg_page(sg)); } #ifdef CONFIG_INET6_ESPINTCP From d1595ac142373ba9197e979bea0b8a03329d3d9e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 29 Mar 2024 17:55:07 +0100 Subject: [PATCH 151/167] page_pool: try direct bulk recycling Now that the checks for direct recycling possibility live inside the Page Pool core, reuse them when performing bulk recycling. page_pool_put_page_bulk() can be called from process context as well, page_pool_napi_local() takes care of this at the very beginning. Under high .ndo_xdp_xmit() traffic load, the win is 2-3% Pps assuming the sending driver uses xdp_return_frame_bulk() on Tx completion. Signed-off-by: Alexander Lobakin Signed-off-by: NipaLocal --- net/core/page_pool.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9d56257e444be..4c175091fc0ab 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -772,8 +772,11 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { int i, bulk_len = 0; + bool allow_direct; bool in_softirq; + allow_direct = page_pool_napi_local(pool); + for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); @@ -781,13 +784,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, if (!page_pool_is_last_ref(page)) continue; - page = __page_pool_put_page(pool, page, -1, false); + page = __page_pool_put_page(pool, page, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (page) data[bulk_len++] = page; } - if (unlikely(!bulk_len)) + if (!bulk_len) return; /* Bulk producer into ptr_ring page_pool cache */ From 567dd94a0e63fc3331129036cb105796ad117f09 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 29 Mar 2024 10:16:41 -0700 Subject: [PATCH 152/167] virtio_net: Do not send RSS key if it is not supported There is a bug when setting the RSS options in virtio_net that can break the whole machine, getting the kernel into an infinite loop. Running the following command in any QEMU virtual machine with virtionet will reproduce this problem: # ethtool -X eth0 hfunc toeplitz This is how the problem happens: 1) ethtool_set_rxfh() calls virtnet_set_rxfh() 2) virtnet_set_rxfh() calls virtnet_commit_rss_command() 3) virtnet_commit_rss_command() populates 4 entries for the rss scatter-gather 4) Since the command above does not have a key, then the last scatter-gatter entry will be zeroed, since rss_key_size == 0. sg_buf_size = vi->rss_key_size; 5) This buffer is passed to qemu, but qemu is not happy with a buffer with zero length, and do the following in virtqueue_map_desc() (QEMU function): if (!sz) { virtio_error(vdev, "virtio: zero sized buffers are not allowed"); 6) virtio_error() (also QEMU function) set the device as broken vdev->broken = true; 7) Qemu bails out, and do not repond this crazy kernel. 8) The kernel is waiting for the response to come back (function virtnet_send_command()) 9) The kernel is waiting doing the following : while (!virtqueue_get_buf(vi->cvq, &tmp) && !virtqueue_is_broken(vi->cvq)) cpu_relax(); 10) None of the following functions above is true, thus, the kernel loops here forever. Keeping in mind that virtqueue_is_broken() does not look at the qemu `vdev->broken`, so, it never realizes that the vitio is broken at QEMU side. Fix it by not sending RSS commands if the feature is not available in the device. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Cc: stable@vger.kernel.org Cc: qemu-devel@nongnu.org Signed-off-by: Breno Leitao Signed-off-by: NipaLocal --- drivers/net/virtio_net.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index c22d1118a1333..c4a21ec51adf1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3807,6 +3807,7 @@ static int virtnet_set_rxfh(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); + bool update = false; int i; if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && @@ -3814,13 +3815,24 @@ static int virtnet_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; if (rxfh->indir) { + if (!vi->has_rss) + return -EOPNOTSUPP; + for (i = 0; i < vi->rss_indir_table_size; ++i) vi->ctrl->rss.indirection_table[i] = rxfh->indir[i]; + update = true; } - if (rxfh->key) + + if (rxfh->key) { + if (!vi->has_rss && !vi->has_rss_hash_report) + return -EOPNOTSUPP; + memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size); + update = true; + } - virtnet_commit_rss_command(vi); + if (update) + virtnet_commit_rss_command(vi); return 0; } @@ -4729,13 +4741,15 @@ static int virtnet_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) vi->has_rss_hash_report = true; - if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) + if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) { vi->has_rss = true; - if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_indir_table_size = virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); + } + + if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); From c511f3695885a83314a0d9284d3a147dc1f84824 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 29 Mar 2024 10:56:24 -0700 Subject: [PATCH 153/167] igb: simplify pci ops declaration The igb driver was pre-declaring tons of functions just so that it could have an early declaration of the pci_driver struct. Delete a bunch of the declarations and move the struct to the bottom of the file, after all the functions are declared. Reviewed-by: Alan Brady Signed-off-by: Jesse Brandeburg Reviewed-by: Maciej Fijalkowski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: NipaLocal --- drivers/net/ethernet/intel/igb/igb_main.c | 53 ++++++++++------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a3f100769e399..89604c9d8e493 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -106,8 +106,6 @@ static int igb_setup_all_rx_resources(struct igb_adapter *); static void igb_free_all_tx_resources(struct igb_adapter *); static void igb_free_all_rx_resources(struct igb_adapter *); static void igb_setup_mrqc(struct igb_adapter *); -static int igb_probe(struct pci_dev *, const struct pci_device_id *); -static void igb_remove(struct pci_dev *pdev); static void igb_init_queue_configuration(struct igb_adapter *adapter); static int igb_sw_init(struct igb_adapter *); int igb_open(struct net_device *); @@ -178,20 +176,6 @@ static int igb_vf_configure(struct igb_adapter *adapter, int vf); static int igb_disable_sriov(struct pci_dev *dev, bool reinit); #endif -static int igb_suspend(struct device *); -static int igb_resume(struct device *); -static int igb_runtime_suspend(struct device *dev); -static int igb_runtime_resume(struct device *dev); -static int igb_runtime_idle(struct device *dev); -#ifdef CONFIG_PM -static const struct dev_pm_ops igb_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(igb_suspend, igb_resume) - SET_RUNTIME_PM_OPS(igb_runtime_suspend, igb_runtime_resume, - igb_runtime_idle) -}; -#endif -static void igb_shutdown(struct pci_dev *); -static int igb_pci_sriov_configure(struct pci_dev *dev, int num_vfs); #ifdef CONFIG_IGB_DCA static int igb_notify_dca(struct notifier_block *, unsigned long, void *); static struct notifier_block dca_notifier = { @@ -219,19 +203,6 @@ static const struct pci_error_handlers igb_err_handler = { static void igb_init_dmac(struct igb_adapter *adapter, u32 pba); -static struct pci_driver igb_driver = { - .name = igb_driver_name, - .id_table = igb_pci_tbl, - .probe = igb_probe, - .remove = igb_remove, -#ifdef CONFIG_PM - .driver.pm = &igb_pm_ops, -#endif - .shutdown = igb_shutdown, - .sriov_configure = igb_pci_sriov_configure, - .err_handler = &igb_err_handler -}; - MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Gigabit Ethernet Network Driver"); MODULE_LICENSE("GPL v2"); @@ -647,6 +618,8 @@ struct net_device *igb_get_hw_dev(struct e1000_hw *hw) return adapter->netdev; } +static struct pci_driver igb_driver; + /** * igb_init_module - Driver Registration Routine * @@ -10157,4 +10130,26 @@ static void igb_nfc_filter_restore(struct igb_adapter *adapter) spin_unlock(&adapter->nfc_lock); } + +#ifdef CONFIG_PM +static const struct dev_pm_ops igb_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(igb_suspend, igb_resume) + SET_RUNTIME_PM_OPS(igb_runtime_suspend, igb_runtime_resume, + igb_runtime_idle) +}; +#endif + +static struct pci_driver igb_driver = { + .name = igb_driver_name, + .id_table = igb_pci_tbl, + .probe = igb_probe, + .remove = igb_remove, +#ifdef CONFIG_PM + .driver.pm = &igb_pm_ops, +#endif + .shutdown = igb_shutdown, + .sriov_configure = igb_pci_sriov_configure, + .err_handler = &igb_err_handler +}; + /* igb_main.c */ From 8b867e3f261f7021ffd606f8768367e98894e6e7 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 29 Mar 2024 10:56:25 -0700 Subject: [PATCH 154/167] net: intel: implement modern PM ops declarations Switch the Intel networking drivers to use the new power management ops declaration formats and macros, which allows us to drop __maybe_unused, as well as a bunch of ifdef checking CONFIG_PM. This is safe to do because the compiler drops the unused functions, verified by checking for any of the power management function symbols being present in System.map for a build without CONFIG_PM. If a driver has runtime PM, define the ops with pm_ptr(), and if the driver has Simple PM, use pm_sleep_ptr(), as well as the new versions of the macros for declaring the members of the pm_ops structs. Checked with network-enabled allnoconfig, allyesconfig, allmodconfig on x64_64. Reviewed-by: Alan Brady Signed-off-by: Jesse Brandeburg Reviewed-by: Maciej Fijalkowski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: NipaLocal --- drivers/net/ethernet/intel/e100.c | 8 +++--- drivers/net/ethernet/intel/e1000/e1000_main.c | 14 +++++----- drivers/net/ethernet/intel/e1000e/netdev.c | 22 +++++++--------- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 10 +++---- drivers/net/ethernet/intel/i40e/i40e_main.c | 10 +++---- drivers/net/ethernet/intel/iavf/iavf_main.c | 8 +++--- drivers/net/ethernet/intel/ice/ice_main.c | 12 +++------ drivers/net/ethernet/intel/igb/igb_main.c | 26 +++++++------------ drivers/net/ethernet/intel/igbvf/netdev.c | 6 ++--- drivers/net/ethernet/intel/igc/igc_main.c | 24 ++++++----------- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 +++--- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 8 +++--- 12 files changed, 64 insertions(+), 92 deletions(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 3fcb8daaa2437..9b068d40778db 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -3037,7 +3037,7 @@ static int __e100_power_off(struct pci_dev *pdev, bool wake) return 0; } -static int __maybe_unused e100_suspend(struct device *dev_d) +static int e100_suspend(struct device *dev_d) { bool wake; @@ -3046,7 +3046,7 @@ static int __maybe_unused e100_suspend(struct device *dev_d) return 0; } -static int __maybe_unused e100_resume(struct device *dev_d) +static int e100_resume(struct device *dev_d) { struct net_device *netdev = dev_get_drvdata(dev_d); struct nic *nic = netdev_priv(netdev); @@ -3163,7 +3163,7 @@ static const struct pci_error_handlers e100_err_handler = { .resume = e100_io_resume, }; -static SIMPLE_DEV_PM_OPS(e100_pm_ops, e100_suspend, e100_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(e100_pm_ops, e100_suspend, e100_resume); static struct pci_driver e100_driver = { .name = DRV_NAME, @@ -3172,7 +3172,7 @@ static struct pci_driver e100_driver = { .remove = e100_remove, /* Power Management hooks */ - .driver.pm = &e100_pm_ops, + .driver.pm = pm_sleep_ptr(&e100_pm_ops), .shutdown = e100_shutdown, .err_handler = &e100_err_handler, diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 1d1e93686af2b..5b43f9b194fc2 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -149,8 +149,8 @@ static int e1000_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid); static void e1000_restore_vlan(struct e1000_adapter *adapter); -static int __maybe_unused e1000_suspend(struct device *dev); -static int __maybe_unused e1000_resume(struct device *dev); +static int e1000_suspend(struct device *dev); +static int e1000_resume(struct device *dev); static void e1000_shutdown(struct pci_dev *pdev); #ifdef CONFIG_NET_POLL_CONTROLLER @@ -175,16 +175,14 @@ static const struct pci_error_handlers e1000_err_handler = { .resume = e1000_io_resume, }; -static SIMPLE_DEV_PM_OPS(e1000_pm_ops, e1000_suspend, e1000_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(e1000_pm_ops, e1000_suspend, e1000_resume); static struct pci_driver e1000_driver = { .name = e1000_driver_name, .id_table = e1000_pci_tbl, .probe = e1000_probe, .remove = e1000_remove, - .driver = { - .pm = &e1000_pm_ops, - }, + .driver.pm = pm_sleep_ptr(&e1000_pm_ops), .shutdown = e1000_shutdown, .err_handler = &e1000_err_handler }; @@ -5135,7 +5133,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake) return 0; } -static int __maybe_unused e1000_suspend(struct device *dev) +static int e1000_suspend(struct device *dev) { int retval; struct pci_dev *pdev = to_pci_dev(dev); @@ -5147,7 +5145,7 @@ static int __maybe_unused e1000_suspend(struct device *dev) return retval; } -static int __maybe_unused e1000_resume(struct device *dev) +static int e1000_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 3692fce201959..bef65ee4c549a 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6968,13 +6968,13 @@ static int __e1000_resume(struct pci_dev *pdev) return 0; } -static __maybe_unused int e1000e_pm_prepare(struct device *dev) +static int e1000e_pm_prepare(struct device *dev) { return pm_runtime_suspended(dev) && pm_suspend_via_firmware(); } -static __maybe_unused int e1000e_pm_suspend(struct device *dev) +static int e1000e_pm_suspend(struct device *dev) { struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); @@ -6997,7 +6997,7 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) return rc; } -static __maybe_unused int e1000e_pm_resume(struct device *dev) +static int e1000e_pm_resume(struct device *dev) { struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); @@ -7031,7 +7031,7 @@ static __maybe_unused int e1000e_pm_runtime_idle(struct device *dev) return -EBUSY; } -static __maybe_unused int e1000e_pm_runtime_resume(struct device *dev) +static int e1000e_pm_runtime_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); @@ -7050,7 +7050,7 @@ static __maybe_unused int e1000e_pm_runtime_resume(struct device *dev) return rc; } -static __maybe_unused int e1000e_pm_runtime_suspend(struct device *dev) +static int e1000e_pm_runtime_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); @@ -7937,8 +7937,7 @@ static const struct pci_device_id e1000_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, e1000_pci_tbl); -static const struct dev_pm_ops e1000_pm_ops = { -#ifdef CONFIG_PM_SLEEP +static const struct dev_pm_ops e1000e_pm_ops = { .prepare = e1000e_pm_prepare, .suspend = e1000e_pm_suspend, .resume = e1000e_pm_resume, @@ -7946,9 +7945,8 @@ static const struct dev_pm_ops e1000_pm_ops = { .thaw = e1000e_pm_thaw, .poweroff = e1000e_pm_suspend, .restore = e1000e_pm_resume, -#endif - SET_RUNTIME_PM_OPS(e1000e_pm_runtime_suspend, e1000e_pm_runtime_resume, - e1000e_pm_runtime_idle) + RUNTIME_PM_OPS(e1000e_pm_runtime_suspend, e1000e_pm_runtime_resume, + e1000e_pm_runtime_idle) }; /* PCI Device API Driver */ @@ -7957,9 +7955,7 @@ static struct pci_driver e1000_driver = { .id_table = e1000_pci_tbl, .probe = e1000_probe, .remove = e1000_remove, - .driver = { - .pm = &e1000_pm_ops, - }, + .driver.pm = pm_ptr(&e1000e_pm_ops), .shutdown = e1000_shutdown, .err_handler = &e1000_err_handler }; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index d748b98274e79..92de609b72189 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -2342,7 +2342,7 @@ static int fm10k_handle_resume(struct fm10k_intfc *interface) * suspend or hibernation. This function does not need to handle lower PCIe * device state as the stack takes care of that for us. **/ -static int __maybe_unused fm10k_resume(struct device *dev) +static int fm10k_resume(struct device *dev) { struct fm10k_intfc *interface = dev_get_drvdata(dev); struct net_device *netdev = interface->netdev; @@ -2369,7 +2369,7 @@ static int __maybe_unused fm10k_resume(struct device *dev) * system suspend or hibernation. This function does not need to handle lower * PCIe device state as the stack takes care of that for us. **/ -static int __maybe_unused fm10k_suspend(struct device *dev) +static int fm10k_suspend(struct device *dev) { struct fm10k_intfc *interface = dev_get_drvdata(dev); struct net_device *netdev = interface->netdev; @@ -2502,16 +2502,14 @@ static const struct pci_error_handlers fm10k_err_handler = { .reset_done = fm10k_io_reset_done, }; -static SIMPLE_DEV_PM_OPS(fm10k_pm_ops, fm10k_suspend, fm10k_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(fm10k_pm_ops, fm10k_suspend, fm10k_resume); static struct pci_driver fm10k_driver = { .name = fm10k_driver_name, .id_table = fm10k_pci_tbl, .probe = fm10k_probe, .remove = fm10k_remove, - .driver = { - .pm = &fm10k_pm_ops, - }, + .driver.pm = pm_sleep_ptr(&fm10k_pm_ops), .sriov_configure = fm10k_iov_configure, .err_handler = &fm10k_err_handler }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index a1134a0157349..0bdcdea0be3ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16514,7 +16514,7 @@ static void i40e_shutdown(struct pci_dev *pdev) * i40e_suspend - PM callback for moving to D3 * @dev: generic device information structure **/ -static int __maybe_unused i40e_suspend(struct device *dev) +static int i40e_suspend(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); struct i40e_hw *hw = &pf->hw; @@ -16565,7 +16565,7 @@ static int __maybe_unused i40e_suspend(struct device *dev) * i40e_resume - PM callback for waking up from D3 * @dev: generic device information structure **/ -static int __maybe_unused i40e_resume(struct device *dev) +static int i40e_resume(struct device *dev) { struct i40e_pf *pf = dev_get_drvdata(dev); int err; @@ -16611,16 +16611,14 @@ static const struct pci_error_handlers i40e_err_handler = { .resume = i40e_pci_error_resume, }; -static SIMPLE_DEV_PM_OPS(i40e_pm_ops, i40e_suspend, i40e_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(i40e_pm_ops, i40e_suspend, i40e_resume); static struct pci_driver i40e_driver = { .name = i40e_driver_name, .id_table = i40e_pci_tbl, .probe = i40e_probe, .remove = i40e_remove, - .driver = { - .pm = &i40e_pm_ops, - }, + .driver.pm = pm_sleep_ptr(&i40e_pm_ops), .shutdown = i40e_shutdown, .err_handler = &i40e_err_handler, .sriov_configure = i40e_pci_sriov_configure, diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index ef2440f3abf8b..13361a780ece5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5023,7 +5023,7 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * * Called when the system (VM) is entering sleep/suspend. **/ -static int __maybe_unused iavf_suspend(struct device *dev_d) +static int iavf_suspend(struct device *dev_d) { struct net_device *netdev = dev_get_drvdata(dev_d); struct iavf_adapter *adapter = netdev_priv(netdev); @@ -5051,7 +5051,7 @@ static int __maybe_unused iavf_suspend(struct device *dev_d) * * Called when the system (VM) is resumed from sleep/suspend. **/ -static int __maybe_unused iavf_resume(struct device *dev_d) +static int iavf_resume(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); struct iavf_adapter *adapter; @@ -5238,14 +5238,14 @@ static void iavf_shutdown(struct pci_dev *pdev) pci_set_power_state(pdev, PCI_D3hot); } -static SIMPLE_DEV_PM_OPS(iavf_pm_ops, iavf_suspend, iavf_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(iavf_pm_ops, iavf_suspend, iavf_resume); static struct pci_driver iavf_driver = { .name = iavf_driver_name, .id_table = iavf_pci_tbl, .probe = iavf_probe, .remove = iavf_remove, - .driver.pm = &iavf_pm_ops, + .driver.pm = pm_sleep_ptr(&iavf_pm_ops), .shutdown = iavf_shutdown, }; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 618570f235804..1c70551482645 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5321,7 +5321,6 @@ static void ice_shutdown(struct pci_dev *pdev) } } -#ifdef CONFIG_PM /** * ice_prepare_for_shutdown - prep for PCI shutdown * @pf: board private structure @@ -5410,7 +5409,7 @@ static int ice_reinit_interrupt_scheme(struct ice_pf *pf) * Power Management callback to quiesce the device and prepare * for D3 transition. */ -static int __maybe_unused ice_suspend(struct device *dev) +static int ice_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct ice_pf *pf; @@ -5477,7 +5476,7 @@ static int __maybe_unused ice_suspend(struct device *dev) * ice_resume - PM callback for waking up from D3 * @dev: generic device information structure */ -static int __maybe_unused ice_resume(struct device *dev) +static int ice_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); enum ice_reset_req reset_type; @@ -5528,7 +5527,6 @@ static int __maybe_unused ice_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM */ /** * ice_pci_err_detected - warning that PCI error has been detected @@ -5702,7 +5700,7 @@ static const struct pci_device_id ice_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, ice_pci_tbl); -static __maybe_unused SIMPLE_DEV_PM_OPS(ice_pm_ops, ice_suspend, ice_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(ice_pm_ops, ice_suspend, ice_resume); static const struct pci_error_handlers ice_pci_err_handler = { .error_detected = ice_pci_err_detected, @@ -5717,9 +5715,7 @@ static struct pci_driver ice_driver = { .id_table = ice_pci_tbl, .probe = ice_probe, .remove = ice_remove, -#ifdef CONFIG_PM - .driver.pm = &ice_pm_ops, -#endif /* CONFIG_PM */ + .driver.pm = pm_sleep_ptr(&ice_pm_ops), .shutdown = ice_shutdown, .sriov_configure = ice_sriov_configure, .sriov_get_vf_total_msix = ice_sriov_get_vf_total_msix, diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 89604c9d8e493..74a998fcaa6f3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -9426,12 +9426,12 @@ static void igb_deliver_wake_packet(struct net_device *netdev) netif_rx(skb); } -static int __maybe_unused igb_suspend(struct device *dev) +static int igb_suspend(struct device *dev) { return __igb_shutdown(to_pci_dev(dev), NULL, 0); } -static int __maybe_unused __igb_resume(struct device *dev, bool rpm) +static int __igb_resume(struct device *dev, bool rpm) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); @@ -9487,12 +9487,12 @@ static int __maybe_unused __igb_resume(struct device *dev, bool rpm) return err; } -static int __maybe_unused igb_resume(struct device *dev) +static int igb_resume(struct device *dev) { return __igb_resume(dev, false); } -static int __maybe_unused igb_runtime_idle(struct device *dev) +static int igb_runtime_idle(struct device *dev) { struct net_device *netdev = dev_get_drvdata(dev); struct igb_adapter *adapter = netdev_priv(netdev); @@ -9503,12 +9503,12 @@ static int __maybe_unused igb_runtime_idle(struct device *dev) return -EBUSY; } -static int __maybe_unused igb_runtime_suspend(struct device *dev) +static int igb_runtime_suspend(struct device *dev) { return __igb_shutdown(to_pci_dev(dev), NULL, 1); } -static int __maybe_unused igb_runtime_resume(struct device *dev) +static int igb_runtime_resume(struct device *dev) { return __igb_resume(dev, true); } @@ -10131,22 +10131,16 @@ static void igb_nfc_filter_restore(struct igb_adapter *adapter) spin_unlock(&adapter->nfc_lock); } -#ifdef CONFIG_PM -static const struct dev_pm_ops igb_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(igb_suspend, igb_resume) - SET_RUNTIME_PM_OPS(igb_runtime_suspend, igb_runtime_resume, - igb_runtime_idle) -}; -#endif +static _DEFINE_DEV_PM_OPS(igb_pm_ops, igb_suspend, igb_resume, + igb_runtime_suspend, igb_runtime_resume, + igb_runtime_idle); static struct pci_driver igb_driver = { .name = igb_driver_name, .id_table = igb_pci_tbl, .probe = igb_probe, .remove = igb_remove, -#ifdef CONFIG_PM - .driver.pm = &igb_pm_ops, -#endif + .driver.pm = pm_ptr(&igb_pm_ops), .shutdown = igb_shutdown, .sriov_configure = igb_pci_sriov_configure, .err_handler = &igb_err_handler diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index b0cf310e6f7bd..40ccd24ffc539 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -2470,7 +2470,7 @@ static int igbvf_suspend(struct device *dev_d) return 0; } -static int __maybe_unused igbvf_resume(struct device *dev_d) +static int igbvf_resume(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); struct net_device *netdev = pci_get_drvdata(pdev); @@ -2957,7 +2957,7 @@ static const struct pci_device_id igbvf_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, igbvf_pci_tbl); -static SIMPLE_DEV_PM_OPS(igbvf_pm_ops, igbvf_suspend, igbvf_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(igbvf_pm_ops, igbvf_suspend, igbvf_resume); /* PCI Device API Driver */ static struct pci_driver igbvf_driver = { @@ -2965,7 +2965,7 @@ static struct pci_driver igbvf_driver = { .id_table = igbvf_pci_tbl, .probe = igbvf_probe, .remove = igbvf_remove, - .driver.pm = &igbvf_pm_ops, + .driver.pm = pm_sleep_ptr(&igbvf_pm_ops), .shutdown = igbvf_shutdown, .err_handler = &igbvf_err_handler }; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 0cd923c8ac9b2..998d8c345a786 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7104,8 +7104,7 @@ static int __igc_shutdown(struct pci_dev *pdev, bool *enable_wake, return 0; } -#ifdef CONFIG_PM -static int __maybe_unused igc_runtime_suspend(struct device *dev) +static int igc_runtime_suspend(struct device *dev) { return __igc_shutdown(to_pci_dev(dev), NULL, 1); } @@ -7140,7 +7139,7 @@ static void igc_deliver_wake_packet(struct net_device *netdev) netif_rx(skb); } -static int __maybe_unused igc_resume(struct device *dev) +static int igc_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct net_device *netdev = pci_get_drvdata(pdev); @@ -7193,12 +7192,12 @@ static int __maybe_unused igc_resume(struct device *dev) return err; } -static int __maybe_unused igc_runtime_resume(struct device *dev) +static int igc_runtime_resume(struct device *dev) { return igc_resume(dev); } -static int __maybe_unused igc_suspend(struct device *dev) +static int igc_suspend(struct device *dev) { return __igc_shutdown(to_pci_dev(dev), NULL, 0); } @@ -7213,7 +7212,6 @@ static int __maybe_unused igc_runtime_idle(struct device *dev) return -EBUSY; } -#endif /* CONFIG_PM */ static void igc_shutdown(struct pci_dev *pdev) { @@ -7328,22 +7326,16 @@ static const struct pci_error_handlers igc_err_handler = { .resume = igc_io_resume, }; -#ifdef CONFIG_PM -static const struct dev_pm_ops igc_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(igc_suspend, igc_resume) - SET_RUNTIME_PM_OPS(igc_runtime_suspend, igc_runtime_resume, - igc_runtime_idle) -}; -#endif +static _DEFINE_DEV_PM_OPS(igc_pm_ops, igc_suspend, igc_resume, + igc_runtime_suspend, igc_runtime_resume, + igc_runtime_idle); static struct pci_driver igc_driver = { .name = igc_driver_name, .id_table = igc_pci_tbl, .probe = igc_probe, .remove = igc_remove, -#ifdef CONFIG_PM - .driver.pm = &igc_pm_ops, -#endif + .driver.pm = pm_ptr(&igc_pm_ops), .shutdown = igc_shutdown, .err_handler = &igc_err_handler, }; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index ed05af665466d..43e7e75ae18c0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6974,7 +6974,7 @@ int ixgbe_close(struct net_device *netdev) return 0; } -static int __maybe_unused ixgbe_resume(struct device *dev_d) +static int ixgbe_resume(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); struct ixgbe_adapter *adapter = pci_get_drvdata(pdev); @@ -7082,7 +7082,7 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool *enable_wake) return 0; } -static int __maybe_unused ixgbe_suspend(struct device *dev_d) +static int ixgbe_suspend(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); int retval; @@ -11583,14 +11583,14 @@ static const struct pci_error_handlers ixgbe_err_handler = { .resume = ixgbe_io_resume, }; -static SIMPLE_DEV_PM_OPS(ixgbe_pm_ops, ixgbe_suspend, ixgbe_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(ixgbe_pm_ops, ixgbe_suspend, ixgbe_resume); static struct pci_driver ixgbe_driver = { .name = ixgbe_driver_name, .id_table = ixgbe_pci_tbl, .probe = ixgbe_probe, .remove = ixgbe_remove, - .driver.pm = &ixgbe_pm_ops, + .driver.pm = pm_sleep_ptr(&ixgbe_pm_ops), .shutdown = ixgbe_shutdown, .sriov_configure = ixgbe_pci_sriov_configure, .err_handler = &ixgbe_err_handler diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 9c960017a6de5..3161a13079fe6 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4300,7 +4300,7 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static int __maybe_unused ixgbevf_suspend(struct device *dev_d) +static int ixgbevf_suspend(struct device *dev_d) { struct net_device *netdev = dev_get_drvdata(dev_d); struct ixgbevf_adapter *adapter = netdev_priv(netdev); @@ -4317,7 +4317,7 @@ static int __maybe_unused ixgbevf_suspend(struct device *dev_d) return 0; } -static int __maybe_unused ixgbevf_resume(struct device *dev_d) +static int ixgbevf_resume(struct device *dev_d) { struct pci_dev *pdev = to_pci_dev(dev_d); struct net_device *netdev = pci_get_drvdata(pdev); @@ -4854,7 +4854,7 @@ static const struct pci_error_handlers ixgbevf_err_handler = { .resume = ixgbevf_io_resume, }; -static SIMPLE_DEV_PM_OPS(ixgbevf_pm_ops, ixgbevf_suspend, ixgbevf_resume); +static DEFINE_SIMPLE_DEV_PM_OPS(ixgbevf_pm_ops, ixgbevf_suspend, ixgbevf_resume); static struct pci_driver ixgbevf_driver = { .name = ixgbevf_driver_name, @@ -4863,7 +4863,7 @@ static struct pci_driver ixgbevf_driver = { .remove = ixgbevf_remove, /* Power Management Hooks */ - .driver.pm = &ixgbevf_pm_ops, + .driver.pm = pm_sleep_ptr(&ixgbevf_pm_ops), .shutdown = ixgbevf_shutdown, .err_handler = &ixgbevf_err_handler From e569b36447fdc8c9b8b984c6dd6506a2dc78fb6b Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Fri, 29 Mar 2024 10:56:26 -0700 Subject: [PATCH 155/167] igc: Refactor runtime power management flow Following the corresponding discussion [1] and [2] refactor the 'igc_open' method and avoid taking the rtnl_lock() during the 'igc_resume' method. The rtnl_lock is held by the upper layer and could lead to the deadlock during resuming from a runtime power management flow. Notify the stack of the actual queue counts 'netif_set_real_num_*_queues' outside the '_igc_open' wrapper. This notification doesn't have to be called on each resume. Test: 1. Disconnect the ethernet cable 2. Enable the runtime power management via file system: echo auto > /sys/devices/pci0000\.../power/control 3. Check the device state (lspci -s -vvv | grep -i Status) Link: https://lore.kernel.org/netdev/20231206113934.8d7819857574.I2deb5804 ef1739a2af307283d320ef7d82456494@changeid/#r [1] Link: https://lore.kernel.org/netdev/20211125074949.5f897431@kicinski-fedo ra-pc1c0hjn.dhcp.thefacebook.com/t/ [2] Signed-off-by: Sasha Neftin Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: NipaLocal --- drivers/net/ethernet/intel/igc/igc_main.c | 32 +++++++++++------------ 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 998d8c345a786..d9bd001af7ba4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -5929,15 +5929,6 @@ static int __igc_open(struct net_device *netdev, bool resuming) if (err) goto err_req_irq; - /* Notify the stack of the actual queue counts. */ - err = netif_set_real_num_tx_queues(netdev, adapter->num_tx_queues); - if (err) - goto err_set_queues; - - err = netif_set_real_num_rx_queues(netdev, adapter->num_rx_queues); - if (err) - goto err_set_queues; - clear_bit(__IGC_DOWN, &adapter->state); for (i = 0; i < adapter->num_q_vectors; i++) @@ -5958,8 +5949,6 @@ static int __igc_open(struct net_device *netdev, bool resuming) return IGC_SUCCESS; -err_set_queues: - igc_free_irq(adapter); err_req_irq: igc_release_hw_control(adapter); igc_power_down_phy_copper_base(&adapter->hw); @@ -5976,6 +5965,17 @@ static int __igc_open(struct net_device *netdev, bool resuming) int igc_open(struct net_device *netdev) { + struct igc_adapter *adapter = netdev_priv(netdev); + int err; + + /* Notify the stack of the actual queue counts. */ + err = netif_set_real_num_queues(netdev, adapter->num_tx_queues, + adapter->num_rx_queues); + if (err) { + netdev_err(netdev, "error setting real queue count\n"); + return err; + } + return __igc_open(netdev, false); } @@ -7181,13 +7181,11 @@ static int igc_resume(struct device *dev) wr32(IGC_WUS, ~0); - rtnl_lock(); - if (!err && netif_running(netdev)) + if (netif_running(netdev)) { err = __igc_open(netdev, true); - - if (!err) - netif_device_attach(netdev); - rtnl_unlock(); + if (!err) + netif_device_attach(netdev); + } return err; } From f7238c5c7dd1e89639a8c68ae451a38feb4cab92 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 29 Mar 2024 10:56:27 -0700 Subject: [PATCH 156/167] i40e: avoid forward declarations in i40e_nvm.c Move code around to get rid of forward declarations. No functional changes. After a plain code juggling, checkpatch reported: total: 0 errors, 7 warnings, 12 checks, 1581 lines checked so while at it let's address old issues as well. Should we ever address the remaining unnecessary forward declarations within drivers/net/ethernet/intel/, consider this change as a starting point/reference. As reported in [0], there would be a lot more of work to do...if we care. [0]: https://lore.kernel.org/intel-wired-lan/Zeh8qadiTGf413YU@boxer/T/#u Signed-off-by: Maciej Fijalkowski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen Signed-off-by: NipaLocal --- drivers/net/ethernet/intel/i40e/i40e_nvm.c | 1050 ++++++++++---------- 1 file changed, 509 insertions(+), 541 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index 605fd82f5d20f..7f0936f4e05e2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -734,37 +734,7 @@ int i40e_validate_nvm_checksum(struct i40e_hw *hw, return ret_code; } -static int i40e_nvmupd_state_init(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_state_reading(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_state_writing(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *errno); -static enum i40e_nvmupd_cmd i40e_nvmupd_validate_command(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - int *perrno); -static int i40e_nvmupd_nvm_erase(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - int *perrno); -static int i40e_nvmupd_nvm_write(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_nvm_read(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_exec_aq(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_get_aq_result(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static int i40e_nvmupd_get_aq_event(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno); -static inline u8 i40e_nvmupd_get_module(u32 val) +static u8 i40e_nvmupd_get_module(u32 val) { return (u8)(val & I40E_NVM_MOD_PNT_MASK); } @@ -799,146 +769,433 @@ static const char * const i40e_nvm_update_state_str[] = { }; /** - * i40e_nvmupd_command - Process an NVM update command + * i40e_nvmupd_validate_command - Validate given command * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command - * @bytes: pointer to the data buffer + * @cmd: pointer to nvm update command buffer * @perrno: pointer to return error code * - * Dispatches command depending on what update state is current + * Return one of the valid command types or I40E_NVMUPD_INVALID **/ -int i40e_nvmupd_command(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) +static enum i40e_nvmupd_cmd +i40e_nvmupd_validate_command(struct i40e_hw *hw, struct i40e_nvm_access *cmd, + int *perrno) { enum i40e_nvmupd_cmd upd_cmd; - int status; - - /* assume success */ - *perrno = 0; + u8 module, transaction; - /* early check for status command and debug msgs */ - upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + /* anything that doesn't match a recognized case is an error */ + upd_cmd = I40E_NVMUPD_INVALID; - i40e_debug(hw, I40E_DEBUG_NVM, "%s state %d nvm_release_on_hold %d opc 0x%04x cmd 0x%08x config 0x%08x offset 0x%08x data_size 0x%08x\n", - i40e_nvm_update_state_str[upd_cmd], - hw->nvmupd_state, - hw->nvm_release_on_done, hw->nvm_wait_opcode, - cmd->command, cmd->config, cmd->offset, cmd->data_size); + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); - if (upd_cmd == I40E_NVMUPD_INVALID) { - *perrno = -EFAULT; + /* limits on data size */ + if (cmd->data_size < 1 || cmd->data_size > I40E_NVMUPD_MAX_DATA) { i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_validate_command returns %d errno %d\n", - upd_cmd, *perrno); + "%s data_size %d\n", __func__, cmd->data_size); + *perrno = -EFAULT; + return I40E_NVMUPD_INVALID; } - /* a status request returns immediately rather than - * going into the state machine - */ - if (upd_cmd == I40E_NVMUPD_STATUS) { - if (!cmd->data_size) { - *perrno = -EFAULT; - return -EINVAL; + switch (cmd->command) { + case I40E_NVM_READ: + switch (transaction) { + case I40E_NVM_CON: + upd_cmd = I40E_NVMUPD_READ_CON; + break; + case I40E_NVM_SNT: + upd_cmd = I40E_NVMUPD_READ_SNT; + break; + case I40E_NVM_LCB: + upd_cmd = I40E_NVMUPD_READ_LCB; + break; + case I40E_NVM_SA: + upd_cmd = I40E_NVMUPD_READ_SA; + break; + case I40E_NVM_EXEC: + if (module == 0xf) + upd_cmd = I40E_NVMUPD_STATUS; + else if (module == 0) + upd_cmd = I40E_NVMUPD_GET_AQ_RESULT; + break; + case I40E_NVM_AQE: + upd_cmd = I40E_NVMUPD_GET_AQ_EVENT; + break; } + break; - bytes[0] = hw->nvmupd_state; - - if (cmd->data_size >= 4) { - bytes[1] = 0; - *((u16 *)&bytes[2]) = hw->nvm_wait_opcode; + case I40E_NVM_WRITE: + switch (transaction) { + case I40E_NVM_CON: + upd_cmd = I40E_NVMUPD_WRITE_CON; + break; + case I40E_NVM_SNT: + upd_cmd = I40E_NVMUPD_WRITE_SNT; + break; + case I40E_NVM_LCB: + upd_cmd = I40E_NVMUPD_WRITE_LCB; + break; + case I40E_NVM_SA: + upd_cmd = I40E_NVMUPD_WRITE_SA; + break; + case I40E_NVM_ERA: + upd_cmd = I40E_NVMUPD_WRITE_ERA; + break; + case I40E_NVM_CSUM: + upd_cmd = I40E_NVMUPD_CSUM_CON; + break; + case (I40E_NVM_CSUM | I40E_NVM_SA): + upd_cmd = I40E_NVMUPD_CSUM_SA; + break; + case (I40E_NVM_CSUM | I40E_NVM_LCB): + upd_cmd = I40E_NVMUPD_CSUM_LCB; + break; + case I40E_NVM_EXEC: + if (module == 0) + upd_cmd = I40E_NVMUPD_EXEC_AQ; + break; } + break; + } - /* Clear error status on read */ - if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) - hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + return upd_cmd; +} - return 0; - } +/** + * i40e_nvmupd_nvm_erase - Erase an NVM module + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @perrno: pointer to return error code + * + * module, offset, data_size and data are in cmd structure + **/ +static int i40e_nvmupd_nvm_erase(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + int *perrno) +{ + struct i40e_asq_cmd_details cmd_details; + u8 module, transaction; + int status = 0; + bool last; - /* Clear status even it is not read and log */ - if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) { + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction & I40E_NVM_LCB); + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + status = i40e_aq_erase_nvm(hw, module, cmd->offset, (u16)cmd->data_size, + last, &cmd_details); + if (status) { i40e_debug(hw, I40E_DEBUG_NVM, - "Clearing I40E_NVMUPD_STATE_ERROR state without reading\n"); - hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + "%s mod 0x%x off 0x%x len 0x%x\n", + __func__, module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "%s status %d aq %d\n", + __func__, status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); } - /* Acquire lock to prevent race condition where adminq_task - * can execute after i40e_nvmupd_nvm_read/write but before state - * variables (nvm_wait_opcode, nvm_release_on_done) are updated. - * - * During NVMUpdate, it is observed that lock could be held for - * ~5ms for most commands. However lock is held for ~60ms for - * NVMUPD_CSUM_LCB command. - */ - mutex_lock(&hw->aq.arq_mutex); - switch (hw->nvmupd_state) { - case I40E_NVMUPD_STATE_INIT: - status = i40e_nvmupd_state_init(hw, cmd, bytes, perrno); - break; - - case I40E_NVMUPD_STATE_READING: - status = i40e_nvmupd_state_reading(hw, cmd, bytes, perrno); - break; + return status; +} - case I40E_NVMUPD_STATE_WRITING: - status = i40e_nvmupd_state_writing(hw, cmd, bytes, perrno); - break; +/** + * i40e_nvmupd_nvm_write - Write NVM + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * module, offset, data_size and data are in cmd structure + **/ +static int i40e_nvmupd_nvm_write(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + struct i40e_asq_cmd_details cmd_details; + u8 module, transaction; + u8 preservation_flags; + int status = 0; + bool last; - case I40E_NVMUPD_STATE_INIT_WAIT: - case I40E_NVMUPD_STATE_WRITE_WAIT: - /* if we need to stop waiting for an event, clear - * the wait info and return before doing anything else - */ - if (cmd->offset == 0xffff) { - i40e_nvmupd_clear_wait_state(hw); - status = 0; - break; - } + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction & I40E_NVM_LCB); + preservation_flags = i40e_nvmupd_get_preservation_flags(cmd->config); - status = -EBUSY; - *perrno = -EBUSY; - break; + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; - default: - /* invalid state, should never happen */ + status = i40e_aq_update_nvm(hw, module, cmd->offset, + (u16)cmd->data_size, bytes, last, + preservation_flags, &cmd_details); + if (status) { i40e_debug(hw, I40E_DEBUG_NVM, - "NVMUPD: no such state %d\n", hw->nvmupd_state); - status = -EOPNOTSUPP; - *perrno = -ESRCH; - break; + "%s mod 0x%x off 0x%x len 0x%x\n", + __func__, module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "%s status %d aq %d\n", + __func__, status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); } - mutex_unlock(&hw->aq.arq_mutex); return status; } /** - * i40e_nvmupd_state_init - Handle NVM update state Init + * i40e_nvmupd_nvm_read - Read NVM * @hw: pointer to hardware structure * @cmd: pointer to nvm update command buffer * @bytes: pointer to the data buffer * @perrno: pointer to return error code * - * Process legitimate commands of the Init state and conditionally set next - * state. Reject all other commands. + * cmd structure contains identifiers and data buffer **/ -static int i40e_nvmupd_state_init(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) +static int i40e_nvmupd_nvm_read(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) { - enum i40e_nvmupd_cmd upd_cmd; - int status = 0; + struct i40e_asq_cmd_details cmd_details; + u8 module, transaction; + int status; + bool last; - upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + transaction = i40e_nvmupd_get_transaction(cmd->config); + module = i40e_nvmupd_get_module(cmd->config); + last = (transaction == I40E_NVM_LCB) || (transaction == I40E_NVM_SA); - switch (upd_cmd) { - case I40E_NVMUPD_READ_SA: - status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); - if (status) { - *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); - } else { + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + status = i40e_aq_read_nvm(hw, module, cmd->offset, (u16)cmd->data_size, + bytes, last, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "%s mod 0x%x off 0x%x len 0x%x\n", + __func__, module, cmd->offset, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, + "%s status %d aq %d\n", + __func__, status, hw->aq.asq_last_status); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + } + + return status; +} + +/** + * i40e_nvmupd_exec_aq - Run an AQ command + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static int i40e_nvmupd_exec_aq(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + struct i40e_asq_cmd_details cmd_details; + struct i40e_aq_desc *aq_desc; + u32 buff_size = 0; + u8 *buff = NULL; + u32 aq_desc_len; + u32 aq_data_len; + int status; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + if (cmd->offset == 0xffff) + return 0; + + memset(&cmd_details, 0, sizeof(cmd_details)); + cmd_details.wb_desc = &hw->nvm_wb_desc; + + aq_desc_len = sizeof(struct i40e_aq_desc); + memset(&hw->nvm_wb_desc, 0, aq_desc_len); + + /* get the aq descriptor */ + if (cmd->data_size < aq_desc_len) { + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: not enough aq desc bytes for exec, size %d < %d\n", + cmd->data_size, aq_desc_len); + *perrno = -EINVAL; + return -EINVAL; + } + aq_desc = (struct i40e_aq_desc *)bytes; + + /* if data buffer needed, make sure it's ready */ + aq_data_len = cmd->data_size - aq_desc_len; + buff_size = max_t(u32, aq_data_len, le16_to_cpu(aq_desc->datalen)); + if (buff_size) { + if (!hw->nvm_buff.va) { + status = i40e_allocate_virt_mem(hw, &hw->nvm_buff, + hw->aq.asq_buf_size); + if (status) + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: i40e_allocate_virt_mem for exec buff failed, %d\n", + status); + } + + if (hw->nvm_buff.va) { + buff = hw->nvm_buff.va; + memcpy(buff, &bytes[aq_desc_len], aq_data_len); + } + } + + if (cmd->offset) + memset(&hw->nvm_aq_event_desc, 0, aq_desc_len); + + /* and away we go! */ + status = i40e_asq_send_command(hw, aq_desc, buff, + buff_size, &cmd_details); + if (status) { + i40e_debug(hw, I40E_DEBUG_NVM, + "%s err %pe aq_err %s\n", + __func__, ERR_PTR(status), + i40e_aq_str(hw, hw->aq.asq_last_status)); + *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + return status; + } + + /* should we wait for a followup event? */ + if (cmd->offset) { + hw->nvm_wait_opcode = cmd->offset; + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + } + + return status; +} + +/** + * i40e_nvmupd_get_aq_result - Get the results from the previous exec_aq + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static int i40e_nvmupd_get_aq_result(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + u32 aq_total_len; + u32 aq_desc_len; + int remainder; + u8 *buff; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + + aq_desc_len = sizeof(struct i40e_aq_desc); + aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_wb_desc.datalen); + + /* check offset range */ + if (cmd->offset > aq_total_len) { + i40e_debug(hw, I40E_DEBUG_NVM, "%s: offset too big %d > %d\n", + __func__, cmd->offset, aq_total_len); + *perrno = -EINVAL; + return -EINVAL; + } + + /* check copylength range */ + if (cmd->data_size > (aq_total_len - cmd->offset)) { + int new_len = aq_total_len - cmd->offset; + + i40e_debug(hw, I40E_DEBUG_NVM, "%s: copy length %d too big, trimming to %d\n", + __func__, cmd->data_size, new_len); + cmd->data_size = new_len; + } + + remainder = cmd->data_size; + if (cmd->offset < aq_desc_len) { + u32 len = aq_desc_len - cmd->offset; + + len = min(len, cmd->data_size); + i40e_debug(hw, I40E_DEBUG_NVM, "%s: aq_desc bytes %d to %d\n", + __func__, cmd->offset, cmd->offset + len); + + buff = ((u8 *)&hw->nvm_wb_desc) + cmd->offset; + memcpy(bytes, buff, len); + + bytes += len; + remainder -= len; + buff = hw->nvm_buff.va; + } else { + buff = hw->nvm_buff.va + (cmd->offset - aq_desc_len); + } + + if (remainder > 0) { + int start_byte = buff - (u8 *)hw->nvm_buff.va; + + i40e_debug(hw, I40E_DEBUG_NVM, "%s: databuf bytes %d to %d\n", + __func__, start_byte, start_byte + remainder); + memcpy(bytes, buff, remainder); + } + + return 0; +} + +/** + * i40e_nvmupd_get_aq_event - Get the Admin Queue event from previous exec_aq + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * cmd structure contains identifiers and data buffer + **/ +static int i40e_nvmupd_get_aq_event(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + u32 aq_total_len; + u32 aq_desc_len; + + i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); + + aq_desc_len = sizeof(struct i40e_aq_desc); + aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_aq_event_desc.datalen); + + /* check copylength range */ + if (cmd->data_size > aq_total_len) { + i40e_debug(hw, I40E_DEBUG_NVM, + "%s: copy length %d too big, trimming to %d\n", + __func__, cmd->data_size, aq_total_len); + cmd->data_size = aq_total_len; + } + + memcpy(bytes, &hw->nvm_aq_event_desc, cmd->data_size); + + return 0; +} + +/** + * i40e_nvmupd_state_init - Handle NVM update state Init + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command buffer + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * Process legitimate commands of the Init state and conditionally set next + * state. Reject all other commands. + **/ +static int i40e_nvmupd_state_init(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) +{ + enum i40e_nvmupd_cmd upd_cmd; + int status = 0; + + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); + + switch (upd_cmd) { + case I40E_NVMUPD_READ_SA: + status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); + if (status) { + *perrno = i40e_aq_rc_to_posix(status, + hw->aq.asq_last_status); + } else { status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); i40e_release_nvm(hw); } @@ -948,7 +1205,7 @@ static int i40e_nvmupd_state_init(struct i40e_hw *hw, status = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); if (status) { *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); + hw->aq.asq_last_status); } else { status = i40e_nvmupd_nvm_read(hw, cmd, bytes, perrno); if (status) @@ -962,7 +1219,7 @@ static int i40e_nvmupd_state_init(struct i40e_hw *hw, status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); if (status) { *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); + hw->aq.asq_last_status); } else { status = i40e_nvmupd_nvm_erase(hw, cmd, perrno); if (status) { @@ -979,7 +1236,7 @@ static int i40e_nvmupd_state_init(struct i40e_hw *hw, status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); if (status) { *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); + hw->aq.asq_last_status); } else { status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); if (status) { @@ -996,7 +1253,7 @@ static int i40e_nvmupd_state_init(struct i40e_hw *hw, status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); if (status) { *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); + hw->aq.asq_last_status); } else { status = i40e_nvmupd_nvm_write(hw, cmd, bytes, perrno); if (status) { @@ -1012,7 +1269,7 @@ static int i40e_nvmupd_state_init(struct i40e_hw *hw, status = i40e_acquire_nvm(hw, I40E_RESOURCE_WRITE); if (status) { *perrno = i40e_aq_rc_to_posix(status, - hw->aq.asq_last_status); + hw->aq.asq_last_status); } else { status = i40e_update_nvm_checksum(hw); if (status) { @@ -1185,7 +1442,7 @@ static int i40e_nvmupd_state_writing(struct i40e_hw *hw, * so here we try to reacquire the semaphore then retry the write. * We only do one retry, then give up. */ - if (status && (hw->aq.asq_last_status == I40E_AQ_RC_EBUSY) && + if (status && hw->aq.asq_last_status == I40E_AQ_RC_EBUSY && !retry_attempt) { u32 old_asq_status = hw->aq.asq_last_status; int old_status = status; @@ -1215,457 +1472,168 @@ static int i40e_nvmupd_state_writing(struct i40e_hw *hw, } /** - * i40e_nvmupd_clear_wait_state - clear wait state on hw - * @hw: pointer to the hardware structure + * i40e_nvmupd_command - Process an NVM update command + * @hw: pointer to hardware structure + * @cmd: pointer to nvm update command + * @bytes: pointer to the data buffer + * @perrno: pointer to return error code + * + * Dispatches command depending on what update state is current **/ -void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw) +int i40e_nvmupd_command(struct i40e_hw *hw, + struct i40e_nvm_access *cmd, + u8 *bytes, int *perrno) { - i40e_debug(hw, I40E_DEBUG_NVM, - "NVMUPD: clearing wait on opcode 0x%04x\n", - hw->nvm_wait_opcode); - - if (hw->nvm_release_on_done) { - i40e_release_nvm(hw); - hw->nvm_release_on_done = false; - } - hw->nvm_wait_opcode = 0; + enum i40e_nvmupd_cmd upd_cmd; + int status; - if (hw->aq.arq_last_status) { - hw->nvmupd_state = I40E_NVMUPD_STATE_ERROR; - return; - } + /* assume success */ + *perrno = 0; - switch (hw->nvmupd_state) { - case I40E_NVMUPD_STATE_INIT_WAIT: - hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; - break; + /* early check for status command and debug msgs */ + upd_cmd = i40e_nvmupd_validate_command(hw, cmd, perrno); - case I40E_NVMUPD_STATE_WRITE_WAIT: - hw->nvmupd_state = I40E_NVMUPD_STATE_WRITING; - break; + i40e_debug(hw, I40E_DEBUG_NVM, "%s state %d nvm_release_on_hold %d opc 0x%04x cmd 0x%08x config 0x%08x offset 0x%08x data_size 0x%08x\n", + i40e_nvm_update_state_str[upd_cmd], + hw->nvmupd_state, + hw->nvm_release_on_done, hw->nvm_wait_opcode, + cmd->command, cmd->config, cmd->offset, cmd->data_size); - default: - break; + if (upd_cmd == I40E_NVMUPD_INVALID) { + *perrno = -EFAULT; + i40e_debug(hw, I40E_DEBUG_NVM, + "i40e_nvmupd_validate_command returns %d errno %d\n", + upd_cmd, *perrno); } -} -/** - * i40e_nvmupd_check_wait_event - handle NVM update operation events - * @hw: pointer to the hardware structure - * @opcode: the event that just happened - * @desc: AdminQ descriptor - **/ -void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode, - struct i40e_aq_desc *desc) -{ - u32 aq_desc_len = sizeof(struct i40e_aq_desc); + /* a status request returns immediately rather than + * going into the state machine + */ + if (upd_cmd == I40E_NVMUPD_STATUS) { + if (!cmd->data_size) { + *perrno = -EFAULT; + return -EINVAL; + } - if (opcode == hw->nvm_wait_opcode) { - memcpy(&hw->nvm_aq_event_desc, desc, aq_desc_len); - i40e_nvmupd_clear_wait_state(hw); - } -} + bytes[0] = hw->nvmupd_state; -/** - * i40e_nvmupd_validate_command - Validate given command - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @perrno: pointer to return error code - * - * Return one of the valid command types or I40E_NVMUPD_INVALID - **/ -static enum i40e_nvmupd_cmd i40e_nvmupd_validate_command(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - int *perrno) -{ - enum i40e_nvmupd_cmd upd_cmd; - u8 module, transaction; + if (cmd->data_size >= 4) { + bytes[1] = 0; + *((u16 *)&bytes[2]) = hw->nvm_wait_opcode; + } - /* anything that doesn't match a recognized case is an error */ - upd_cmd = I40E_NVMUPD_INVALID; + /* Clear error status on read */ + if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; - transaction = i40e_nvmupd_get_transaction(cmd->config); - module = i40e_nvmupd_get_module(cmd->config); + return 0; + } - /* limits on data size */ - if ((cmd->data_size < 1) || - (cmd->data_size > I40E_NVMUPD_MAX_DATA)) { + /* Clear status even it is not read and log */ + if (hw->nvmupd_state == I40E_NVMUPD_STATE_ERROR) { i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_validate_command data_size %d\n", - cmd->data_size); - *perrno = -EFAULT; - return I40E_NVMUPD_INVALID; + "Clearing I40E_NVMUPD_STATE_ERROR state without reading\n"); + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; } - switch (cmd->command) { - case I40E_NVM_READ: - switch (transaction) { - case I40E_NVM_CON: - upd_cmd = I40E_NVMUPD_READ_CON; - break; - case I40E_NVM_SNT: - upd_cmd = I40E_NVMUPD_READ_SNT; - break; - case I40E_NVM_LCB: - upd_cmd = I40E_NVMUPD_READ_LCB; - break; - case I40E_NVM_SA: - upd_cmd = I40E_NVMUPD_READ_SA; - break; - case I40E_NVM_EXEC: - if (module == 0xf) - upd_cmd = I40E_NVMUPD_STATUS; - else if (module == 0) - upd_cmd = I40E_NVMUPD_GET_AQ_RESULT; - break; - case I40E_NVM_AQE: - upd_cmd = I40E_NVMUPD_GET_AQ_EVENT; - break; - } + /* Acquire lock to prevent race condition where adminq_task + * can execute after i40e_nvmupd_nvm_read/write but before state + * variables (nvm_wait_opcode, nvm_release_on_done) are updated. + * + * During NVMUpdate, it is observed that lock could be held for + * ~5ms for most commands. However lock is held for ~60ms for + * NVMUPD_CSUM_LCB command. + */ + mutex_lock(&hw->aq.arq_mutex); + switch (hw->nvmupd_state) { + case I40E_NVMUPD_STATE_INIT: + status = i40e_nvmupd_state_init(hw, cmd, bytes, perrno); break; - case I40E_NVM_WRITE: - switch (transaction) { - case I40E_NVM_CON: - upd_cmd = I40E_NVMUPD_WRITE_CON; - break; - case I40E_NVM_SNT: - upd_cmd = I40E_NVMUPD_WRITE_SNT; - break; - case I40E_NVM_LCB: - upd_cmd = I40E_NVMUPD_WRITE_LCB; - break; - case I40E_NVM_SA: - upd_cmd = I40E_NVMUPD_WRITE_SA; - break; - case I40E_NVM_ERA: - upd_cmd = I40E_NVMUPD_WRITE_ERA; - break; - case I40E_NVM_CSUM: - upd_cmd = I40E_NVMUPD_CSUM_CON; - break; - case (I40E_NVM_CSUM|I40E_NVM_SA): - upd_cmd = I40E_NVMUPD_CSUM_SA; - break; - case (I40E_NVM_CSUM|I40E_NVM_LCB): - upd_cmd = I40E_NVMUPD_CSUM_LCB; - break; - case I40E_NVM_EXEC: - if (module == 0) - upd_cmd = I40E_NVMUPD_EXEC_AQ; - break; - } + case I40E_NVMUPD_STATE_READING: + status = i40e_nvmupd_state_reading(hw, cmd, bytes, perrno); break; - } - - return upd_cmd; -} - -/** - * i40e_nvmupd_exec_aq - Run an AQ command - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @bytes: pointer to the data buffer - * @perrno: pointer to return error code - * - * cmd structure contains identifiers and data buffer - **/ -static int i40e_nvmupd_exec_aq(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) -{ - struct i40e_asq_cmd_details cmd_details; - struct i40e_aq_desc *aq_desc; - u32 buff_size = 0; - u8 *buff = NULL; - u32 aq_desc_len; - u32 aq_data_len; - int status; - - i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); - if (cmd->offset == 0xffff) - return 0; - - memset(&cmd_details, 0, sizeof(cmd_details)); - cmd_details.wb_desc = &hw->nvm_wb_desc; - aq_desc_len = sizeof(struct i40e_aq_desc); - memset(&hw->nvm_wb_desc, 0, aq_desc_len); - - /* get the aq descriptor */ - if (cmd->data_size < aq_desc_len) { - i40e_debug(hw, I40E_DEBUG_NVM, - "NVMUPD: not enough aq desc bytes for exec, size %d < %d\n", - cmd->data_size, aq_desc_len); - *perrno = -EINVAL; - return -EINVAL; - } - aq_desc = (struct i40e_aq_desc *)bytes; - - /* if data buffer needed, make sure it's ready */ - aq_data_len = cmd->data_size - aq_desc_len; - buff_size = max_t(u32, aq_data_len, le16_to_cpu(aq_desc->datalen)); - if (buff_size) { - if (!hw->nvm_buff.va) { - status = i40e_allocate_virt_mem(hw, &hw->nvm_buff, - hw->aq.asq_buf_size); - if (status) - i40e_debug(hw, I40E_DEBUG_NVM, - "NVMUPD: i40e_allocate_virt_mem for exec buff failed, %d\n", - status); - } + case I40E_NVMUPD_STATE_WRITING: + status = i40e_nvmupd_state_writing(hw, cmd, bytes, perrno); + break; - if (hw->nvm_buff.va) { - buff = hw->nvm_buff.va; - memcpy(buff, &bytes[aq_desc_len], aq_data_len); + case I40E_NVMUPD_STATE_INIT_WAIT: + case I40E_NVMUPD_STATE_WRITE_WAIT: + /* if we need to stop waiting for an event, clear + * the wait info and return before doing anything else + */ + if (cmd->offset == 0xffff) { + i40e_nvmupd_clear_wait_state(hw); + status = 0; + break; } - } - if (cmd->offset) - memset(&hw->nvm_aq_event_desc, 0, aq_desc_len); + status = -EBUSY; + *perrno = -EBUSY; + break; - /* and away we go! */ - status = i40e_asq_send_command(hw, aq_desc, buff, - buff_size, &cmd_details); - if (status) { + default: + /* invalid state, should never happen */ i40e_debug(hw, I40E_DEBUG_NVM, - "%s err %pe aq_err %s\n", - __func__, ERR_PTR(status), - i40e_aq_str(hw, hw->aq.asq_last_status)); - *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); - return status; - } - - /* should we wait for a followup event? */ - if (cmd->offset) { - hw->nvm_wait_opcode = cmd->offset; - hw->nvmupd_state = I40E_NVMUPD_STATE_INIT_WAIT; + "NVMUPD: no such state %d\n", hw->nvmupd_state); + status = -EOPNOTSUPP; + *perrno = -ESRCH; + break; } + mutex_unlock(&hw->aq.arq_mutex); return status; } /** - * i40e_nvmupd_get_aq_result - Get the results from the previous exec_aq - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @bytes: pointer to the data buffer - * @perrno: pointer to return error code - * - * cmd structure contains identifiers and data buffer - **/ -static int i40e_nvmupd_get_aq_result(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) -{ - u32 aq_total_len; - u32 aq_desc_len; - int remainder; - u8 *buff; - - i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); - - aq_desc_len = sizeof(struct i40e_aq_desc); - aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_wb_desc.datalen); - - /* check offset range */ - if (cmd->offset > aq_total_len) { - i40e_debug(hw, I40E_DEBUG_NVM, "%s: offset too big %d > %d\n", - __func__, cmd->offset, aq_total_len); - *perrno = -EINVAL; - return -EINVAL; - } - - /* check copylength range */ - if (cmd->data_size > (aq_total_len - cmd->offset)) { - int new_len = aq_total_len - cmd->offset; - - i40e_debug(hw, I40E_DEBUG_NVM, "%s: copy length %d too big, trimming to %d\n", - __func__, cmd->data_size, new_len); - cmd->data_size = new_len; - } - - remainder = cmd->data_size; - if (cmd->offset < aq_desc_len) { - u32 len = aq_desc_len - cmd->offset; - - len = min(len, cmd->data_size); - i40e_debug(hw, I40E_DEBUG_NVM, "%s: aq_desc bytes %d to %d\n", - __func__, cmd->offset, cmd->offset + len); - - buff = ((u8 *)&hw->nvm_wb_desc) + cmd->offset; - memcpy(bytes, buff, len); - - bytes += len; - remainder -= len; - buff = hw->nvm_buff.va; - } else { - buff = hw->nvm_buff.va + (cmd->offset - aq_desc_len); - } - - if (remainder > 0) { - int start_byte = buff - (u8 *)hw->nvm_buff.va; - - i40e_debug(hw, I40E_DEBUG_NVM, "%s: databuf bytes %d to %d\n", - __func__, start_byte, start_byte + remainder); - memcpy(bytes, buff, remainder); - } - - return 0; -} - -/** - * i40e_nvmupd_get_aq_event - Get the Admin Queue event from previous exec_aq - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @bytes: pointer to the data buffer - * @perrno: pointer to return error code - * - * cmd structure contains identifiers and data buffer + * i40e_nvmupd_clear_wait_state - clear wait state on hw + * @hw: pointer to the hardware structure **/ -static int i40e_nvmupd_get_aq_event(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) +void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw) { - u32 aq_total_len; - u32 aq_desc_len; - - i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__); - - aq_desc_len = sizeof(struct i40e_aq_desc); - aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_aq_event_desc.datalen); + i40e_debug(hw, I40E_DEBUG_NVM, + "NVMUPD: clearing wait on opcode 0x%04x\n", + hw->nvm_wait_opcode); - /* check copylength range */ - if (cmd->data_size > aq_total_len) { - i40e_debug(hw, I40E_DEBUG_NVM, - "%s: copy length %d too big, trimming to %d\n", - __func__, cmd->data_size, aq_total_len); - cmd->data_size = aq_total_len; + if (hw->nvm_release_on_done) { + i40e_release_nvm(hw); + hw->nvm_release_on_done = false; } + hw->nvm_wait_opcode = 0; - memcpy(bytes, &hw->nvm_aq_event_desc, cmd->data_size); - - return 0; -} - -/** - * i40e_nvmupd_nvm_read - Read NVM - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @bytes: pointer to the data buffer - * @perrno: pointer to return error code - * - * cmd structure contains identifiers and data buffer - **/ -static int i40e_nvmupd_nvm_read(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) -{ - struct i40e_asq_cmd_details cmd_details; - u8 module, transaction; - int status; - bool last; - - transaction = i40e_nvmupd_get_transaction(cmd->config); - module = i40e_nvmupd_get_module(cmd->config); - last = (transaction == I40E_NVM_LCB) || (transaction == I40E_NVM_SA); - - memset(&cmd_details, 0, sizeof(cmd_details)); - cmd_details.wb_desc = &hw->nvm_wb_desc; - - status = i40e_aq_read_nvm(hw, module, cmd->offset, (u16)cmd->data_size, - bytes, last, &cmd_details); - if (status) { - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_read mod 0x%x off 0x%x len 0x%x\n", - module, cmd->offset, cmd->data_size); - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_read status %d aq %d\n", - status, hw->aq.asq_last_status); - *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + if (hw->aq.arq_last_status) { + hw->nvmupd_state = I40E_NVMUPD_STATE_ERROR; + return; } - return status; -} - -/** - * i40e_nvmupd_nvm_erase - Erase an NVM module - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @perrno: pointer to return error code - * - * module, offset, data_size and data are in cmd structure - **/ -static int i40e_nvmupd_nvm_erase(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - int *perrno) -{ - struct i40e_asq_cmd_details cmd_details; - u8 module, transaction; - int status = 0; - bool last; - - transaction = i40e_nvmupd_get_transaction(cmd->config); - module = i40e_nvmupd_get_module(cmd->config); - last = (transaction & I40E_NVM_LCB); + switch (hw->nvmupd_state) { + case I40E_NVMUPD_STATE_INIT_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_INIT; + break; - memset(&cmd_details, 0, sizeof(cmd_details)); - cmd_details.wb_desc = &hw->nvm_wb_desc; + case I40E_NVMUPD_STATE_WRITE_WAIT: + hw->nvmupd_state = I40E_NVMUPD_STATE_WRITING; + break; - status = i40e_aq_erase_nvm(hw, module, cmd->offset, (u16)cmd->data_size, - last, &cmd_details); - if (status) { - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_erase mod 0x%x off 0x%x len 0x%x\n", - module, cmd->offset, cmd->data_size); - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_erase status %d aq %d\n", - status, hw->aq.asq_last_status); - *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + default: + break; } - - return status; } /** - * i40e_nvmupd_nvm_write - Write NVM - * @hw: pointer to hardware structure - * @cmd: pointer to nvm update command buffer - * @bytes: pointer to the data buffer - * @perrno: pointer to return error code - * - * module, offset, data_size and data are in cmd structure + * i40e_nvmupd_check_wait_event - handle NVM update operation events + * @hw: pointer to the hardware structure + * @opcode: the event that just happened + * @desc: AdminQ descriptor **/ -static int i40e_nvmupd_nvm_write(struct i40e_hw *hw, - struct i40e_nvm_access *cmd, - u8 *bytes, int *perrno) +void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode, + struct i40e_aq_desc *desc) { - struct i40e_asq_cmd_details cmd_details; - u8 module, transaction; - u8 preservation_flags; - int status = 0; - bool last; - - transaction = i40e_nvmupd_get_transaction(cmd->config); - module = i40e_nvmupd_get_module(cmd->config); - last = (transaction & I40E_NVM_LCB); - preservation_flags = i40e_nvmupd_get_preservation_flags(cmd->config); - - memset(&cmd_details, 0, sizeof(cmd_details)); - cmd_details.wb_desc = &hw->nvm_wb_desc; + u32 aq_desc_len = sizeof(struct i40e_aq_desc); - status = i40e_aq_update_nvm(hw, module, cmd->offset, - (u16)cmd->data_size, bytes, last, - preservation_flags, &cmd_details); - if (status) { - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_write mod 0x%x off 0x%x len 0x%x\n", - module, cmd->offset, cmd->data_size); - i40e_debug(hw, I40E_DEBUG_NVM, - "i40e_nvmupd_nvm_write status %d aq %d\n", - status, hw->aq.asq_last_status); - *perrno = i40e_aq_rc_to_posix(status, hw->aq.asq_last_status); + if (opcode == hw->nvm_wait_opcode) { + memcpy(&hw->nvm_aq_event_desc, desc, aq_desc_len); + i40e_nvmupd_clear_wait_state(hw); } - - return status; } From f77a644b5813397881c585d29920621056568b4e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 10:57:08 -0700 Subject: [PATCH 157/167] netlink: create a new header for internal genetlink symbols There are things in linux/genetlink.h which are only used under net/netlink/. Move them to a new local header. A new header with just 2 externs isn't great, but alternative would be to include af_netlink.h in genetlink.c which feels even worse. Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- include/linux/genetlink.h | 5 ----- net/netlink/af_netlink.c | 2 +- net/netlink/genetlink.c | 2 ++ net/netlink/genetlink.h | 11 +++++++++++ 4 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 net/netlink/genetlink.h diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index c285968e437a6..9dbd7ba9b858e 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -4,15 +4,10 @@ #include - /* All generic netlink requests are serialized by a global lock. */ extern void genl_lock(void); extern void genl_unlock(void); -/* for synchronisation between af_netlink and genetlink */ -extern atomic_t genl_sk_destructing_cnt; -extern wait_queue_head_t genl_sk_destructing_waitq; - #define MODULE_ALIAS_GENL_FAMILY(family)\ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7554803218a25..dc8c3c01d51b7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -73,6 +72,7 @@ #include #include "af_netlink.h" +#include "genetlink.h" struct listeners { struct rcu_head rcu; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 3b7666944b11c..feb54c63a1165 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -22,6 +22,8 @@ #include #include +#include "genetlink.h" + static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); diff --git a/net/netlink/genetlink.h b/net/netlink/genetlink.h new file mode 100644 index 0000000000000..89bd9d2631c38 --- /dev/null +++ b/net/netlink/genetlink.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_GENETLINK_H +#define __NET_GENETLINK_H + +#include + +/* for synchronisation between af_netlink and genetlink */ +extern atomic_t genl_sk_destructing_cnt; +extern wait_queue_head_t genl_sk_destructing_waitq; + +#endif /* __LINUX_GENERIC_NETLINK_H */ From e57e6bea1710e53163ed9e5e916efac6df1dc9b5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 10:57:09 -0700 Subject: [PATCH 158/167] net: openvswitch: remove unnecessary linux/genetlink.h include The only legit reason I could think of for net/genetlink.h and linux/genetlink.h to be separate would be if one was included by other headers and we wanted to keep it lightweight. That is not the case, net/openvswitch/meter.h includes linux/genetlink.h but for no apparent reason (for struct genl_family perhaps? it's not necessary, types of externs do not need to be known). Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- net/openvswitch/meter.h | 1 - 1 file changed, 1 deletion(-) diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index ed11cd12b512f..8bbf983cd2441 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include From c4b4abda0ab5d3d942674fb1d5d0cb6b863bb731 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 10:57:10 -0700 Subject: [PATCH 159/167] genetlink: remove linux/genetlink.h genetlink.h is a shell of what used to be a combined uAPI and kernel header over a decade ago. It has fewer than 10 lines of code. Merge it into net/genetlink.h. In some ways it'd be better to keep the combined header under linux/ but it would make looking through git history harder. Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- drivers/net/wireguard/main.c | 2 +- include/linux/genetlink.h | 14 -------------- include/linux/genl_magic_struct.h | 2 +- include/net/genetlink.h | 10 +++++++++- net/batman-adv/main.c | 2 +- net/batman-adv/netlink.c | 1 - net/openvswitch/datapath.c | 1 - 7 files changed, 12 insertions(+), 20 deletions(-) delete mode 100644 include/linux/genetlink.h diff --git a/drivers/net/wireguard/main.c b/drivers/net/wireguard/main.c index ee4da9ab8013c..a00671b58701f 100644 --- a/drivers/net/wireguard/main.c +++ b/drivers/net/wireguard/main.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include static int __init wg_mod_init(void) diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h deleted file mode 100644 index 9dbd7ba9b858e..0000000000000 --- a/include/linux/genetlink.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_GENERIC_NETLINK_H -#define __LINUX_GENERIC_NETLINK_H - -#include - -/* All generic netlink requests are serialized by a global lock. */ -extern void genl_lock(void); -extern void genl_unlock(void); - -#define MODULE_ALIAS_GENL_FAMILY(family)\ - MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) - -#endif /* __LINUX_GENERIC_NETLINK_H */ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index a419d93789ff3..621b87a87d743 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -15,8 +15,8 @@ #endif #include -#include #include +#include extern int CONCATENATE(GENL_MAGIC_FAMILY, _genl_register)(void); extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ece6e5a3ea8c..9ab49bfeae789 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -2,12 +2,20 @@ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H -#include +#include #include #include +#include #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* Non-parallel generic netlink requests are serialized by a global lock. */ +void genl_lock(void); +void genl_unlock(void); + +#define MODULE_ALIAS_GENL_FAMILY(family) \ + MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) + /* Binding to multicast group requires %CAP_NET_ADMIN */ #define GENL_MCAST_CAP_NET_ADMIN BIT(0) /* Binding to multicast group requires %CAP_SYS_ADMIN */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 75119f1ffcccf..8e0f44c71696f 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 0954757f0b8b8..9362cd9d6f3d3 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 11c69415c6052..99d72543abd3a 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include From abdea0633e3f497aa4e242b1206be0c18e0ef766 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 29 Mar 2024 11:06:37 -0700 Subject: [PATCH 160/167] i40e: Fix VF MAC filter removal Commit 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") fixed an issue where untrusted VF was allowed to remove its own MAC address although this was assigned administratively from PF. Unfortunately the introduced check is wrong because it causes that MAC filters for other MAC addresses including multi-cast ones are not removed. if (ether_addr_equal(addr, vf->default_lan_addr.addr) && i40e_can_vf_change_mac(vf)) was_unimac_deleted = true; else continue; if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ... The else path with `continue` effectively skips any MAC filter removal except one for primary MAC addr when VF is allowed to do so. Fix the check condition so the `continue` is only done for primary MAC address. Fixes: 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") Signed-off-by: Ivan Vecera Reviewed-by: Michal Schmidt Reviewed-by: Brett Creeley Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: NipaLocal --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 4c13f78af6756..232b65b9c8eac 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3137,11 +3137,12 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) /* Allow to delete VF primary MAC only if it was not set * administratively by PF or if VF is trusted. */ - if (ether_addr_equal(addr, vf->default_lan_addr.addr) && - i40e_can_vf_change_mac(vf)) - was_unimac_deleted = true; - else - continue; + if (ether_addr_equal(addr, vf->default_lan_addr.addr)) { + if (i40e_can_vf_change_mac(vf)) + was_unimac_deleted = true; + else + continue; + } if (i40e_del_mac_filter(vsi, al->list[i].addr)) { ret = -EINVAL; From 46b2c1cca05f65c2a4375ce2ccac79e8ef76aded Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2024 11:16:51 -0700 Subject: [PATCH 161/167] tools: ynl: add ynl_dump_empty() helper Checking if dump is empty requires a couple of casts. Add a convenient wrapper. Add an example use in the netdev sample, loopback is always present so an empty dump is an error. Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- tools/net/ynl/lib/ynl.h | 12 ++++++++++++ tools/net/ynl/samples/netdev.c | 2 ++ 2 files changed, 14 insertions(+) diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 9842e85a8c57d..eef7c6324ed48 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -91,6 +91,18 @@ void ynl_sock_destroy(struct ynl_sock *ys); !ynl_dump_obj_is_last(iter); \ iter = ynl_dump_obj_next(iter)) +/** + * ynl_dump_empty() - does the dump have no entries + * @dump: pointer to the dump list, as returned by a dump call + * + * Check if the dump is empty, i.e. contains no objects. + * Dump calls return NULL on error, and terminator element if empty. + */ +static inline bool ynl_dump_empty(void *dump) +{ + return dump == (void *)YNL_LIST_END; +} + int ynl_subscribe(struct ynl_sock *ys, const char *grp_name); int ynl_socket_get_fd(struct ynl_sock *ys); int ynl_ntf_check(struct ynl_sock *ys); diff --git a/tools/net/ynl/samples/netdev.c b/tools/net/ynl/samples/netdev.c index 591b90e21890c..3e7b29bd55d5d 100644 --- a/tools/net/ynl/samples/netdev.c +++ b/tools/net/ynl/samples/netdev.c @@ -100,6 +100,8 @@ int main(int argc, char **argv) if (!devs) goto err_close; + if (ynl_dump_empty(devs)) + fprintf(stderr, "Error: no devices reported\n"); ynl_dump_foreach(devs, d) netdev_print_device(d, 0); netdev_dev_get_list_free(devs); From d5701f11bddbf5a3d15640e432fbdb9841b86629 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Mar 2024 18:30:53 +0000 Subject: [PATCH 162/167] ipv6: remove RTNL protection from inet6_dump_fib() No longer hold hold RTNL while calling inet6_dump_fib(). Also change return value for a completed dump, so that NLMSG_DONE can be appended to current skb, saving one recvmsg() system call. Signed-off-by: Eric Dumazet Cc: David Ahern Signed-off-by: NipaLocal --- net/ipv6/ip6_fib.c | 51 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5c558dc1c6838..284002a2a8905 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -623,23 +623,22 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, - .filter.rtnl_held = true, + .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; + struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; - struct hlist_head *head; - int res = 0; + unsigned int h, s_h; + int err = 0; + rcu_read_lock(); if (cb->strict_check) { - int err; - err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) - return err; + goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); @@ -660,8 +659,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; } @@ -675,46 +676,46 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) - goto out; + goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); - return -ENOENT; + err = -ENOENT; + goto unlock; } if (!cb->args[0]) { - res = fib6_dump_table(tb, skb, cb); - if (!res) + err = fib6_dump_table(tb, skb, cb); + if (!err) cb->args[0] = 1; } - goto out; + goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; - rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) - goto out_unlock; + err = fib6_dump_table(tb, skb, cb); + if (err != 0) + goto out; next: e++; } } -out_unlock: - rcu_read_unlock(); +out: cb->args[1] = e; cb->args[0] = h; -out: - res = res < 0 ? res : skb->len; - if (res <= 0) + +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) @@ -2506,7 +2507,7 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, 0); + inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED); if (ret) goto out_unregister_subsys; From 876746815790fd3509d9bdfec78b1caf6165262a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 29 Mar 2024 19:50:36 +0100 Subject: [PATCH 163/167] mptcp: prevent BPF accessing lowat from a subflow socket. Alexei reported the following splat: WARNING: CPU: 32 PID: 3276 at net/mptcp/subflow.c:1430 subflow_data_ready+0x147/0x1c0 Modules linked in: dummy bpf_testmod(O) [last unloaded: bpf_test_no_cfi(O)] CPU: 32 PID: 3276 Comm: test_progs Tainted: GO 6.8.0-12873-g2c43c33bfd23 Call Trace: mptcp_set_rcvlowat+0x79/0x1d0 sk_setsockopt+0x6c0/0x1540 __bpf_setsockopt+0x6f/0x90 bpf_sock_ops_setsockopt+0x3c/0x90 bpf_prog_509ce5db2c7f9981_bpf_test_sockopt_int+0xb4/0x11b bpf_prog_dce07e362d941d2b_bpf_test_socket_sockopt+0x12b/0x132 bpf_prog_348c9b5faaf10092_skops_sockopt+0x954/0xe86 __cgroup_bpf_run_filter_sock_ops+0xbc/0x250 tcp_connect+0x879/0x1160 tcp_v6_connect+0x50c/0x870 mptcp_connect+0x129/0x280 __inet_stream_connect+0xce/0x370 inet_stream_connect+0x36/0x50 bpf_trampoline_6442491565+0x49/0xef inet_stream_connect+0x5/0x50 __sys_connect+0x63/0x90 __x64_sys_connect+0x14/0x20 The root cause of the issue is that bpf allows accessing mptcp-level proto_ops from a tcp subflow scope. Fix the issue detecting the problematic call and preventing any action. Reported-by: Alexei Starovoitov Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/482 Fixes: 5684ab1a0eff ("mptcp: give rcvlowat some love") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: NipaLocal --- net/mptcp/sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index dcd1c76d2a3ba..73fdf423de44e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1493,6 +1493,10 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) struct mptcp_subflow_context *subflow; int space, cap; + /* bpf can land here with a wrong sk type */ + if (sk->sk_protocol == IPPROTO_TCP) + return -EINVAL; + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else From 313e0a37826cb499734f8885ba27747b638628e8 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 29 Mar 2024 14:36:53 -0700 Subject: [PATCH 164/167] net: mana: Fix Rx DMA datasize and skb_over_panic mana_get_rxbuf_cfg() aligns the RX buffer's DMA datasize to be multiple of 64. So a packet slightly bigger than mtu+14, say 1536, can be received and cause skb_over_panic. Sample dmesg: [ 5325.237162] skbuff: skb_over_panic: text:ffffffffc043277a len:1536 put:1536 head:ff1100018b517000 data:ff1100018b517100 tail:0x700 end:0x6ea dev: [ 5325.243689] ------------[ cut here ]------------ [ 5325.245748] kernel BUG at net/core/skbuff.c:192! [ 5325.247838] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 5325.258374] RIP: 0010:skb_panic+0x4f/0x60 [ 5325.302941] Call Trace: [ 5325.304389] [ 5325.315794] ? skb_panic+0x4f/0x60 [ 5325.317457] ? asm_exc_invalid_op+0x1f/0x30 [ 5325.319490] ? skb_panic+0x4f/0x60 [ 5325.321161] skb_put+0x4e/0x50 [ 5325.322670] mana_poll+0x6fa/0xb50 [mana] [ 5325.324578] __napi_poll+0x33/0x1e0 [ 5325.326328] net_rx_action+0x12e/0x280 As discussed internally, this alignment is not necessary. To fix this bug, remove it from the code. So oversized packets will be marked as CQE_RX_TRUNCATED by NIC, and dropped. Cc: stable@vger.kernel.org Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Haiyang Zhang Reviewed-by: Dexuan Cui Signed-off-by: NipaLocal --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +- include/net/mana/mana.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 59287c6e6cee6..d8af5e7e15b4d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -601,7 +601,7 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; - *datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN); + *datasize = mtu + ETH_HLEN; } static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 76147feb0d10a..4eeedf14711b3 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -39,7 +39,6 @@ enum TRI_STATE { #define COMP_ENTRY_SIZE 64 #define RX_BUFFERS_PER_QUEUE 512 -#define MANA_RX_DATA_ALIGN 64 #define MAX_SEND_BUFFERS_PER_QUEUE 256 From 00b116b29f34f43ae1f1a9233a79fc7d29752d8a Mon Sep 17 00:00:00 2001 From: fedora Cloud User Date: Thu, 1 Feb 2024 06:34:18 -0800 Subject: [PATCH 165/167] forwarding: set timeout to 3 hours tc_actions.sh keeps hanging the forwarding tests. Signed-off-by: NipaLocal --- tools/testing/selftests/net/forwarding/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/settings b/tools/testing/selftests/net/forwarding/settings index e7b9417537fbc..ad6a1699dd1bd 100644 --- a/tools/testing/selftests/net/forwarding/settings +++ b/tools/testing/selftests/net/forwarding/settings @@ -1 +1 @@ -timeout=0 +timeout=10800 From ab945a8502af59a5fe8fcb373c8ad13263d87ef2 Mon Sep 17 00:00:00 2001 From: fedora Cloud User Date: Tue, 6 Feb 2024 17:29:13 -0800 Subject: [PATCH 166/167] x86_debug: drop things we don't care about throw in DEBUG_NET instead Signed-off-by: NipaLocal --- kernel/configs/x86_debug.config | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config index 35f48671b8d5c..780788ef15457 100644 --- a/kernel/configs/x86_debug.config +++ b/kernel/configs/x86_debug.config @@ -1,5 +1,4 @@ # Help: Debugging options for tip tree testing -CONFIG_X86_DEBUG_FPU=y CONFIG_LOCK_STAT=y CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_VMACACHE=y @@ -10,9 +9,7 @@ CONFIG_DEBUG_PAGEALLOC=y CONFIG_SLUB_DEBUG_ON=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 -CONFIG_GCOV_KERNEL=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y -CONFIG_SCHEDSTATS=y -CONFIG_NOINSTR_VALIDATION=y -CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_NET_NS_REFCNT_TRACKER=y +CONFIG_DEBUG_NET=y From d0efd00ee05beb79b39daa4dd3f6cde3a65798f4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 22 Mar 2024 06:14:53 -0700 Subject: [PATCH 167/167] add missing kunit configs to fix: Missing: CONFIG_DAMON_DBGFS=y Note: many Kconfig options aren't available on UML. You can try running on a different architecture with something like "--arch=x86_64". Signed-off-by: Your Name Signed-off-by: NipaLocal --- tools/testing/kunit/configs/all_tests.config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index c5914f4e75e1b..173a006ddbb26 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -39,6 +39,7 @@ CONFIG_DAMON=y CONFIG_DAMON_VADDR=y CONFIG_DAMON_PADDR=y CONFIG_DEBUG_FS=y +CONFIG_DAMON_DBGFS_DEPRECATED=y CONFIG_DAMON_DBGFS=y CONFIG_REGMAP_BUILD=y