tun 多队列multiqueue ---RFS #1

jursonmo · 2019-01-29T15:10:42Z

3.8 内核才开始支持 tun multiqueue

 struct fd tf = fdget(fd); 就是通过 int fd 找到 tf（fdget 按值返回 struct fd）, 再由 tf.file 得到 struct file
/*
 * Result of looking up an integer file descriptor (see fdget()):
 * the open file it refers to plus lookup flags.
 */
struct fd {
	struct file *file;	/* the open file; leads to its fops (file-operation function table) */
	unsigned int flags;
};

如果启用tun multiqueue, 那么还需要给tun网卡配置RPS 吗
答：多队列不需要配置RPS, 单队列才需要。
多队列tun 内核就实现了RFS功能，即跟应用层的cpu 关联起来。
多队列只是：tun->flows 记录的是流对应队列index，而队列index又对应应用层tun fd，即数据的来回都放在同一个网卡队列（同一个tun fd）处理。
要有RFS的效果，还是要打开RFS功能，就是配置RFS那两个配置（rps_sock_flow_entries 和 rps_flow_cnt），这样同一个数据流尽量由同一个cpu来处理，提高 cpu cache 的命中率。
tun->flows 每个流entry 的rps_rxhash 这个值基本不会变，但是它对应的cpu是随时可能变的，如果某个流在应用层write时有时在cpu0, 有时在cpu1 处理，那么RFS表（即rps_sock_flow_table）记录对应的cpu就会变；还有，如果两个流在skb_hash一样，一个流一直在cpu0, 另一个流一直在cpu1, RFS 表skb_hash对应的cpu也一直在cpu0 和cpu1之间变，所以如果启用RFS，应用层要保证同样的skb_hash 的数据流在同一cpu上处理？很难做到的。
同样的，如果不同tunfd 处理的不同流但 skb_hash 一样时，即tun->flows 的entry一样，entry-> queue_index 也会不断的变化，因为entry-> queue_index是由 tfile->queue_index 赋值的，不同的tunfd 对应的 tfile->queue_index也不一样；由于entry-> queue_index 的变化，tun_select_queue时，就会放到另一个队列里; 难道也要保证不同流相同skb_hash的数据也要由同一tunfd来处理，否则就会乱序。

从数据流向来说，不同的是：
其他物理网卡的RPS、RFS都是为了让软中断的cpu 跟应用层的cpu一致
tun 网卡的多队列（并且配置RFS）目的是应用层write 的cpu 跟处理软中断的cpu 保持一致

相同的是：
流对应的cpu 的记录和更新都是用处理应用层的cpu, 即由应用层来决定软中断使用哪个cpu.
由于软中断跟应用层对数据处理比较耗时，确实可以让这两个处理放在同一个cpu上,即RFS，这样cpu缓存利用效率最大化。

//莫：只有设置了IFF_MULTI_QUEUE; 才可以在ioctl -->tun_set_iff 对同一个tun dev name 进行创建和操作，第一次创建tun dev 时就创建多队列。
一、应用层 if((err = ioctl(fd,TUNSETIFF,(void *)&ifr))<0) 调用到的内核的__tun_chr_ioctl
__tun_chr_ioctl（）
-->tun_set_iff() 创建一个tun网卡设备 dev = alloc_netdev_mqs(........)
-> tun_attach()

  struct tun_file *tfile = file->private_data;
  tfile->queue_index = tun->numqueues; //记录tun fd 对应的队列index
  tun_set_real_num_queues(tun);
	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);

二、应用层write(fd,buf) --> 内核do_sync_write --> tun_chr_aio_write，对于网卡就是接收数据
---> tun_get_user()

rxhash = skb_get_hash(skb); //返回一个大于0的整数
netif_rx_ni(skb);
tun_flow_update(tun, rxhash, tfile);
if (tun->numqueues == 1 || tfile->detached) //单队列tun是不会去记录flow的cpu的，
		goto unlock;
     head = &tun->flows[tun_hashfn(rxhash)];
     struct tun_flow_entry *e = tun_flow_find(head, rxhash);
      找不到e, 就创建：tun_flow_create
		e->rxhash = rxhash;
		e->rps_rxhash = 0;
		e->queue_index =  tfile->queue_index;
找到e: 
e->queue_index =  tfile->queue_index; 
//记录这条流最新的tun fd，所以应用层要区分数据流来决定往哪个tun fd 发送，不然这条流的部分数据交给tun fd1处理，有时又交给fd2 处理，会出现乱序，
e->updated = jiffies;
//如果开启了RFS，即rps_sock_flow_table不为空， 下面代码才有用
sock_rps_record_flow_hash(e->rps_rxhash); //记录这个流对应的cpu
//一开始e->rps_rxhash为0， tun dev 发送数据后 tun_select_queue-》tun_flow_save_rps_rxhash -》 rps_rxhash不等于skb_hash 就保存信息 e->rps_rxhash= skb_hash
//  这里就记录这个数据流对应当前的cpu
//下次有数据，再调用netif_rx_ni(skb); 时， get_rps_cpu 找到之前记录的cpu,                                         
 // enqueue_to_backlog 再把skb放到对应的cpu backlog里。再拉起这个cpu的软中断

总：由tun 网卡收到数据来创建e, 发送时会更新e->rps_rxhash= rxhash ，再由接收数据时记录rps_rxhash对应的cpu 到rps_sock_flow_table 表里, 每次收到数据都更新记录cpu，供get_rps_cpu 使用。(目的是应用层write 的cpu 跟处理软中断的cpu 保持一致)
即cpu是由处理tun 接收数据的cpu 决定的,再走协议栈时，就用相同的cpu的软中断来处理这个数据。

三、往 tun dev 出口发包：
dev_queue_xmit(dev)
----> netdev_pick_tx(dev): if (dev->real_num_tx_queues != 1)
---->dev->netdev_ops->ndo_select_queue() //如果是多队列，就xxx_select_queue()
其实就是 tun_select_queue() ：

  txq = skb_get_hash(skb);   //注意，这里的txq 其实是hash
  if (txq) {
		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
		if (e) {
			tun_flow_save_rps_rxhash(e, txq);
// 如果e->rps_rxhash跟txq 不相同，就  sock_rps_reset_flow_hash 重置 为rps_sock_flow_table ->ents [ rps_rxhash] = RPS_NO_CPU, 并且 e->rps_rxhash= txq
		     txq = e->queue_index; //相同流的数据放到同一个队列里,即由同一个fd读写
		} else
			/* use multiply and shift instead of expensive divide */
			txq = ((u64)txq * numqueues) >> 32;
	} else if (likely(skb_rx_queue_recorded(skb))) {
		txq = skb_get_rx_queue(skb);
		while (unlikely(txq >= numqueues))
			txq -= numqueues;
	}

---> skb->queue_mapping = queue_mapping; // txq
//这样skb->queue_mapping = txq，后面的tun_net_xmit()就会用到skb->queue_mapping

---> tun_net_xmit():
int txq = skb->queue_mapping;
tfile = rcu_dereference(tun->tfiles[txq]);
。。。。。。。
//把skb 放到 tfile sk 的队列sk_receive_queue 里，然后唤醒

skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
tfile->socket.sk->sk_data_ready(tfile->socket.sk);
（sk->sk_data_ready	=	sock_def_readable;唤醒 task , task变成就绪状态）

//======================
dev.c :

/*
 * get_rps_cpu - choose the CPU that should process a received skb.
 * @dev:    device the skb arrived on; supplies the RX queue array.
 * @skb:    the received packet; its recorded RX queue and flow hash drive
 *          the choice.
 * @rflowp: out-parameter, written only when the CPU is taken from the flow
 *          tables (the RFS path); it then points at the rps_dev_flow entry.
 *
 * Returns the selected CPU number, or -1 when no CPU was selected (invalid
 * recorded queue index, neither an RPS map nor a flow table on the queue,
 * a flow hash of 0, or the candidate CPU being offline).
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;
	u32 hash;
	/* Use the RX queue recorded in the skb if there is one, else queue 0. */
	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);
		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	/*
	 * Shortcuts that avoid computing the hash:
	 *   - a one-entry RPS map with no flow table: that single CPU is the
	 *     only candidate;
	 *   - neither an RPS map nor a flow table: nothing to steer by.
	 */
	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		goto done;
	}

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	/* A hash of 0 is treated as "no hash": no steering is done. */
	if (!hash)
		goto done;

	// Mo: for RFS, setting rps_sock_flow_entries creates rps_sock_flow_table, and setting rps_flow_cnt creates the per-queue rps_flow_table

	/* RFS path: taken only when both the per-queue and the global table exist. */
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		/* tcpu: CPU currently stored for this flow in the per-queue table. */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/* next_cpu: CPU stored for this hash in the global sock flow table. */
		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		/* Report the flow entry to the caller only when a usable CPU was found. */
		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}
       // Mo: if RFS is not enabled (or gave no usable CPU), fall back to plain RPS: pick a CPU from the map via reciprocal_scale
	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}
     done:
	  return cpu;
     }

The text was updated successfully, but these errors were encountered:

jursonmo changed the title ~~tun 多队列multiqueue~~ tun 多队列multiqueue ---RFS Mar 16, 2019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

tun 多队列multiqueue ---RFS #1

tun 多队列multiqueue ---RFS #1

jursonmo commented Jan 29, 2019 •

edited

Loading

tun 多队列multiqueue ---RFS #1

tun 多队列multiqueue ---RFS #1

Comments

jursonmo commented Jan 29, 2019 • edited Loading

相同的是： 流对应的cpu 的记录和更新都是用处理应用层的cpu, 即由应用层来决定软中断使用哪个cpu. 由于软中断跟应用层对数据处理比较耗时，确实可以让这俩个处理放在同一个cpu上,即RFS，这样cpu缓存利用效率最大化。

jursonmo commented Jan 29, 2019 •

edited

Loading

相同的是：
流对应的cpu 的记录和更新都是用处理应用层的cpu, 即由应用层来决定软中断使用哪个cpu.
由于软中断跟应用层对数据处理比较耗时，确实可以让这俩个处理放在同一个cpu上,即RFS，这样cpu缓存利用效率最大化。