/* en_tx.c revision 273246 */
/*
 * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/moduleparam.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include "mlx4_en.h"
#include "utils.h"

enum {
	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
	MAX_BF = 256,
	MIN_PKT_LEN = 17,
};

static int inline_thold __read_mostly = MAX_INLINE;

module_param_named(inline_thold, inline_thold, uint, 0444);
MODULE_PARM_DESC(inline_thold, "threshold for using inline data");

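/*
 * Allocate and initialize one TX ring: software state (buf_ring,
 * tx_info array, bounce buffer), HW queue resources on the requested
 * NUMA node, a reserved QP and, when available, a BlueFlame register.
 */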
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, u32 size,
			   u16 stride, int node, int queue_idx)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
	}

	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;
	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
	ring->inline_thold = min(inline_thold, MAX_INLINE);
	mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
	mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);

	/* Allocate the buf ring */
	ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF,
		M_WAITOK, &ring->tx_lock.m);
	if (ring->br == NULL) {
		en_err(priv, "Failed allocating tx_info ring\n");
		return -ENOMEM;
	}

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = vmalloc_node(tmp, node);
	if (!ring->tx_info) {
		ring->tx_info = vmalloc(tmp);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
		 ring->tx_info, tmp);

	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
	if (!ring->bounce_buf) {
		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
		if (!ring->bounce_buf) {
			err = -ENOMEM;
			goto err_info;
		}
	}
	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_bounce;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map TX buffer\n");
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d "
	       "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
	       ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);

	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
				    MLX4_RESERVE_BF_QP);
	if (err) {
		en_err(priv, "failed reserving qp for TX ring\n");
		goto err_map;
	}

	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_reserve;
	}
	ring->qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
	} else
		ring->bf_enabled = true;
	ring->queue_index = queue_idx;
	if (queue_idx < priv->num_tx_rings_p_up)
		CPU_SET(queue_idx, &ring->affinity_mask);

	*pring = ring;
	return 0;

err_reserve:
	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_bounce:
	kfree(ring->bounce_buf);
err_info:
	vfree(ring->tx_info);
err_ring:
	buf_ring_free(ring->br, M_DEVBUF);
	kfree(ring);
	return err;
}

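/*
 * Tear down a TX ring: release the buf_ring, the BlueFlame register,
 * the QP, the HW queue resources and the software state allocated by
 * mlx4_en_create_tx_ring().
 */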
void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	buf_ring_free(ring->br, M_DEVBUF);
	if (ring->bf_enabled)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	kfree(ring->bounce_buf);
	vfree(ring->tx_info);
	mtx_destroy(&ring->tx_lock.m);
	mtx_destroy(&ring->comp_lock.m);
	kfree(ring);
	*pring = NULL;
}

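/*
 * Reset the ring indices, bind the ring to its completion queue and
 * bring the TX QP to the ready state.
 */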
int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	ring->poll_cnt = 0;
	ring->blocked = 0;
	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = ring->qp.qpn << 8;

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
				ring->cqn, user_prio, &ring->context);
	if (ring->bf_enabled)
		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
	return err;
}

void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
}

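/*
 * Overwrite a completed descriptor with the stamp value so its
 * ownership bit reflects the software owner, handling the case where
 * the descriptor wraps around the end of the ring buffer.
 */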
static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
		       struct mlx4_en_tx_ring *ring,
		       int index, u8 owner)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	void *end = ring->buf + ring->buf_size;
	__be32 *ptr = (__be32 *)tx_desc;
	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end))
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
		}
	else
		/* Stamp the freed descriptor */
		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
			*ptr = stamp;
			ptr += STAMP_DWORDS;
			if ((void *)ptr >= end) {
				ptr = ring->buf;
				stamp ^= cpu_to_be32(0x80000000);
			}
		}
}

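/*
 * Release a completed descriptor: unmap its DMA segments (unless the
 * data was sent inline), pass a copy of the frame to any BPF listener
 * and free the mbuf.  Returns the number of TXBBs the descriptor used.
 */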
static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring,
				int index, u8 owner, u64 timestamp)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
	struct mbuf *mb = tx_info->mb;
	void *end = ring->buf + ring->buf_size;
	int frags = tx_info->nr_segs;
	int i;

	/* Optimize the common case when there are no wraparounds */
	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
		if (!tx_info->inl) {
			if (tx_info->linear) {
				dma_unmap_single(priv->ddev,
					(dma_addr_t) be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
				++data;
			}

			for (i = 0; i < frags; i++) {
				pci_unmap_single(mdev->pdev,
					(dma_addr_t) be64_to_cpu(data[i].addr),
					data[i].byte_count, PCI_DMA_TODEVICE);
			}
		}
	} else {
		if (!tx_info->inl) {
			if ((void *) data >= end) {
				data = ring->buf + ((void *)data - end);
			}

			if (tx_info->linear) {
				dma_unmap_single(priv->ddev,
					(dma_addr_t) be64_to_cpu(data->addr),
					be32_to_cpu(data->byte_count),
					PCI_DMA_TODEVICE);
				++data;
			}

			for (i = 0; i < frags; i++) {
				/* Check for wraparound before unmapping */
				if ((void *) data >= end)
					data = ring->buf;
				pci_unmap_single(mdev->pdev,
					(dma_addr_t) be64_to_cpu(data->addr),
					data->byte_count, PCI_DMA_TODEVICE);
				++data;
			}
		}
	}
	/* Send a copy of the frame to the BPF listener */
	if (priv->dev && priv->dev->if_bpf)
		ETHER_BPF_MTAP(priv->dev, mb);
	m_freem(mb);
	return tx_info->nr_txbb;
}

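/*
 * Free any descriptors still outstanding on the ring and return how
 * many were freed.
 */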
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
						ring->cons & ring->size_mask,
						!!(ring->cons & ring->size), 0);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}

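/*
 * Reap TX completions: walk the CQ, free and stamp the descriptors the
 * hardware has finished with, update the CQ consumer index and wake
 * the interface queue if it was stopped and room is available again.
 */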
static int mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	u32 packets = 0;
	u32 bytes = 0;
	int factor = priv->cqe_factor;
	u64 timestamp = 0;
	int done = 0;

	if (!priv->port_up)
		return 0;

	index = cons_index & size_mask;
	cqe = &buf[(index << factor) + factor];
	ring_index = ring->cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			en_err(priv, "CQE completed in error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       ((struct mlx4_err_cqe *)cqe)->
				       vendor_err_syndrome,
			       ((struct mlx4_err_cqe *)cqe)->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += ring->last_nr_txbb;
			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
			/* free next descriptor */
			ring->last_nr_txbb = mlx4_en_free_tx_desc(
					priv, ring, ring_index,
					!!((ring->cons + txbbs_skipped) &
					ring->size), timestamp);
			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring->cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
			packets++;
			bytes += ring->tx_info[ring_index].nr_bytes;
		} while (ring_index != new_index);

		++cons_index;
		index = cons_index & size_mask;
		cqe = &buf[(index << factor) + factor];
	}

	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();
	ring->cons += txbbs_skipped;

	/* Wakeup Tx queue if it was stopped and ring is not full */
	if (unlikely(ring->blocked) &&
	    (ring->prod - ring->cons) <= ring->full_size) {
		ring->blocked = 0;
		if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
			atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
		ring->wake_queue++;
		priv->port_stats.wake_queue++;
	}
	return done;
}

void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];

	if (!spin_trylock(&ring->comp_lock))
		return;
	mlx4_en_process_tx_cq(cq->dev, cq);
	mod_timer(&cq->timer, jiffies + 1);
	spin_unlock(&ring->comp_lock);
}

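/*
 * Timer callback that polls the TX CQ; re-arms itself while packets
 * are still in flight so completions get processed even when the
 * transmit path goes idle.
 */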
void mlx4_en_poll_tx_cq(unsigned long data)
{
	struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	u32 inflight;

	INC_PERF_COUNTER(priv->pstats.tx_poll);

	if (!spin_trylock(&ring->comp_lock)) {
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
		return;
	}
	mlx4_en_process_tx_cq(cq->dev, cq);
	inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb);

	/* If there are still packets in flight and the timer has not already
	 * been scheduled by the Tx routine then schedule it here to guarantee
	 * completion processing of these packets */
	if (inflight && priv->port_up)
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);

	spin_unlock(&ring->comp_lock);
}

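/*
 * Copy a descriptor that was built in the bounce buffer (because it
 * would wrap past the end of the ring) into its real location in the
 * ring buffer, and return a pointer to that location.
 */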
static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
						      struct mlx4_en_tx_ring *ring,
						      u32 index,
						      unsigned int desc_size)
{
	u32 copy = (ring->size - index) * TXBB_SIZE;
	int i;

	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + i)) =
			*((u32 *) (ring->bounce_buf + copy + i));
	}

	for (i = copy - 4; i >= 4 ; i -= 4) {
		if ((i & (TXBB_SIZE - 1)) == 0)
			wmb();

		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
			*((u32 *) (ring->bounce_buf + i));
	}

	/* Return real descriptor location */
	return ring->buf + index * TXBB_SIZE;
}

static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
{
	struct mlx4_en_cq *cq = priv->tx_cq[tx_ind];
	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];

	/* If we don't have a pending timer, set one up to catch our recent
	   post in case the interface becomes idle */
	if (!timer_pending(&cq->timer))
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);

	/* Poll the CQ every MLX4_EN_TX_POLL_MODER packets */
	if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
		if (spin_trylock(&ring->comp_lock)) {
			mlx4_en_process_tx_cq(priv->dev, cq);
			spin_unlock(&ring->comp_lock);
		}
}

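/*
 * A packet is sent inline when it fits under the inline threshold and
 * is not a TSO packet.
 */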
static int is_inline(struct mbuf *mb, int thold)
{
	if (thold && mb->m_pkthdr.len <= thold &&
		(mb->m_pkthdr.csum_flags & CSUM_TSO) == 0)
		return 1;

	return 0;
}

static int inline_size(struct mbuf *mb)
{
	int len;

	len = mb->m_pkthdr.len;
	if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
	    <= MLX4_INLINE_ALIGN)
		return ALIGN(len + CTRL_SIZE +
			     sizeof(struct mlx4_wqe_inline_seg), 16);
	else
		return ALIGN(len + CTRL_SIZE + 2 *
			     sizeof(struct mlx4_wqe_inline_seg), 16);
}

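/*
 * Return the combined Ethernet/IP(v6)/TCP header length of a TSO
 * packet, or 0 if the headers are not contiguous in the first mbuf or
 * the packet is not TCP.
 */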
static int get_head_size(struct mbuf *mb)
{
	struct ether_vlan_header *eh;
	struct tcphdr *th;
	struct ip *ip;
	int ip_hlen, tcp_hlen;
	struct ip6_hdr *ip6;
	uint16_t eth_type;
	int eth_hdr_len;

	eh = mtod(mb, struct ether_vlan_header *);
	if (mb->m_len < ETHER_HDR_LEN)
		return (0);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		eth_type = ntohs(eh->evl_proto);
		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		eth_type = ntohs(eh->evl_encap_proto);
		eth_hdr_len = ETHER_HDR_LEN;
	}
	if (mb->m_len < eth_hdr_len)
		return (0);
	switch (eth_type) {
	case ETHERTYPE_IP:
		ip = (struct ip *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip))
			return (0);
		if (ip->ip_p != IPPROTO_TCP)
			return (0);
		ip_hlen = ip->ip_hl << 2;
		eth_hdr_len += ip_hlen;
		break;
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
			return (0);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (0);
		eth_hdr_len += sizeof(*ip6);
		break;
	default:
		return (0);
	}
	if (mb->m_len < eth_hdr_len + sizeof(*th))
		return (0);
	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
	tcp_hlen = th->th_off << 2;
	eth_hdr_len += tcp_hlen;
	if (mb->m_len < eth_hdr_len)
		return (0);
	return (eth_hdr_len);
}

static int get_real_size(struct mbuf *mb, struct net_device *dev, int *p_n_segs,
    int *lso_header_size, int inl)
{
	struct mbuf *m;
	int nr_segs = 0;

	for (m = mb; m != NULL; m = m->m_next)
		if (m->m_len)
			nr_segs++;

	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
		*lso_header_size = get_head_size(mb);
		if (*lso_header_size) {
			if (mb->m_len == *lso_header_size)
				nr_segs--;
			*p_n_segs = nr_segs;
			return CTRL_SIZE + nr_segs * DS_SIZE +
			    ALIGN(*lso_header_size + 4, DS_SIZE);
		}
	} else
		*lso_header_size = 0;
	*p_n_segs = nr_segs;
	if (inl)
		return inline_size(mb);
	return (CTRL_SIZE + nr_segs * DS_SIZE);
}

static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len)
{
	int bytes;
	int off;

	off = *offp;
	while (len) {
		bytes = min(mb->m_len - off, len);
		if (bytes)
			memcpy(data, mb->m_data + off, bytes);
		len -= bytes;
		data += bytes;
		off += bytes;
		if (off == mb->m_len) {
			off = 0;
			mb = mb->m_next;
		}
	}
	*offp = off;
	return (mb);
}

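/*
 * Build an inline send WQE: copy the packet data directly into the
 * descriptor, splitting it into two inline segments when it does not
 * fit in the space left after the control segment.
 */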
static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb,
			     int real_size, u16 *vlan_tag, int tx_ind)
{
	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
	int len;
	int off;

	off = 0;
	len = mb->m_pkthdr.len;
	if (len <= spc) {
		inl->byte_count = cpu_to_be32(1 << 31 |
				(max_t(typeof(len), len, MIN_PKT_LEN)));
		mb_copy(mb, &off, (void *)(inl + 1), len);
		if (len < MIN_PKT_LEN)
			memset(((void *)(inl + 1)) + len, 0,
			       MIN_PKT_LEN - len);
	} else {
		inl->byte_count = cpu_to_be32(1 << 31 | spc);
		mb = mb_copy(mb, &off, (void *)(inl + 1), spc);
		inl = (void *) (inl + 1) + spc;
		mb_copy(mb, &off, (void *)(inl + 1), len - spc);
		wmb();
		inl->byte_count = cpu_to_be32(1 << 31 | (len - spc));
	}
	tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
}

static unsigned long hashrandom;
static void hashrandom_init(void *arg)
{
	hashrandom = random();
}
SYSINIT(hashrandom_init, SI_SUB_KLD, SI_ORDER_SECOND, &hashrandom_init, NULL);

u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u32 rings_p_up = priv->num_tx_rings_p_up;
	u32 vlan_tag = 0;
	u32 up = 0;
	u32 queue_index;

	/* Obtain VLAN information if present */
	if (mb->m_flags & M_VLANTAG) {
		vlan_tag = mb->m_pkthdr.ether_vtag;
		up = (vlan_tag >> 13);
	}

	/* hash mbuf */
	queue_index = mlx4_en_hashmbuf(MLX4_F_HASHL3 | MLX4_F_HASHL4, mb, hashrandom);

	return ((queue_index % rings_p_up) + (up * rings_p_up));
}

static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt)
{
	__iowrite64_copy(dst, src, bytecnt / 8);
}

static u64 mlx4_en_mac_to_u64(u8 *addr)
{
	u64 mac = 0;
	int i;

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		mac <<= 8;
		mac |= addr[i];
	}
	return mac;
}

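/*
 * Build and post one send WQE for the given mbuf chain: choose inline
 * vs. gather, handle VLAN and checksum offload flags, set up LSO when
 * requested, and ring the doorbell (via BlueFlame when possible).
 * Returns 0 on success, EBUSY when the ring is full, or EINVAL if the
 * packet had to be dropped.
 */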
static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_cq *cq;
	struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_wqe_data_seg *data;
	struct mlx4_en_tx_info *tx_info;
	struct mbuf *m;
	int nr_txbb;
	int nr_segs;
	int desc_size;
	int real_size;
	dma_addr_t dma;
	u32 index, bf_index, ring_size;
	__be32 op_own;
	u16 vlan_tag = 0;
	int i;
	int lso_header_size;
	bool bounce = false;
	bool inl = false;
	struct mbuf *mb = *mbp;
	int defrag = 1;

	if (!priv->port_up)
		goto tx_drop;

	ring = priv->tx_ring[tx_ind];
	ring_size = ring->size;
	inl = is_inline(mb, ring->inline_thold);

retry:
	real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size, inl);
	if (unlikely(!real_size))
		goto tx_drop;

	/* Align descriptor to TXBB size */
	desc_size = ALIGN(real_size, TXBB_SIZE);
	nr_txbb = desc_size / TXBB_SIZE;
	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
		if (defrag) {
			mb = m_defrag(*mbp, M_NOWAIT);
			if (mb == NULL) {
				mb = *mbp;
				goto tx_drop;
			}
			*mbp = mb;
			defrag = 0;
			goto retry;
		}
		en_warn(priv, "Oversized header or SG list\n");
		goto tx_drop;
	}

	/* Obtain VLAN information if present */
	if (mb->m_flags & M_VLANTAG) {
		vlan_tag = mb->m_pkthdr.ether_vtag;
	}

	/* Check available TXBBs and 2K spare for prefetch
	 * Even if netif_tx_stop_queue() will be called
	 * driver will send current packet to ensure
	 * that at least one completion will be issued after
	 * stopping the queue
	 */
	if (unlikely((int)(ring->prod - ring->cons) > ring->full_size)) {
		/* every full Tx ring stops queue */
		if (ring->blocked == 0)
			atomic_add_int(&priv->blocked, 1);
		/* Set HW-queue-is-full flag */
		atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
		ring->blocked = 1;
		priv->port_stats.queue_stopped++;
		ring->queue_stopped++;

		/* Use interrupts to find out when queue opened */
		cq = priv->tx_cq[tx_ind];
		mlx4_en_arm_cq(priv, cq);
		return EBUSY;
	}

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32) (ring->prod - ring->cons - 1));

	/* Packet is good - grab an index and transmit it */
	index = ring->prod & ring->size_mask;
	bf_index = ring->prod;

	/* See if we have enough space for whole descriptor TXBB for setting
	 * SW ownership on next descriptor; if not, use a bounce buffer. */
	if (likely(index + nr_txbb <= ring_size))
		tx_desc = ring->buf + index * TXBB_SIZE;
	else {
		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
		bounce = true;
	}

	/* Save mb in tx_info ring */
	tx_info = &ring->tx_info[index];
	tx_info->mb = mb;
	tx_info->nr_txbb = nr_txbb;
	tx_info->nr_segs = nr_segs;

	if (lso_header_size) {
		memcpy(tx_desc->lso.header, mb->m_data, lso_header_size);
		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
						      DS_SIZE));
		/* lso header is part of m_data.
		 * need to omit when mapping DMA */
		mb->m_data += lso_header_size;
		mb->m_len -= lso_header_size;
	}
	else
		data = &tx_desc->data;

	/* valid only for non-inline segments */
	tx_info->data_offset = (void *)data - (void *)tx_desc;

	if (inl) {
		tx_info->inl = 1;
	} else {
		for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) {
			if (m->m_len == 0) {
				i--;
				continue;
			}
			dma = pci_map_single(mdev->dev->pdev, m->m_data,
					     m->m_len, PCI_DMA_TODEVICE);
			data->addr = cpu_to_be64(dma);
			data->lkey = cpu_to_be32(mdev->mr.key);
			wmb();
			data->byte_count = cpu_to_be32(m->m_len);
			data++;
		}
		if (lso_header_size) {
			mb->m_data -= lso_header_size;
			mb->m_len += lso_header_size;
		}
		tx_info->inl = 0;
	}

	/* Prepare ctrl segment apart from opcode+ownership, which depends on
	 * whether LSO is used */
	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
		!!vlan_tag;
	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO |
		CSUM_TCP | CSUM_UDP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6)) {
		if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
		if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
		priv->port_stats.tx_chksum_offload++;
		ring->tx_csum++;
	}

	if (unlikely(priv->validate_loopback)) {
		/* Copy dst mac address to wqe */
		struct ether_header *ethh;
		u64 mac;
		u32 mac_l, mac_h;

		ethh = mtod(mb, struct ether_header *);
		mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
		if (mac) {
			mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
			mac_l = (u32) (mac & 0xffffffff);
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
		}
	}

	/* Handle LSO (TSO) packets */
	if (lso_header_size) {
		int segsz;
		/* Mark opcode as LSO */
		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
			((ring->prod & ring_size) ?
				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);

		/* Fill in the LSO prefix */
		tx_desc->lso.mss_hdr_size = cpu_to_be32(
			mb->m_pkthdr.tso_segsz << 16 | lso_header_size);

		priv->port_stats.tso_packets++;
		segsz = mb->m_pkthdr.tso_segsz;
		i = ((mb->m_pkthdr.len - lso_header_size + segsz - 1) / segsz);
		tx_info->nr_bytes = mb->m_pkthdr.len + (i - 1) * lso_header_size;
		ring->packets += i;
	} else {
		/* Normal (Non LSO) packet */
		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
			((ring->prod & ring_size) ?
			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
		tx_info->nr_bytes = max(mb->m_pkthdr.len,
		    (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN);
		ring->packets++;
	}
	ring->bytes += tx_info->nr_bytes;
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);

	if (tx_info->inl) {
		build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind);
		tx_info->inl = 1;
	}

	ring->prod += nr_txbb;

	/* If we used a bounce buffer then copy descriptor back into place */
	if (unlikely(bounce))
		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) {
		*(__be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
		op_own |= htonl((bf_index & 0xffff) << 8);
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW */
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;

		wmb();

		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
		     desc_size);

		wmb();

		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		/* Ensure new descriptor hits memory
		 * before setting ownership of this descriptor to HW */
		wmb();
		tx_desc->ctrl.owner_opcode = op_own;
		wmb();
		writel(cpu_to_be32(ring->doorbell_qpn), ring->bf.uar->map + MLX4_SEND_DOORBELL);
	}

	return 0;
tx_drop:
	*mbp = NULL;
	m_freem(mb);
	return EINVAL;
}

static int
mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mbuf *next;
	int enqueued, err = 0;

	ring = priv->tx_ring[tx_ind];
	if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING || priv->port_up == 0) {
		if (m != NULL)
			err = drbr_enqueue(dev, ring->br, m);
		return (err);
	}

	enqueued = 0;
	if (m != NULL) {
		if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
			return (err);
	}
	/* Process the queue */
	while ((next = drbr_peek(dev, ring->br)) != NULL) {
		if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
			if (next == NULL) {
				drbr_advance(dev, ring->br);
			} else {
				drbr_putback(dev, ring->br, next);
			}
			break;
		}
		drbr_advance(dev, ring->br);
		enqueued++;
		dev->if_obytes += next->m_pkthdr.len;
		if (next->m_flags & M_MCAST)
			dev->if_omcasts++;
		if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
			break;
	}

	if (enqueued > 0)
		ring->watchdog_time = ticks;

	return (err);
}

void
mlx4_en_tx_que(void *context, int pending)
{
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_priv *priv;
	struct net_device *dev;
	struct mlx4_en_cq *cq;
	int tx_ind;

	cq = context;
	dev = cq->dev;
	priv = dev->if_softc;
	tx_ind = cq->ring;
	ring = priv->tx_ring[tx_ind];
	if (dev->if_drv_flags & IFF_DRV_RUNNING) {
		mlx4_en_xmit_poll(priv, tx_ind);
		spin_lock(&ring->tx_lock);
		if (!drbr_empty(dev, ring->br))
			mlx4_en_transmit_locked(dev, tx_ind, NULL);
		spin_unlock(&ring->tx_lock);
	}
}

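/*
 * if_transmit entry point: pick a TX ring (from the flow id or by
 * hashing the mbuf), then either transmit directly when the ring lock
 * is available or enqueue to the buf_ring and defer to the taskqueue.
 */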
int
mlx4_en_transmit(struct ifnet *dev, struct mbuf *m)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_cq *cq;
	int i = 0, err = 0;

	/* Which queue to use */
	if ((m->m_flags & (M_FLOWID | M_VLANTAG)) == M_FLOWID) {
		i = m->m_pkthdr.flowid % (priv->tx_ring_num - 1);
	} else {
		i = mlx4_en_select_queue(dev, m);
	}
	ring = priv->tx_ring[i];

	if (spin_trylock(&ring->tx_lock)) {
		err = mlx4_en_transmit_locked(dev, i, m);
		spin_unlock(&ring->tx_lock);
		/* Poll CQ here */
		mlx4_en_xmit_poll(priv, i);
	} else {
		err = drbr_enqueue(dev, ring->br, m);
		cq = priv->tx_cq[i];
		taskqueue_enqueue(cq->tq, &cq->cq_task);
	}

	return (err);
}

/*
 * Flush ring buffers.
 */
void
mlx4_en_qflush(struct ifnet *dev)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mbuf *m;

	for (int i = 0; i < priv->tx_ring_num; i++) {
		ring = priv->tx_ring[i];
		spin_lock(&ring->tx_lock);
		while ((m = buf_ring_dequeue_sc(ring->br)) != NULL)
			m_freem(m);
		spin_unlock(&ring->tx_lock);
	}
	if_qflush(dev);
}