en_tx.c revision 298773
/*
 * Copyright (c) 2007, 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/page.h>
#include <linux/mlx4/cq.h>
#include <linux/slab.h>
#include <linux/mlx4/qp.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/moduleparam.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include "mlx4_en.h"
#include "utils.h"

enum {
	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
	MAX_BF = 256,
	MIN_PKT_LEN = 17,
};

static int inline_thold __read_mostly = MAX_INLINE;

module_param_named(inline_thold, inline_thold, uint, 0444);
MODULE_PARM_DESC(inline_thold, "threshold for using inline data");

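/*
 * Allocate the software state for one TX ring: DMA tag and per-slot DMA
 * maps, the buf_ring used to queue mbufs, the tx_info array, the HW work
 * queue buffer, a QP and (when available) a BlueFlame register.
 * Returns 0 on success or a negative errno, undoing all allocations on
 * failure.
 */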
int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
			   struct mlx4_en_tx_ring **pring, u32 size,
			   u16 stride, int node, int queue_idx)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring;
	uint32_t x;
	int tmp;
	int err;

	ring = kzalloc_node(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL, node);
	if (!ring) {
		ring = kzalloc(sizeof(struct mlx4_en_tx_ring), GFP_KERNEL);
		if (!ring) {
			en_err(priv, "Failed allocating TX ring\n");
			return -ENOMEM;
		}
	}

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,					/* any alignment */
	    0,					/* no boundary */
	    BUS_SPACE_MAXADDR,			/* lowaddr */
	    BUS_SPACE_MAXADDR,			/* highaddr */
	    NULL, NULL,				/* filter, filterarg */
	    MLX4_EN_TX_MAX_PAYLOAD_SIZE,	/* maxsize */
	    MLX4_EN_TX_MAX_MBUF_FRAGS,		/* nsegments */
	    MLX4_EN_TX_MAX_MBUF_SIZE,		/* maxsegsize */
	    0,					/* flags */
	    NULL, NULL,				/* lockfunc, lockfuncarg */
	    &ring->dma_tag)))
		goto done;

	ring->size = size;
	ring->size_mask = size - 1;
	ring->stride = stride;
	ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE));
	mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
	mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);

	/* Allocate the buf ring */
	ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF,
		M_WAITOK, &ring->tx_lock.m);
	if (ring->br == NULL) {
		en_err(priv, "Failed allocating tx_info ring\n");
		err = -ENOMEM;
		goto err_free_dma_tag;
	}

	tmp = size * sizeof(struct mlx4_en_tx_info);
	ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node);
	if (!ring->tx_info) {
		ring->tx_info = kzalloc(tmp, GFP_KERNEL);
		if (!ring->tx_info) {
			err = -ENOMEM;
			goto err_ring;
		}
	}

	/* Create DMA descriptor MAPs */
	for (x = 0; x != size; x++) {
		err = -bus_dmamap_create(ring->dma_tag, 0,
		    &ring->tx_info[x].dma_map);
		if (err != 0) {
			while (x--) {
				bus_dmamap_destroy(ring->dma_tag,
				    ring->tx_info[x].dma_map);
			}
			goto err_info;
		}
	}

	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
		 ring->tx_info, tmp);

	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);

	/* Allocate HW buffers on provided NUMA node */
	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
				 2 * PAGE_SIZE);
	if (err) {
		en_err(priv, "Failed allocating hwq resources\n");
		goto err_dma_map;
	}

	err = mlx4_en_map_buffer(&ring->wqres.buf);
	if (err) {
		en_err(priv, "Failed to map TX buffer\n");
		goto err_hwq_res;
	}

	ring->buf = ring->wqres.buf.direct.buf;

	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d "
	       "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
	       ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);

	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
				    MLX4_RESERVE_BF_QP);
	if (err) {
		en_err(priv, "failed reserving qp for TX ring\n");
		goto err_map;
	}

	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp);
	if (err) {
		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
		goto err_reserve;
	}
	ring->qp.event = mlx4_en_sqp_event;

	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
	if (err) {
		en_dbg(DRV, priv, "working without blueflame (%d)", err);
		ring->bf.uar = &mdev->priv_uar;
		ring->bf.uar->map = mdev->uar_map;
		ring->bf_enabled = false;
	} else
		ring->bf_enabled = true;
	ring->queue_index = queue_idx;
	if (queue_idx < priv->num_tx_rings_p_up)
		CPU_SET(queue_idx, &ring->affinity_mask);

	*pring = ring;
	return 0;

err_reserve:
	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
err_map:
	mlx4_en_unmap_buffer(&ring->wqres.buf);
err_hwq_res:
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
err_dma_map:
	for (x = 0; x != size; x++)
		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
err_info:
	vfree(ring->tx_info);
err_ring:
	buf_ring_free(ring->br, M_DEVBUF);
err_free_dma_tag:
	bus_dma_tag_destroy(ring->dma_tag);
done:
	kfree(ring);
	return err;
}

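/*
 * Tear down a TX ring created by mlx4_en_create_tx_ring(), releasing the
 * QP, HW queue memory, DMA maps and locks, and clearing the caller's
 * ring pointer.
 */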
void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring **pring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_ring *ring = *pring;
	uint32_t x;

	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);

	buf_ring_free(ring->br, M_DEVBUF);
	if (ring->bf_enabled)
		mlx4_bf_free(mdev->dev, &ring->bf);
	mlx4_qp_remove(mdev->dev, &ring->qp);
	mlx4_qp_free(mdev->dev, &ring->qp);
	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
	mlx4_en_unmap_buffer(&ring->wqres.buf);
	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
	for (x = 0; x != ring->size; x++)
		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
	vfree(ring->tx_info);
	mtx_destroy(&ring->tx_lock.m);
	mtx_destroy(&ring->comp_lock.m);
	bus_dma_tag_destroy(ring->dma_tag);
	kfree(ring);
	*pring = NULL;
}

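/*
 * Bring a TX ring to the ready state: reset producer/consumer indices,
 * fill in the QP context for the given completion queue and priority,
 * and move the QP to ready-to-send.
 */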
int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
			     struct mlx4_en_tx_ring *ring,
			     int cq, int user_prio)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	int err;

	ring->cqn = cq;
	ring->prod = 0;
	ring->cons = 0xffffffff;
	ring->last_nr_txbb = 1;
	ring->poll_cnt = 0;
	ring->blocked = 0;
	memset(ring->buf, 0, ring->buf_size);

	ring->qp_state = MLX4_QP_STATE_RST;
	ring->doorbell_qpn = ring->qp.qpn << 8;

	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
				ring->cqn, user_prio, &ring->context);
	if (ring->bf_enabled)
		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);

	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
			       &ring->qp, &ring->qp_state);
	return err;
}

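/* Move the TX QP back to the reset state so no further sends complete. */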
void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
				struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;

	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
}

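/*
 * Copy "len" bytes of packet header inline into the WQE, skipping the
 * first 4 bytes of the segment (the LSO mss/header-size word written by
 * the caller), and return the data segment pointer advanced past the
 * inlined header.
 */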
static volatile struct mlx4_wqe_data_seg *
mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg,
    struct mbuf *mb, int len, __be32 owner_bit)
{
	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);

	/* copy data into place */
	m_copydata(mb, 0, len, inl + 4);
	dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
	return (dseg);
}

static void
mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg,
    int len, __be32 owner_bit)
{
	/*
	 * Nothing to do here: for LSO the inlined header length is
	 * carried in the LSO segment's mss_hdr_size word, so no
	 * separate inline byte count needs to be written.
	 */
}

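/*
 * Stamp the TXBBs of a reclaimed descriptor with STAMP_VAL and the
 * expected owner bit so that stale ring entries are not mistaken for
 * valid work requests after the ring wraps.
 */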
static void
mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
    struct mlx4_en_tx_ring *ring, u32 index, u8 owner)
{
	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
	struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *)
	    (ring->buf + (index * TXBB_SIZE));
	volatile __be32 *ptr = (__be32 *)tx_desc;
	const __be32 stamp = cpu_to_be32(STAMP_VAL |
	    ((u32)owner << STAMP_SHIFT));
	u32 i;

	/* Stamp the freed descriptor */
	for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
		*ptr = stamp;
		ptr += STAMP_DWORDS;
	}
}

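/*
 * Release the mbuf and DMA mapping associated with one TX descriptor and
 * return the number of TXBBs the descriptor occupied.
 */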
static u32
mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
    struct mlx4_en_tx_ring *ring, u32 index)
{
	struct mlx4_en_tx_info *tx_info;
	struct mbuf *mb;

	tx_info = &ring->tx_info[index];
	mb = tx_info->mb;

	if (mb == NULL)
		goto done;

	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
	    BUS_DMASYNC_POSTWRITE);
	bus_dmamap_unload(ring->dma_tag, tx_info->dma_map);

	m_freem(mb);
done:
	return (tx_info->nr_txbb);
}

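/*
 * Free all mbufs still pending on a TX ring, typically when the port is
 * being brought down. Returns the number of descriptors reclaimed.
 */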
int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	int cnt = 0;

	/* Skip last polled descriptor */
	ring->cons += ring->last_nr_txbb;
	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
		 ring->cons, ring->prod);

	if ((u32) (ring->prod - ring->cons) > ring->size) {
		en_warn(priv, "Tx consumer passed producer!\n");
		return 0;
	}

	while (ring->cons != ring->prod) {
		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
		    ring->cons & ring->size_mask);
		ring->cons += ring->last_nr_txbb;
		cnt++;
	}

	if (cnt)
		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);

	return cnt;
}

static bool
mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring)
{
	int wqs;

	wqs = ring->size - (ring->prod - ring->cons);
	return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS)));
}

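/*
 * Reap TX completions: walk the completion queue, free the mbufs and DMA
 * mappings of finished WQEs, stamp the reclaimed descriptors, update the
 * CQ consumer index and, if the ring had been stopped for lack of space,
 * clear the OACTIVE flag so transmission can resume.
 */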
static int mlx4_en_process_tx_cq(struct net_device *dev,
				 struct mlx4_en_cq *cq)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_cq *mcq = &cq->mcq;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	struct mlx4_cqe *cqe;
	u16 index;
	u16 new_index, ring_index, stamp_index;
	u32 txbbs_skipped = 0;
	u32 txbbs_stamp = 0;
	u32 cons_index = mcq->cons_index;
	int size = cq->size;
	u32 size_mask = ring->size_mask;
	struct mlx4_cqe *buf = cq->buf;
	int factor = priv->cqe_factor;

	if (!priv->port_up)
		return 0;

	index = cons_index & size_mask;
	cqe = &buf[(index << factor) + factor];
	ring_index = ring->cons & size_mask;
	stamp_index = ring_index;

	/* Process all completed CQEs */
	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
			cons_index & size)) {
		/*
		 * make sure we read the CQE after we read the
		 * ownership bit
		 */
		rmb();

		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			en_err(priv, "CQE completed in error - vendor syndrome: 0x%x syndrome: 0x%x\n",
			       ((struct mlx4_err_cqe *)cqe)->
				       vendor_err_syndrome,
			       ((struct mlx4_err_cqe *)cqe)->syndrome);
		}

		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;

		do {
			txbbs_skipped += ring->last_nr_txbb;
			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
			/* free next descriptor */
			ring->last_nr_txbb = mlx4_en_free_tx_desc(
			    priv, ring, ring_index);
			mlx4_en_stamp_wqe(priv, ring, stamp_index,
					  !!((ring->cons + txbbs_stamp) &
						ring->size));
			stamp_index = ring_index;
			txbbs_stamp = txbbs_skipped;
		} while (ring_index != new_index);

		++cons_index;
		index = cons_index & size_mask;
		cqe = &buf[(index << factor) + factor];
	}

	/*
	 * To prevent CQ overflow we first update CQ consumer and only then
	 * the ring consumer.
	 */
	mcq->cons_index = cons_index;
	mlx4_cq_set_ci(mcq);
	wmb();
	ring->cons += txbbs_skipped;

	/* Wakeup Tx queue if it was stopped and ring is not full */
	if (unlikely(ring->blocked) && !mlx4_en_tx_ring_is_full(ring)) {
		ring->blocked = 0;
		if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
			atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
		ring->wake_queue++;
		priv->port_stats.wake_queue++;
	}
	return (0);
}

/* TX completion interrupt: reap completions and re-arm the poll timer. */
void mlx4_en_tx_irq(struct mlx4_cq *mcq)
{
	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];

	if (priv->port_up == 0 || !spin_trylock(&ring->comp_lock))
		return;
	mlx4_en_process_tx_cq(cq->dev, cq);
	mod_timer(&cq->timer, jiffies + 1);
	spin_unlock(&ring->comp_lock);
}

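/*
 * Timer callback that polls the TX completion queue and reschedules
 * itself while packets are still in flight, guaranteeing completion
 * processing even when no further transmits occur.
 */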
void mlx4_en_poll_tx_cq(unsigned long data)
{
	struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
	u32 inflight;

	INC_PERF_COUNTER(priv->pstats.tx_poll);

	if (priv->port_up == 0)
		return;
	if (!spin_trylock(&ring->comp_lock)) {
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
		return;
	}
	mlx4_en_process_tx_cq(cq->dev, cq);
	inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb);

	/*
	 * If there are still packets in flight and the timer has not already
	 * been scheduled by the Tx routine then schedule it here to guarantee
	 * completion processing of these packets.
	 */
	if (inflight && priv->port_up)
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);

	spin_unlock(&ring->comp_lock);
}

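/*
 * Opportunistically reap completions from the transmit path: make sure a
 * poll timer is pending, and process the CQ every MLX4_EN_TX_POLL_MODER
 * packets when the completion lock can be taken without blocking.
 */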
static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
{
	struct mlx4_en_cq *cq = priv->tx_cq[tx_ind];
	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];

	if (priv->port_up == 0)
		return;

	/*
	 * If we don't have a pending timer, set one up to catch our recent
	 * post in case the interface becomes idle.
	 */
	if (!timer_pending(&cq->timer))
		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);

	/* Poll the CQ every MLX4_EN_TX_POLL_MODER packets */
	if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
		if (spin_trylock(&ring->comp_lock)) {
			mlx4_en_process_tx_cq(priv->dev, cq);
			spin_unlock(&ring->comp_lock);
		}
}

/* Compute how many bytes of the packet should be copied inline into the WQE. */
static u16
mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb)
{
	u16 retval;

	/* only copy from first fragment, if possible */
	retval = MIN(ring->inline_thold, mb->m_len);

	/* check for too little data */
	if (unlikely(retval < MIN_PKT_LEN))
		retval = MIN(ring->inline_thold, mb->m_pkthdr.len);
	return (retval);
}

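/*
 * Return the combined Ethernet/IP(v6)/TCP header length of a TSO packet,
 * or 0 if the headers are not contiguous in the first mbuf or the packet
 * is not TCP.
 */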
static int
mlx4_en_get_header_size(struct mbuf *mb)
{
	struct ether_vlan_header *eh;
	struct tcphdr *th;
	struct ip *ip;
	int ip_hlen, tcp_hlen;
	struct ip6_hdr *ip6;
	uint16_t eth_type;
	int eth_hdr_len;

	eh = mtod(mb, struct ether_vlan_header *);
	if (mb->m_len < ETHER_HDR_LEN)
		return (0);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		eth_type = ntohs(eh->evl_proto);
		eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		eth_type = ntohs(eh->evl_encap_proto);
		eth_hdr_len = ETHER_HDR_LEN;
	}
	if (mb->m_len < eth_hdr_len)
		return (0);
	switch (eth_type) {
	case ETHERTYPE_IP:
		ip = (struct ip *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip))
			return (0);
		if (ip->ip_p != IPPROTO_TCP)
			return (0);
		ip_hlen = ip->ip_hl << 2;
		eth_hdr_len += ip_hlen;
		break;
	case ETHERTYPE_IPV6:
		ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len);
		if (mb->m_len < eth_hdr_len + sizeof(*ip6))
			return (0);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (0);
		eth_hdr_len += sizeof(*ip6);
		break;
	default:
		return (0);
	}
	if (mb->m_len < eth_hdr_len + sizeof(*th))
		return (0);
	th = (struct tcphdr *)(mb->m_data + eth_hdr_len);
	tcp_hlen = th->th_off << 2;
	eth_hdr_len += tcp_hlen;
	if (mb->m_len < eth_hdr_len)
		return (0);
	return (eth_hdr_len);
}

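/*
 * Copy up to "len" bytes of packet data inline into the WQE. Very short
 * packets are zero-padded to MIN_PKT_LEN, and payloads that exceed the
 * first inline chunk are split across two inline segments. Returns the
 * data segment pointer advanced past the inlined bytes.
 */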
static volatile struct mlx4_wqe_data_seg *
mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg,
    struct mbuf *mb, int len, __be32 owner_bit)
{
	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;

	if (unlikely(len < MIN_PKT_LEN)) {
		m_copydata(mb, 0, len, inl + 4);
		memset(inl + 4 + len, 0, MIN_PKT_LEN - len);
		dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT);
	} else if (len <= spc) {
		m_copydata(mb, 0, len, inl + 4);
		dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
	} else {
		m_copydata(mb, 0, spc, inl + 4);
		m_copydata(mb, spc, len - spc, inl + 8 + spc);
		dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT);
	}
	return (dseg);
}

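/*
 * Write the inline segment byte counts (bit 31 marks an inline segment)
 * for the data previously copied by mlx4_en_store_inline_data(). When
 * the data spans two inline chunks, the second chunk's count is written
 * before the first so the hardware never sees a partially built segment.
 */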
static void
mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg,
    int len, __be32 owner_bit)
{
	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;

	if (unlikely(len < MIN_PKT_LEN)) {
		*(volatile uint32_t *)inl =
		    SET_BYTE_COUNT((1 << 31) | MIN_PKT_LEN);
	} else if (len <= spc) {
		*(volatile uint32_t *)inl =
		    SET_BYTE_COUNT((1 << 31) | len);
	} else {
		*(volatile uint32_t *)(inl + 4 + spc) =
		    SET_BYTE_COUNT((1 << 31) | (len - spc));
		wmb();
		*(volatile uint32_t *)inl =
		    SET_BYTE_COUNT((1 << 31) | spc);
	}
}

static unsigned long hashrandom;
static void hashrandom_init(void *arg)
{
	hashrandom = random();
}
SYSINIT(hashrandom_init, SI_SUB_KLD, SI_ORDER_SECOND, &hashrandom_init, NULL);

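/*
 * Select a TX queue for an outgoing mbuf by hashing its L3/L4 headers,
 * folding in the VLAN priority when multiple user priorities are
 * configured.
 */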
u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	u32 rings_p_up = priv->num_tx_rings_p_up;
	u32 up = 0;
	u32 queue_index;

#if (MLX4_EN_NUM_UP > 1)
	/* Obtain VLAN information if present */
	if (mb->m_flags & M_VLANTAG) {
		u32 vlan_tag = mb->m_pkthdr.ether_vtag;
		up = (vlan_tag >> 13) % MLX4_EN_NUM_UP;
	}
#endif
	queue_index = mlx4_en_hashmbuf(MLX4_F_HASHL3 | MLX4_F_HASHL4, mb, hashrandom);

	return ((queue_index % rings_p_up) + (up * rings_p_up));
}

/* Copy a WQE into the BlueFlame register using 64-bit writes. */
static void mlx4_bf_copy(void __iomem *dst, volatile unsigned long *src, unsigned bytecnt)
{
	__iowrite64_copy(dst, __DEVOLATILE(void *, src), bytecnt / 8);
}

/* Pack a MAC address into a 64-bit integer, most significant byte first. */
static u64 mlx4_en_mac_to_u64(u8 *addr)
{
	u64 mac = 0;
	int i;

	for (i = 0; i < ETHER_ADDR_LEN; i++) {
		mac <<= 8;
		mac |= addr[i];
	}
	return mac;
}

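/*
 * Build and post one send WQE for the given mbuf: inline the headers (or
 * the LSO headers for TSO packets), DMA-map the remaining data, pad the
 * WQE when it would otherwise wrap past the end of the ring, and ring
 * the doorbell, using BlueFlame when the WQE is small enough. Returns 0
 * on success, ENOBUFS when the ring is full, or another errno after
 * freeing the mbuf and clearing *mbp.
 */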
static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp)
{
	enum {
		DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT,
		CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
		    MLX4_WQE_CTRL_SOLICITED),
	};
	bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS];
	volatile struct mlx4_wqe_data_seg *dseg;
	volatile struct mlx4_wqe_data_seg *dseg_inline;
	volatile struct mlx4_en_tx_desc *tx_desc;
	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];
	struct ifnet *ifp = priv->dev;
	struct mlx4_en_tx_info *tx_info;
	struct mbuf *mb = *mbp;
	struct mbuf *m;
	__be32 owner_bit;
	int nr_segs;
	int pad;
	int err;
	u32 bf_size;
	u32 bf_prod;
	u32 opcode;
	u16 index;
	u16 ds_cnt;
	u16 ihs;

	if (unlikely(!priv->port_up)) {
		err = EINVAL;
		goto tx_drop;
	}

	/* check if TX ring is full */
	if (unlikely(mlx4_en_tx_ring_is_full(ring))) {
		/* every full native Tx ring stops queue */
		if (ring->blocked == 0)
			atomic_add_int(&priv->blocked, 1);
		/* Set HW-queue-is-full flag */
		atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
		ring->blocked = 1;
		priv->port_stats.queue_stopped++;
		ring->queue_stopped++;

		/* Use interrupts to find out when queue opened */
		mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]);
		return (ENOBUFS);
	}

	/* sanity check we are not wrapping around */
	KASSERT(((~ring->prod) & ring->size_mask) >=
	    (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring"));

	/* Track current inflight packets for performance analysis */
	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
			 (u32) (ring->prod - ring->cons - 1));

	/* Track current mbuf packet header length */
	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);

	/* Grab an index and try to transmit packet */
	owner_bit = (ring->prod & ring->size) ?
		cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0;
	index = ring->prod & ring->size_mask;
	tx_desc = (volatile struct mlx4_en_tx_desc *)
	    (ring->buf + index * TXBB_SIZE);
	tx_info = &ring->tx_info[index];
	dseg = &tx_desc->data;

	/* send a copy of the frame to the BPF listener, if any */
	if (ifp != NULL && ifp->if_bpf != NULL)
		ETHER_BPF_MTAP(ifp, mb);

	/* get default flags */
	tx_desc->ctrl.srcrb_flags = CTRL_FLAGS;

	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);

	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);

	/* do statistics */
	if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) {
		priv->port_stats.tx_chksum_offload++;
		ring->tx_csum++;
	}

	/* check for VLAN tag */
	if (mb->m_flags & M_VLANTAG) {
		tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag);
		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN;
	} else {
		tx_desc->ctrl.vlan_tag = 0;
		tx_desc->ctrl.ins_vlan = 0;
	}

	/* clear immediate field */
	tx_desc->ctrl.imm = 0;

	/* Handle LSO (TSO) packets */
	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
		u32 payload_len;
		u32 mss = mb->m_pkthdr.tso_segsz;
		u32 num_pkts;

		opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) |
		    owner_bit;
		ihs = mlx4_en_get_header_size(mb);
		if (unlikely(ihs > MAX_INLINE)) {
			ring->oversized_packets++;
			err = EINVAL;
			goto tx_drop;
		}
		tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs);
		payload_len = mb->m_pkthdr.len - ihs;
		if (unlikely(payload_len == 0))
			num_pkts = 1;
		else
			num_pkts = DIV_ROUND_UP(payload_len, mss);
		ring->bytes += payload_len + (num_pkts * ihs);
		ring->packets += num_pkts;
		priv->port_stats.tso_packets++;
		/* store pointer to inline header */
		dseg_inline = dseg;
		/* copy data inline */
		dseg = mlx4_en_store_inline_lso_data(dseg,
		    mb, ihs, owner_bit);
	} else {
		opcode = cpu_to_be32(MLX4_OPCODE_SEND) |
		    owner_bit;
		ihs = mlx4_en_get_inline_hdr_size(ring, mb);
		ring->bytes += max_t(unsigned int,
		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
		ring->packets++;
		/* store pointer to inline header */
		dseg_inline = dseg;
		/* copy data inline */
		dseg = mlx4_en_store_inline_data(dseg,
		    mb, ihs, owner_bit);
	}
	m_adj(mb, ihs);

	/* trim off empty mbufs */
	while (mb->m_len == 0) {
		mb = m_free(mb);
		/* check if all data has been inlined */
		if (mb == NULL) {
			nr_segs = 0;
			goto skip_dma;
		}
	}

	err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
	    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
	if (unlikely(err == EFBIG)) {
		/* Too many mbuf fragments */
		m = m_defrag(mb, M_NOWAIT);
		if (m == NULL) {
			ring->oversized_packets++;
			goto tx_drop;
		}
		mb = m;
		/* Try again */
		err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
		    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
	}
	/* catch errors */
	if (unlikely(err != 0)) {
		ring->oversized_packets++;
		goto tx_drop;
	}
	/* make sure all mbuf data is written to RAM */
	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
	    BUS_DMASYNC_PREWRITE);

skip_dma:
	/* compute number of DS needed */
	ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs;

	/*
	 * Check if the next request can wrap around and fill the end
	 * of the current request with zero immediate data:
	 */
	pad = DIV_ROUND_UP(ds_cnt, DS_FACT);
	pad = (~(ring->prod + pad)) & ring->size_mask;

	if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) {
		/*
		 * Compute the least number of DS blocks we need to
		 * pad in order to achieve a TX ring wraparound:
		 */
		pad = (DS_FACT * (pad + 1));
	} else {
		/*
		 * The hardware will automatically jump to the next
		 * TXBB. No need for padding.
		 */
		pad = 0;
	}

	/* compute total number of DS blocks */
	ds_cnt += pad;

	/*
	 * When modifying this code, please ensure that the following
	 * computation is always less than or equal to 0x3F:
	 *
	 * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) +
	 * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT)
	 *
	 * Else the "ds_cnt" variable can become too big.
	 */
	tx_desc->ctrl.fence_size = (ds_cnt & 0x3f);

	/* store pointer to mbuf */
	tx_info->mb = mb;
	tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT);
	bf_size = ds_cnt * DS_SIZE_ALIGNMENT;
	bf_prod = ring->prod;

	/* compute end of "dseg" array */
	dseg += nr_segs + pad;

	/* pad using zero immediate dseg */
	while (pad--) {
		dseg--;
		dseg->addr = 0;
		dseg->lkey = 0;
		wmb();
		dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
	}

	/* fill segment list */
	while (nr_segs--) {
		if (unlikely(segs[nr_segs].ds_len == 0)) {
			dseg--;
			dseg->addr = 0;
			dseg->lkey = 0;
			wmb();
			dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
		} else {
			dseg--;
			dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr);
			dseg->lkey = cpu_to_be32(priv->mdev->mr.key);
			wmb();
			dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len);
		}
	}

	wmb();

	/* write owner bits in reverse order */
	if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO))
		mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit);
	else
		mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit);

	if (unlikely(priv->validate_loopback)) {
		/* Copy dst mac address to wqe */
		struct ether_header *ethh;
		u64 mac;
		u32 mac_l, mac_h;

		ethh = mtod(mb, struct ether_header *);
		mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
		if (mac) {
			mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
			mac_l = (u32) (mac & 0xffffffff);
			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
		}
	}

	/* update producer counter */
	ring->prod += tx_info->nr_txbb;

	if (ring->bf_enabled && bf_size <= MAX_BF &&
	    (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) {

		/* store doorbell number */
		*(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);

		/* or in producer number for this WQE */
		opcode |= cpu_to_be32((bf_prod & 0xffff) << 8);

		/*
		 * Ensure the new descriptor hits memory before
		 * setting ownership of this descriptor to HW:
		 */
		wmb();
		tx_desc->ctrl.owner_opcode = opcode;
		wmb();
		mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset,
		     (volatile unsigned long *) &tx_desc->ctrl, bf_size);
		wmb();
		ring->bf.offset ^= ring->bf.buf_size;
	} else {
		/*
		 * Ensure the new descriptor hits memory before
		 * setting ownership of this descriptor to HW:
		 */
		wmb();
		tx_desc->ctrl.owner_opcode = opcode;
		wmb();
		writel(cpu_to_be32(ring->doorbell_qpn),
		    ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL);
	}

	return (0);

tx_drop:
	*mbp = NULL;
	m_freem(mb);
	return (err);
}

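/*
 * Drain the ring's buf_ring while holding the TX lock: enqueue the new
 * mbuf (if any) and transmit buffered packets until the hardware ring
 * fills up or the interface stops running.
 */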
static int
mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mbuf *next;
	int enqueued, err = 0;

	ring = priv->tx_ring[tx_ind];
	if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING || priv->port_up == 0) {
		if (m != NULL)
			err = drbr_enqueue(dev, ring->br, m);
		return (err);
	}

	enqueued = 0;
	if (m != NULL)
		/*
		 * If we can't insert mbuf into drbr, try to xmit anyway.
		 * We keep the error we got so we could return that after xmit.
		 */
		err = drbr_enqueue(dev, ring->br, m);

	/* Process the queue */
	while ((next = drbr_peek(dev, ring->br)) != NULL) {
		if (mlx4_en_xmit(priv, tx_ind, &next) != 0) {
			if (next == NULL) {
				drbr_advance(dev, ring->br);
			} else {
				drbr_putback(dev, ring->br, next);
			}
			break;
		}
		drbr_advance(dev, ring->br);
		enqueued++;
		if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
			break;
	}

	if (enqueued > 0)
		ring->watchdog_time = ticks;

	return (err);
}

/*
 * Taskqueue handler: reap completions and flush any mbufs buffered in
 * the drbr for this ring.
 */
void
mlx4_en_tx_que(void *context, int pending)
{
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_priv *priv;
	struct net_device *dev;
	struct mlx4_en_cq *cq;
	int tx_ind;

	cq = context;
	dev = cq->dev;
	priv = dev->if_softc;
	tx_ind = cq->ring;
	ring = priv->tx_ring[tx_ind];

	if (priv->port_up != 0 &&
	    (dev->if_drv_flags & IFF_DRV_RUNNING) != 0) {
		mlx4_en_xmit_poll(priv, tx_ind);
		spin_lock(&ring->tx_lock);
		if (!drbr_empty(dev, ring->br))
			mlx4_en_transmit_locked(dev, tx_ind, NULL);
		spin_unlock(&ring->tx_lock);
	}
}

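/*
 * if_transmit entry point: pick a TX ring from the mbuf's flow id (or by
 * hashing the headers), transmit directly if the ring lock is available,
 * otherwise queue the mbuf and defer the work to the ring's taskqueue.
 */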
int
mlx4_en_transmit(struct ifnet *dev, struct mbuf *m)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mlx4_en_cq *cq;
	int i, err = 0;

	if (priv->port_up == 0) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Compute which queue to use */
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
		i = (m->m_pkthdr.flowid % 128) % priv->tx_ring_num;
	} else {
		i = mlx4_en_select_queue(dev, m);
	}

	ring = priv->tx_ring[i];
	if (spin_trylock(&ring->tx_lock)) {
		err = mlx4_en_transmit_locked(dev, i, m);
		spin_unlock(&ring->tx_lock);
		/* Poll CQ here */
		mlx4_en_xmit_poll(priv, i);
	} else {
		err = drbr_enqueue(dev, ring->br, m);
		cq = priv->tx_cq[i];
		taskqueue_enqueue(cq->tq, &cq->cq_task);
	}

	return (err);
}

/*
 * Flush ring buffers.
 */
void
mlx4_en_qflush(struct ifnet *dev)
{
	struct mlx4_en_priv *priv = netdev_priv(dev);
	struct mlx4_en_tx_ring *ring;
	struct mbuf *m;

	if (priv->port_up == 0)
		return;

	for (int i = 0; i < priv->tx_ring_num; i++) {
		ring = priv->tx_ring[i];
		spin_lock(&ring->tx_lock);
		while ((m = buf_ring_dequeue_sc(ring->br)) != NULL)
			m_freem(m);
		spin_unlock(&ring->tx_lock);
	}
	if_qflush(dev);
}