1/* SLP - Basic Block Vectorization
2   Copyright (C) 2007-2015 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com>
4   and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "dumpfile.h"
26#include "tm.h"
27#include "hash-set.h"
28#include "machmode.h"
29#include "vec.h"
30#include "double-int.h"
31#include "input.h"
32#include "alias.h"
33#include "symtab.h"
34#include "wide-int.h"
35#include "inchash.h"
36#include "tree.h"
37#include "fold-const.h"
38#include "stor-layout.h"
39#include "target.h"
40#include "predict.h"
41#include "hard-reg-set.h"
42#include "function.h"
43#include "basic-block.h"
44#include "gimple-pretty-print.h"
45#include "tree-ssa-alias.h"
46#include "internal-fn.h"
47#include "gimple-expr.h"
48#include "is-a.h"
49#include "gimple.h"
50#include "gimple-iterator.h"
51#include "gimple-ssa.h"
52#include "tree-phinodes.h"
53#include "ssa-iterators.h"
54#include "stringpool.h"
55#include "tree-ssanames.h"
56#include "tree-pass.h"
57#include "cfgloop.h"
58#include "hashtab.h"
59#include "rtl.h"
60#include "flags.h"
61#include "statistics.h"
62#include "real.h"
63#include "fixed-value.h"
64#include "insn-config.h"
65#include "expmed.h"
66#include "dojump.h"
67#include "explow.h"
68#include "calls.h"
69#include "emit-rtl.h"
70#include "varasm.h"
71#include "stmt.h"
72#include "expr.h"
73#include "recog.h"		/* FIXME: for insn_data */
74#include "insn-codes.h"
75#include "optabs.h"
76#include "tree-vectorizer.h"
77#include "langhooks.h"
78#include "gimple-walk.h"
79
80/* Extract the location of the basic block in the source code.
81   Return the basic block location if succeed and NULL if not.  */
82
83source_location
84find_bb_location (basic_block bb)
85{
86  gimple stmt = NULL;
87  gimple_stmt_iterator si;
88
89  if (!bb)
90    return UNKNOWN_LOCATION;
91
92  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
93    {
94      stmt = gsi_stmt (si);
95      if (gimple_location (stmt) != UNKNOWN_LOCATION)
96        return gimple_location (stmt);
97    }
98
99  return UNKNOWN_LOCATION;
100}
101
102
103/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
104
105static void
106vect_free_slp_tree (slp_tree node)
107{
108  int i;
109  slp_tree child;
110
111  if (!node)
112    return;
113
114  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
115    vect_free_slp_tree (child);
116
117  SLP_TREE_CHILDREN (node).release ();
118  SLP_TREE_SCALAR_STMTS (node).release ();
119  SLP_TREE_VEC_STMTS (node).release ();
120  SLP_TREE_LOAD_PERMUTATION (node).release ();
121
122  free (node);
123}
124
125
126/* Free the memory allocated for the SLP instance.  */
127
128void
129vect_free_slp_instance (slp_instance instance)
130{
131  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
132  SLP_INSTANCE_LOADS (instance).release ();
133  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
134  free (instance);
135}
136
137
138/* Create an SLP node for SCALAR_STMTS.  */
139
140static slp_tree
141vect_create_new_slp_node (vec<gimple> scalar_stmts)
142{
143  slp_tree node;
144  gimple stmt = scalar_stmts[0];
145  unsigned int nops;
146
147  if (is_gimple_call (stmt))
148    nops = gimple_call_num_args (stmt);
149  else if (is_gimple_assign (stmt))
150    {
151      nops = gimple_num_ops (stmt) - 1;
152      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
153	nops++;
154    }
155  else
156    return NULL;
157
158  node = XNEW (struct _slp_tree);
159  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
160  SLP_TREE_VEC_STMTS (node).create (0);
161  SLP_TREE_CHILDREN (node).create (nops);
162  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
163
164  return node;
165}
166
167
168/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
169   operand.  */
170static vec<slp_oprnd_info>
171vect_create_oprnd_info (int nops, int group_size)
172{
173  int i;
174  slp_oprnd_info oprnd_info;
175  vec<slp_oprnd_info> oprnds_info;
176
177  oprnds_info.create (nops);
178  for (i = 0; i < nops; i++)
179    {
180      oprnd_info = XNEW (struct _slp_oprnd_info);
181      oprnd_info->def_stmts.create (group_size);
182      oprnd_info->first_dt = vect_uninitialized_def;
183      oprnd_info->first_op_type = NULL_TREE;
184      oprnd_info->first_pattern = false;
185      oprnds_info.quick_push (oprnd_info);
186    }
187
188  return oprnds_info;
189}
190
191
192/* Free operands info.  */
193
194static void
195vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
196{
197  int i;
198  slp_oprnd_info oprnd_info;
199
200  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
201    {
202      oprnd_info->def_stmts.release ();
203      XDELETE (oprnd_info);
204    }
205
206  oprnds_info.release ();
207}
208
209
210/* Find the place of the data-ref in STMT in the interleaving chain that starts
211   from FIRST_STMT.  Return -1 if the data-ref is not a part of the chain.  */
212
213static int
214vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
215{
216  gimple next_stmt = first_stmt;
217  int result = 0;
218
219  if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
220    return -1;
221
222  do
223    {
224      if (next_stmt == stmt)
225	return result;
226      result++;
227      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
228    }
229  while (next_stmt);
230
231  return -1;
232}
233
234
235/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
236   they are of a valid type and that they match the defs of the first stmt of
237   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
238   return -1, if the error could be corrected by swapping operands of the
239   operation return 1, if everything is ok return 0.  */
240
241static int
242vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
243                             gimple stmt, bool first,
244                             vec<slp_oprnd_info> *oprnds_info)
245{
246  tree oprnd;
247  unsigned int i, number_of_oprnds;
248  tree def;
249  gimple def_stmt;
250  enum vect_def_type dt = vect_uninitialized_def;
251  struct loop *loop = NULL;
252  bool pattern = false;
253  slp_oprnd_info oprnd_info;
254  int first_op_idx = 1;
255  bool commutative = false;
256  bool first_op_cond = false;
257
258  if (loop_vinfo)
259    loop = LOOP_VINFO_LOOP (loop_vinfo);
260
261  if (is_gimple_call (stmt))
262    {
263      number_of_oprnds = gimple_call_num_args (stmt);
264      first_op_idx = 3;
265    }
266  else if (is_gimple_assign (stmt))
267    {
268      enum tree_code code = gimple_assign_rhs_code (stmt);
269      number_of_oprnds = gimple_num_ops (stmt) - 1;
270      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
271	{
272	  first_op_cond = true;
273	  commutative = true;
274	  number_of_oprnds++;
275	}
276      else
277	commutative = commutative_tree_code (code);
278    }
279  else
280    return -1;
281
282  bool swapped = false;
283  for (i = 0; i < number_of_oprnds; i++)
284    {
285again:
286      if (first_op_cond)
287	{
288	  if (i == 0 || i == 1)
289	    oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
290				  swapped ? !i : i);
291	  else
292	    oprnd = gimple_op (stmt, first_op_idx + i - 1);
293	}
294      else
295        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
296
297      oprnd_info = (*oprnds_info)[i];
298
299      if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
300			       &def, &dt)
301	  || (!def_stmt && dt != vect_constant_def))
302	{
303	  if (dump_enabled_p ())
304	    {
305	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
306			       "Build SLP failed: can't find def for ");
307	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
308              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
309	    }
310
311	  return -1;
312	}
313
314      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
315         from the pattern.  Check that all the stmts of the node are in the
316         pattern.  */
317      if (def_stmt && gimple_bb (def_stmt)
318          && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
319	      || (!loop && gimple_bb (def_stmt) == BB_VINFO_BB (bb_vinfo)
320		  && gimple_code (def_stmt) != GIMPLE_PHI))
321          && vinfo_for_stmt (def_stmt)
322          && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (def_stmt))
323	  && !STMT_VINFO_RELEVANT (vinfo_for_stmt (def_stmt))
324	  && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
325        {
326          pattern = true;
327          if (!first && !oprnd_info->first_pattern)
328	    {
329	      if (i == 0
330		  && !swapped
331		  && commutative)
332		{
333		  swapped = true;
334		  goto again;
335		}
336
337	      if (dump_enabled_p ())
338		{
339		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340				   "Build SLP failed: some of the stmts"
341				   " are in a pattern, and others are not ");
342		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
343                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344		}
345
346	      return 1;
347            }
348
349          def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
350          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
351
352          if (dt == vect_unknown_def_type)
353            {
354              if (dump_enabled_p ())
355                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356				 "Unsupported pattern.\n");
357              return -1;
358            }
359
360          switch (gimple_code (def_stmt))
361            {
362              case GIMPLE_PHI:
363                def = gimple_phi_result (def_stmt);
364                break;
365
366              case GIMPLE_ASSIGN:
367                def = gimple_assign_lhs (def_stmt);
368                break;
369
370              default:
371                if (dump_enabled_p ())
372                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
373				   "unsupported defining stmt:\n");
374                return -1;
375            }
376        }
377
378      if (first)
379	{
380	  oprnd_info->first_dt = dt;
381	  oprnd_info->first_pattern = pattern;
382	  oprnd_info->first_op_type = TREE_TYPE (oprnd);
383	}
384      else
385	{
386	  /* Not first stmt of the group, check that the def-stmt/s match
387	     the def-stmt/s of the first stmt.  Allow different definition
388	     types for reduction chains: the first stmt must be a
389	     vect_reduction_def (a phi node), and the rest
390	     vect_internal_def.  */
391	  if (((oprnd_info->first_dt != dt
392                && !(oprnd_info->first_dt == vect_reduction_def
393                     && dt == vect_internal_def)
394		&& !((oprnd_info->first_dt == vect_external_def
395		      || oprnd_info->first_dt == vect_constant_def)
396		     && (dt == vect_external_def
397			 || dt == vect_constant_def)))
398               || !types_compatible_p (oprnd_info->first_op_type,
399				       TREE_TYPE (oprnd))))
400	    {
401	      /* Try swapping operands if we got a mismatch.  */
402	      if (i == 0
403		  && !swapped
404		  && commutative)
405		{
406		  swapped = true;
407		  goto again;
408		}
409
410	      if (dump_enabled_p ())
411		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412				 "Build SLP failed: different types\n");
413
414	      return 1;
415	    }
416	}
417
418      /* Check the types of the definitions.  */
419      switch (dt)
420	{
421	case vect_constant_def:
422	case vect_external_def:
423        case vect_reduction_def:
424	  break;
425
426	case vect_internal_def:
427	  oprnd_info->def_stmts.quick_push (def_stmt);
428	  break;
429
430	default:
431	  /* FORNOW: Not supported.  */
432	  if (dump_enabled_p ())
433	    {
434	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435			       "Build SLP failed: illegal type of def ");
436	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def);
437              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
438	    }
439
440	  return -1;
441	}
442    }
443
444  /* Swap operands.  */
445  if (swapped)
446    {
447      if (first_op_cond)
448	{
449	  tree cond = gimple_assign_rhs1 (stmt);
450	  swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
451			     &TREE_OPERAND (cond, 1));
452	  TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
453	}
454      else
455	swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
456			   gimple_assign_rhs2_ptr (stmt));
457    }
458
459  return 0;
460}
461
462
463/* Verify if the scalar stmts STMTS are isomorphic, require data
464   permutation or are of unsupported types of operation.  Return
465   true if they are, otherwise return false and indicate in *MATCHES
466   which stmts are not isomorphic to the first one.  If MATCHES[0]
467   is false then this indicates the comparison could not be
468   carried out or the stmts will never be vectorized by SLP.  */
469
470static bool
471vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
472		       vec<gimple> stmts, unsigned int group_size,
473		       unsigned nops, unsigned int *max_nunits,
474		       unsigned int vectorization_factor, bool *matches)
475{
476  unsigned int i;
477  gimple stmt = stmts[0];
478  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
479  enum tree_code first_cond_code = ERROR_MARK;
480  tree lhs;
481  bool need_same_oprnds = false;
482  tree vectype, scalar_type, first_op1 = NULL_TREE;
483  optab optab;
484  int icode;
485  machine_mode optab_op2_mode;
486  machine_mode vec_mode;
487  struct data_reference *first_dr;
488  HOST_WIDE_INT dummy;
489  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
490  tree cond;
491
492  /* For every stmt in NODE find its def stmt/s.  */
493  FOR_EACH_VEC_ELT (stmts, i, stmt)
494    {
495      matches[i] = false;
496
497      if (dump_enabled_p ())
498	{
499	  dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for ");
500	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
501          dump_printf (MSG_NOTE, "\n");
502	}
503
504      /* Fail to vectorize statements marked as unvectorizable.  */
505      if (!STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
506        {
507          if (dump_enabled_p ())
508            {
509              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
510			       "Build SLP failed: unvectorizable statement ");
511              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
512              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
513            }
514	  /* Fatal mismatch.  */
515	  matches[0] = false;
516          return false;
517        }
518
519      lhs = gimple_get_lhs (stmt);
520      if (lhs == NULL_TREE)
521	{
522	  if (dump_enabled_p ())
523	    {
524	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
525			       "Build SLP failed: not GIMPLE_ASSIGN nor "
526			       "GIMPLE_CALL ");
527	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
528              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
529	    }
530	  /* Fatal mismatch.  */
531	  matches[0] = false;
532	  return false;
533	}
534
535       if (is_gimple_assign (stmt)
536	   && gimple_assign_rhs_code (stmt) == COND_EXPR
537           && (cond = gimple_assign_rhs1 (stmt))
538           && !COMPARISON_CLASS_P (cond))
539        {
540          if (dump_enabled_p ())
541            {
542              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
543			       "Build SLP failed: condition is not "
544			       "comparison ");
545              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
546              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
547            }
548	  /* Fatal mismatch.  */
549	  matches[0] = false;
550          return false;
551        }
552
553      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, &dummy);
554      vectype = get_vectype_for_scalar_type (scalar_type);
555      if (!vectype)
556        {
557          if (dump_enabled_p ())
558            {
559              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560			       "Build SLP failed: unsupported data-type ");
561              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
562				 scalar_type);
563              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
564            }
565	  /* Fatal mismatch.  */
566	  matches[0] = false;
567          return false;
568        }
569
570      /* In case of multiple types we need to detect the smallest type.  */
571      if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
572        {
573          *max_nunits = TYPE_VECTOR_SUBPARTS (vectype);
574          if (bb_vinfo)
575            vectorization_factor = *max_nunits;
576        }
577
578      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
579	{
580	  rhs_code = CALL_EXPR;
581	  if (gimple_call_internal_p (call_stmt)
582	      || gimple_call_tail_p (call_stmt)
583	      || gimple_call_noreturn_p (call_stmt)
584	      || !gimple_call_nothrow_p (call_stmt)
585	      || gimple_call_chain (call_stmt))
586	    {
587	      if (dump_enabled_p ())
588		{
589		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590				   "Build SLP failed: unsupported call type ");
591		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
592				    call_stmt, 0);
593                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
594		}
595	      /* Fatal mismatch.  */
596	      matches[0] = false;
597	      return false;
598	    }
599	}
600      else
601	rhs_code = gimple_assign_rhs_code (stmt);
602
603      /* Check the operation.  */
604      if (i == 0)
605	{
606	  first_stmt_code = rhs_code;
607
608	  /* Shift arguments should be equal in all the packed stmts for a
609	     vector shift with scalar shift operand.  */
610	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
611	      || rhs_code == LROTATE_EXPR
612	      || rhs_code == RROTATE_EXPR)
613	    {
614	      vec_mode = TYPE_MODE (vectype);
615
616	      /* First see if we have a vector/vector shift.  */
617	      optab = optab_for_tree_code (rhs_code, vectype,
618					   optab_vector);
619
620	      if (!optab
621		  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
622		{
623		  /* No vector/vector shift, try for a vector/scalar shift.  */
624		  optab = optab_for_tree_code (rhs_code, vectype,
625					       optab_scalar);
626
627		  if (!optab)
628		    {
629		      if (dump_enabled_p ())
630			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
631					 "Build SLP failed: no optab.\n");
632		      /* Fatal mismatch.  */
633		      matches[0] = false;
634		      return false;
635		    }
636		  icode = (int) optab_handler (optab, vec_mode);
637		  if (icode == CODE_FOR_nothing)
638		    {
639		      if (dump_enabled_p ())
640			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641					 "Build SLP failed: "
642					 "op not supported by target.\n");
643		      /* Fatal mismatch.  */
644		      matches[0] = false;
645		      return false;
646		    }
647		  optab_op2_mode = insn_data[icode].operand[2].mode;
648		  if (!VECTOR_MODE_P (optab_op2_mode))
649		    {
650		      need_same_oprnds = true;
651		      first_op1 = gimple_assign_rhs2 (stmt);
652		    }
653		}
654	    }
655	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
656            {
657              need_same_oprnds = true;
658              first_op1 = gimple_assign_rhs2 (stmt);
659            }
660	}
661      else
662	{
663	  if (first_stmt_code != rhs_code
664	      && (first_stmt_code != IMAGPART_EXPR
665		  || rhs_code != REALPART_EXPR)
666	      && (first_stmt_code != REALPART_EXPR
667		  || rhs_code != IMAGPART_EXPR)
668              && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
669                   && (first_stmt_code == ARRAY_REF
670                       || first_stmt_code == BIT_FIELD_REF
671                       || first_stmt_code == INDIRECT_REF
672                       || first_stmt_code == COMPONENT_REF
673                       || first_stmt_code == MEM_REF)))
674	    {
675	      if (dump_enabled_p ())
676		{
677		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
678				   "Build SLP failed: different operation "
679				   "in stmt ");
680		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
681                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
682		}
683	      /* Mismatch.  */
684	      continue;
685	    }
686
687	  if (need_same_oprnds
688	      && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
689	    {
690	      if (dump_enabled_p ())
691		{
692		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
693				   "Build SLP failed: different shift "
694				   "arguments in ");
695		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
696                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
697		}
698	      /* Mismatch.  */
699	      continue;
700	    }
701
702	  if (rhs_code == CALL_EXPR)
703	    {
704	      gimple first_stmt = stmts[0];
705	      if (gimple_call_num_args (stmt) != nops
706		  || !operand_equal_p (gimple_call_fn (first_stmt),
707				       gimple_call_fn (stmt), 0)
708		  || gimple_call_fntype (first_stmt)
709		     != gimple_call_fntype (stmt))
710		{
711		  if (dump_enabled_p ())
712		    {
713		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714				       "Build SLP failed: different calls in ");
715		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
716					stmt, 0);
717                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
718		    }
719		  /* Mismatch.  */
720		  continue;
721		}
722	    }
723	}
724
725      /* Grouped store or load.  */
726      if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
727	{
728	  if (REFERENCE_CLASS_P (lhs))
729	    {
730	      /* Store.  */
731	      ;
732	    }
733	  else
734	    {
735	      /* Load.  */
736	      unsigned unrolling_factor
737		= least_common_multiple
738		    (*max_nunits, group_size) / group_size;
739              /* FORNOW: Check that there is no gap between the loads
740		 and no gap between the groups when we need to load
741		 multiple groups at once.
742		 ???  We should enhance this to only disallow gaps
743		 inside vectors.  */
744              if ((unrolling_factor > 1
745		   && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
746			&& GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
747		       /* If the group is split up then GROUP_GAP
748			  isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
749		       || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
750		  || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
751		      && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
752                {
753                  if (dump_enabled_p ())
754                    {
755                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
756				       "Build SLP failed: grouped "
757				       "loads have gaps ");
758                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
759					stmt, 0);
760                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
761                    }
762		  /* Fatal mismatch.  */
763		  matches[0] = false;
764                  return false;
765                }
766
767              /* Check that the size of interleaved loads group is not
768                 greater than the SLP group size.  */
769	      unsigned ncopies
770		= vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype);
771              if (loop_vinfo
772		  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
773                  && ((GROUP_SIZE (vinfo_for_stmt (stmt))
774		       - GROUP_GAP (vinfo_for_stmt (stmt)))
775		      > ncopies * group_size))
776                {
777                  if (dump_enabled_p ())
778                    {
779                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
780				       "Build SLP failed: the number "
781				       "of interleaved loads is greater than "
782				       "the SLP group size ");
783                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
784					stmt, 0);
785                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
786                    }
787		  /* Fatal mismatch.  */
788		  matches[0] = false;
789                  return false;
790                }
791
792	      old_first_load = first_load;
793              first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
794              if (prev_first_load)
795                {
796                  /* Check that there are no loads from different interleaving
797                     chains in the same node.  */
798                  if (prev_first_load != first_load)
799                    {
800                      if (dump_enabled_p ())
801                        {
802                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
803					   vect_location,
804					   "Build SLP failed: different "
805					   "interleaving chains in one node ");
806                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
807					    stmt, 0);
808                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
809                        }
810		      /* Mismatch.  */
811		      continue;
812                    }
813                }
814              else
815                prev_first_load = first_load;
816
817	      /* In some cases a group of loads is just the same load
818		 repeated N times.  Only analyze its cost once.  */
819              if (first_load == stmt && old_first_load != first_load)
820                {
821                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
822                  if (vect_supportable_dr_alignment (first_dr, false)
823                      == dr_unaligned_unsupported)
824                    {
825                      if (dump_enabled_p ())
826                        {
827                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
828					   vect_location,
829					   "Build SLP failed: unsupported "
830					   "unaligned load ");
831                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
832					    stmt, 0);
833                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
834                        }
835		      /* Fatal mismatch.  */
836		      matches[0] = false;
837                      return false;
838                    }
839                }
840           }
841        } /* Grouped access.  */
842      else
843	{
844	  if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
845	    {
846	      /* Not grouped load.  */
847	      if (dump_enabled_p ())
848		{
849		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
850				   "Build SLP failed: not grouped load ");
851		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
852                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
853		}
854
855	      /* FORNOW: Not grouped loads are not supported.  */
856	      /* Fatal mismatch.  */
857	      matches[0] = false;
858	      return false;
859	    }
860
861	  /* Not memory operation.  */
862	  if (TREE_CODE_CLASS (rhs_code) != tcc_binary
863	      && TREE_CODE_CLASS (rhs_code) != tcc_unary
864	      && rhs_code != COND_EXPR
865	      && rhs_code != CALL_EXPR)
866	    {
867	      if (dump_enabled_p ())
868		{
869		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
870				   "Build SLP failed: operation");
871		  dump_printf (MSG_MISSED_OPTIMIZATION, " unsupported ");
872		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
873                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
874		}
875	      /* Fatal mismatch.  */
876	      matches[0] = false;
877	      return false;
878	    }
879
880          if (rhs_code == COND_EXPR)
881            {
882              tree cond_expr = gimple_assign_rhs1 (stmt);
883
884	      if (i == 0)
885		first_cond_code = TREE_CODE (cond_expr);
886              else if (first_cond_code != TREE_CODE (cond_expr))
887                {
888                  if (dump_enabled_p ())
889                    {
890                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
891				       "Build SLP failed: different"
892				       " operation");
893                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
894					stmt, 0);
895                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
896                    }
897		  /* Mismatch.  */
898		  continue;
899		}
900            }
901	}
902
903      matches[i] = true;
904    }
905
906  for (i = 0; i < group_size; ++i)
907    if (!matches[i])
908      return false;
909
910  return true;
911}
912
913/* Recursively build an SLP tree starting from NODE.
914   Fail (and return a value not equal to zero) if def-stmts are not
915   isomorphic, require data permutation or are of unsupported types of
916   operation.  Otherwise, return 0.
917   The value returned is the depth in the SLP tree where a mismatch
918   was found.  */
919
920static bool
921vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
922                     slp_tree *node, unsigned int group_size,
923                     unsigned int *max_nunits,
924                     vec<slp_tree> *loads,
925                     unsigned int vectorization_factor,
926		     bool *matches, unsigned *npermutes, unsigned *tree_size,
927		     unsigned max_tree_size)
928{
929  unsigned nops, i, this_tree_size = 0;
930  gimple stmt;
931
932  matches[0] = false;
933
934  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
935  if (is_gimple_call (stmt))
936    nops = gimple_call_num_args (stmt);
937  else if (is_gimple_assign (stmt))
938    {
939      nops = gimple_num_ops (stmt) - 1;
940      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
941	nops++;
942    }
943  else
944    return false;
945
946  if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
947			      SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
948			      max_nunits, vectorization_factor, matches))
949    return false;
950
951  /* If the SLP node is a load, terminate the recursion.  */
952  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
953      && DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
954    {
955      loads->safe_push (*node);
956      return true;
957    }
958
959  /* Get at the operands, verifying they are compatible.  */
960  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
961  slp_oprnd_info oprnd_info;
962  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
963    {
964      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
965					   stmt, (i == 0), &oprnds_info))
966	{
967	case 0:
968	  break;
969	case -1:
970	  matches[0] = false;
971	  vect_free_oprnd_info (oprnds_info);
972	  return false;
973	case 1:
974	  matches[i] = false;
975	  break;
976	}
977    }
978  for (i = 0; i < group_size; ++i)
979    if (!matches[i])
980      {
981	vect_free_oprnd_info (oprnds_info);
982	return false;
983      }
984
985  stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
986
987  /* Create SLP_TREE nodes for the definition node/s.  */
988  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
989    {
990      slp_tree child;
991      unsigned old_nloads = loads->length ();
992      unsigned old_max_nunits = *max_nunits;
993
994      if (oprnd_info->first_dt != vect_internal_def)
995        continue;
996
997      if (++this_tree_size > max_tree_size)
998	{
999	  vect_free_oprnd_info (oprnds_info);
1000	  return false;
1001	}
1002
1003      child = vect_create_new_slp_node (oprnd_info->def_stmts);
1004      if (!child)
1005	{
1006	  vect_free_oprnd_info (oprnds_info);
1007	  return false;
1008	}
1009
1010      if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1011			       group_size, max_nunits, loads,
1012			       vectorization_factor, matches,
1013			       npermutes, &this_tree_size, max_tree_size))
1014	{
1015	  oprnd_info->def_stmts = vNULL;
1016	  SLP_TREE_CHILDREN (*node).quick_push (child);
1017	  continue;
1018	}
1019
1020      /* If the SLP build for operand zero failed and operand zero
1021	 and one can be commutated try that for the scalar stmts
1022	 that failed the match.  */
1023      if (i == 0
1024	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
1025	  && matches[0]
1026	  /* ???  For COND_EXPRs we can swap the comparison operands
1027	     as well as the arms under some constraints.  */
1028	  && nops == 2
1029	  && oprnds_info[1]->first_dt == vect_internal_def
1030	  && is_gimple_assign (stmt)
1031	  && commutative_tree_code (gimple_assign_rhs_code (stmt))
1032	  /* Do so only if the number of not successful permutes was nor more
1033	     than a cut-ff as re-trying the recursive match on
1034	     possibly each level of the tree would expose exponential
1035	     behavior.  */
1036	  && *npermutes < 4)
1037	{
1038	  unsigned int j;
1039	  slp_tree grandchild;
1040
1041	  /* Roll back.  */
1042	  *max_nunits = old_max_nunits;
1043	  loads->truncate (old_nloads);
1044	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1045	    vect_free_slp_tree (grandchild);
1046	  SLP_TREE_CHILDREN (child).truncate (0);
1047
1048	  /* Swap mismatched definition stmts.  */
1049	  dump_printf_loc (MSG_NOTE, vect_location,
1050			   "Re-trying with swapped operands of stmts ");
1051	  for (j = 0; j < group_size; ++j)
1052	    if (!matches[j])
1053	      {
1054		gimple tem = oprnds_info[0]->def_stmts[j];
1055		oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
1056		oprnds_info[1]->def_stmts[j] = tem;
1057		dump_printf (MSG_NOTE, "%d ", j);
1058	      }
1059	  dump_printf (MSG_NOTE, "\n");
1060	  /* And try again ... */
1061	  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
1062				   group_size, max_nunits, loads,
1063				   vectorization_factor,
1064				   matches, npermutes, &this_tree_size,
1065				   max_tree_size))
1066	    {
1067	      oprnd_info->def_stmts = vNULL;
1068	      SLP_TREE_CHILDREN (*node).quick_push (child);
1069	      continue;
1070	    }
1071
1072	  ++*npermutes;
1073	}
1074
1075      oprnd_info->def_stmts = vNULL;
1076      vect_free_slp_tree (child);
1077      vect_free_oprnd_info (oprnds_info);
1078      return false;
1079    }
1080
1081  if (tree_size)
1082    *tree_size += this_tree_size;
1083
1084  vect_free_oprnd_info (oprnds_info);
1085  return true;
1086}
1087
1088/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
1089
1090static void
1091vect_print_slp_tree (int dump_kind, slp_tree node)
1092{
1093  int i;
1094  gimple stmt;
1095  slp_tree child;
1096
1097  if (!node)
1098    return;
1099
1100  dump_printf (dump_kind, "node ");
1101  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1102    {
1103      dump_printf (dump_kind, "\n\tstmt %d ", i);
1104      dump_gimple_stmt (dump_kind, TDF_SLIM, stmt, 0);
1105    }
1106  dump_printf (dump_kind, "\n");
1107
1108  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1109    vect_print_slp_tree (dump_kind, child);
1110}
1111
1112
1113/* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
1114   If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
1115   J).  Otherwise, MARK is PURE_SLP and J is -1, which indicates that all the
1116   stmts in NODE are to be marked.  */
1117
1118static void
1119vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
1120{
1121  int i;
1122  gimple stmt;
1123  slp_tree child;
1124
1125  if (!node)
1126    return;
1127
1128  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1129    if (j < 0 || i == j)
1130      STMT_SLP_TYPE (vinfo_for_stmt (stmt)) = mark;
1131
1132  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1133    vect_mark_slp_stmts (child, mark, j);
1134}
1135
1136
1137/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
1138
1139static void
1140vect_mark_slp_stmts_relevant (slp_tree node)
1141{
1142  int i;
1143  gimple stmt;
1144  stmt_vec_info stmt_info;
1145  slp_tree child;
1146
1147  if (!node)
1148    return;
1149
1150  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1151    {
1152      stmt_info = vinfo_for_stmt (stmt);
1153      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1154                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1155      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1156    }
1157
1158  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1159    vect_mark_slp_stmts_relevant (child);
1160}
1161
1162
1163/* Rearrange the statements of NODE according to PERMUTATION.  */
1164
1165static void
1166vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1167                          vec<unsigned> permutation)
1168{
1169  gimple stmt;
1170  vec<gimple> tmp_stmts;
1171  unsigned int i;
1172  slp_tree child;
1173
1174  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1175    vect_slp_rearrange_stmts (child, group_size, permutation);
1176
1177  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1178  tmp_stmts.create (group_size);
1179  tmp_stmts.quick_grow_cleared (group_size);
1180
1181  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
1182    tmp_stmts[permutation[i]] = stmt;
1183
1184  SLP_TREE_SCALAR_STMTS (node).release ();
1185  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1186}
1187
1188
1189/* Check if the required load permutations in the SLP instance
1190   SLP_INSTN are supported.  */
1191
1192static bool
1193vect_supported_load_permutation_p (slp_instance slp_instn)
1194{
1195  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1196  unsigned int i, j, k, next;
1197  sbitmap load_index;
1198  slp_tree node;
1199  gimple stmt, load, next_load, first_load;
1200  struct data_reference *dr;
1201
1202  if (dump_enabled_p ())
1203    {
1204      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1205      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1206	if (node->load_permutation.exists ())
1207	  FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1208	    dump_printf (MSG_NOTE, "%d ", next);
1209	else
1210	  for (k = 0; k < group_size; ++k)
1211	    dump_printf (MSG_NOTE, "%d ", k);
1212      dump_printf (MSG_NOTE, "\n");
1213    }
1214
1215  /* In case of reduction every load permutation is allowed, since the order
1216     of the reduction statements is not important (as opposed to the case of
1217     grouped stores).  The only condition we need to check is that all the
1218     load nodes are of the same size and have the same permutation (and then
1219     rearrange all the nodes of the SLP instance according to this
1220     permutation).  */
1221
1222  /* Check that all the load nodes are of the same size.  */
1223  /* ???  Can't we assert this? */
1224  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1225    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1226      return false;
1227
1228  node = SLP_INSTANCE_TREE (slp_instn);
1229  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1230
1231  /* Reduction (there are no data-refs in the root).
1232     In reduction chain the order of the loads is important.  */
1233  if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
1234      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1235    {
1236      slp_tree load;
1237      unsigned int lidx;
1238
1239      /* Compare all the permutation sequences to the first one.  We know
1240         that at least one load is permuted.  */
1241      node = SLP_INSTANCE_LOADS (slp_instn)[0];
1242      if (!node->load_permutation.exists ())
1243	return false;
1244      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1245	{
1246	  if (!load->load_permutation.exists ())
1247	    return false;
1248	  FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1249	    if (lidx != node->load_permutation[j])
1250	      return false;
1251	}
1252
1253      /* Check that the loads in the first sequence are different and there
1254	 are no gaps between them.  */
1255      load_index = sbitmap_alloc (group_size);
1256      bitmap_clear (load_index);
1257      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1258	{
1259	  if (bitmap_bit_p (load_index, lidx))
1260	    {
1261	      sbitmap_free (load_index);
1262	      return false;
1263	    }
1264	  bitmap_set_bit (load_index, lidx);
1265	}
1266      for (i = 0; i < group_size; i++)
1267	if (!bitmap_bit_p (load_index, i))
1268	  {
1269	    sbitmap_free (load_index);
1270	    return false;
1271	  }
1272      sbitmap_free (load_index);
1273
1274      /* This permutation is valid for reduction.  Since the order of the
1275	 statements in the nodes is not important unless they are memory
1276	 accesses, we can rearrange the statements in all the nodes
1277	 according to the order of the loads.  */
1278      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1279				node->load_permutation);
1280
1281      /* We are done, no actual permutations need to be generated.  */
1282      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1283	SLP_TREE_LOAD_PERMUTATION (node).release ();
1284      return true;
1285    }
1286
1287  /* In basic block vectorization we allow any subchain of an interleaving
1288     chain.
1289     FORNOW: not supported in loop SLP because of realignment compications.  */
1290  if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
1291    {
1292      /* Check that for every node in the instance the loads
1293	 form a subchain.  */
1294      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1295        {
1296          next_load = NULL;
1297          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
1298            {
1299              if (j != 0 && next_load != load)
1300		return false;
1301              next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
1302            }
1303        }
1304
1305      /* Check that the alignment of the first load in every subchain, i.e.,
1306         the first statement in every load node, is supported.
1307	 ???  This belongs in alignment checking.  */
1308      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1309	{
1310	  first_load = SLP_TREE_SCALAR_STMTS (node)[0];
1311	  if (first_load != GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_load)))
1312	    {
1313	      dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_load));
1314	      if (vect_supportable_dr_alignment (dr, false)
1315		  == dr_unaligned_unsupported)
1316		{
1317		  if (dump_enabled_p ())
1318		    {
1319		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1320				       vect_location,
1321				       "unsupported unaligned load ");
1322		      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1323					first_load, 0);
1324                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1325		    }
1326		  return false;
1327		}
1328	    }
1329	}
1330
1331      /* We are done, no actual permutations need to be generated.  */
1332      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1333	SLP_TREE_LOAD_PERMUTATION (node).release ();
1334      return true;
1335    }
1336
1337  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
1338     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
1339     well (unless it's reduction).  */
1340  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
1341    return false;
1342  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1343    if (!node->load_permutation.exists ())
1344      return false;
1345
1346  load_index = sbitmap_alloc (group_size);
1347  bitmap_clear (load_index);
1348  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1349    {
1350      unsigned int lidx = node->load_permutation[0];
1351      if (bitmap_bit_p (load_index, lidx))
1352	{
1353	  sbitmap_free (load_index);
1354	  return false;
1355	}
1356      bitmap_set_bit (load_index, lidx);
1357      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
1358	if (k != lidx)
1359	  {
1360	    sbitmap_free (load_index);
1361	    return false;
1362	  }
1363    }
1364  for (i = 0; i < group_size; i++)
1365    if (!bitmap_bit_p (load_index, i))
1366      {
1367	sbitmap_free (load_index);
1368	return false;
1369      }
1370  sbitmap_free (load_index);
1371
1372  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1373    if (node->load_permutation.exists ()
1374	&& !vect_transform_slp_perm_load
1375	      (node, vNULL, NULL,
1376	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
1377      return false;
1378  return true;
1379}
1380
1381
1382/* Find the first load in the loop that belongs to INSTANCE.
1383   When loads are in several SLP nodes, there can be a case in which the first
1384   load does not appear in the first SLP node to be transformed, causing
1385   incorrect order of statements.  Since we generate all the loads together,
1386   they must be inserted before the first load of the SLP instance and not
1387   before the first load of the first node of the instance.  */
1388
1389static gimple
1390vect_find_first_load_in_slp_instance (slp_instance instance)
1391{
1392  int i, j;
1393  slp_tree load_node;
1394  gimple first_load = NULL, load;
1395
1396  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
1397    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1398      first_load = get_earlier_stmt (load, first_load);
1399
1400  return first_load;
1401}
1402
1403
1404/* Find the last store in SLP INSTANCE.  */
1405
1406static gimple
1407vect_find_last_store_in_slp_instance (slp_instance instance)
1408{
1409  int i;
1410  slp_tree node;
1411  gimple last_store = NULL, store;
1412
1413  node = SLP_INSTANCE_TREE (instance);
1414  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
1415    last_store = get_later_stmt (store, last_store);
1416
1417  return last_store;
1418}
1419
1420/* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
1421
1422static void
1423vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1424			 slp_instance instance, slp_tree node,
1425			 stmt_vector_for_cost *prologue_cost_vec,
1426			 unsigned ncopies_for_cost)
1427{
1428  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
1429
1430  unsigned i;
1431  slp_tree child;
1432  gimple stmt, s;
1433  stmt_vec_info stmt_info;
1434  tree lhs;
1435  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1436
1437  /* Recurse down the SLP tree.  */
1438  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1439    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1440			     instance, child, prologue_cost_vec,
1441			     ncopies_for_cost);
1442
1443  /* Look at the first scalar stmt to determine the cost.  */
1444  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
1445  stmt_info = vinfo_for_stmt (stmt);
1446  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1447    {
1448      if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1449	vect_model_store_cost (stmt_info, ncopies_for_cost, false,
1450			       vect_uninitialized_def,
1451			       node, prologue_cost_vec, body_cost_vec);
1452      else
1453	{
1454	  int i;
1455	  gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1456	  vect_model_load_cost (stmt_info, ncopies_for_cost, false,
1457				node, prologue_cost_vec, body_cost_vec);
1458	  /* If the load is permuted record the cost for the permutation.
1459	     ???  Loads from multiple chains are let through here only
1460	     for a single special case involving complex numbers where
1461	     in the end no permutation is necessary.  */
1462	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s)
1463	    if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s))
1464		 == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info))
1465		&& vect_get_place_in_interleaving_chain
1466		     (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i)
1467	      {
1468		record_stmt_cost (body_cost_vec, group_size, vec_perm,
1469				  stmt_info, 0, vect_body);
1470		break;
1471	      }
1472	}
1473    }
1474  else
1475    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
1476		      stmt_info, 0, vect_body);
1477
1478  /* Scan operands and account for prologue cost of constants/externals.
1479     ???  This over-estimates cost for multiple uses and should be
1480     re-engineered.  */
1481  lhs = gimple_get_lhs (stmt);
1482  for (i = 0; i < gimple_num_ops (stmt); ++i)
1483    {
1484      tree def, op = gimple_op (stmt, i);
1485      gimple def_stmt;
1486      enum vect_def_type dt;
1487      if (!op || op == lhs)
1488	continue;
1489      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
1490			      &def_stmt, &def, &dt)
1491	  && (dt == vect_constant_def || dt == vect_external_def))
1492	record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
1493			  stmt_info, 0, vect_prologue);
1494    }
1495}
1496
1497/* Compute the cost for the SLP instance INSTANCE.  */
1498
1499static void
1500vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1501		       slp_instance instance, unsigned nunits)
1502{
1503  stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
1504  unsigned ncopies_for_cost;
1505  stmt_info_for_cost *si;
1506  unsigned i;
1507
1508  /* Calculate the number of vector stmts to create based on the unrolling
1509     factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
1510     GROUP_SIZE / NUNITS otherwise.  */
1511  unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
1512  ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
1513
1514  prologue_cost_vec.create (10);
1515  body_cost_vec.create (10);
1516  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
1517  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
1518			   instance, SLP_INSTANCE_TREE (instance),
1519			   &prologue_cost_vec, ncopies_for_cost);
1520
1521  /* Record the prologue costs, which were delayed until we were
1522     sure that SLP was successful.  Unlike the body costs, we know
1523     the final values now regardless of the loop vectorization factor.  */
1524  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
1525		: BB_VINFO_TARGET_COST_DATA (bb_vinfo));
1526  FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
1527    {
1528      struct _stmt_vec_info *stmt_info
1529	= si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1530      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1531			    si->misalign, vect_prologue);
1532    }
1533
1534  prologue_cost_vec.release ();
1535}
1536
1537/* Analyze an SLP instance starting from a group of grouped stores.  Call
1538   vect_build_slp_tree to build a tree of packed stmts if possible.
1539   Return FALSE if it's impossible to SLP any stmt in the loop.  */
1540
1541static bool
1542vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1543                           gimple stmt, unsigned max_tree_size)
1544{
1545  slp_instance new_instance;
1546  slp_tree node;
1547  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1548  unsigned int unrolling_factor = 1, nunits;
1549  tree vectype, scalar_type = NULL_TREE;
1550  gimple next;
1551  unsigned int vectorization_factor = 0;
1552  int i;
1553  unsigned int max_nunits = 0;
1554  vec<slp_tree> loads;
1555  struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
1556  vec<gimple> scalar_stmts;
1557
1558  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1559    {
1560      if (dr)
1561        {
1562          scalar_type = TREE_TYPE (DR_REF (dr));
1563          vectype = get_vectype_for_scalar_type (scalar_type);
1564        }
1565      else
1566        {
1567          gcc_assert (loop_vinfo);
1568          vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1569        }
1570
1571      group_size = GROUP_SIZE (vinfo_for_stmt (stmt));
1572    }
1573  else
1574    {
1575      gcc_assert (loop_vinfo);
1576      vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1577      group_size = LOOP_VINFO_REDUCTIONS (loop_vinfo).length ();
1578    }
1579
1580  if (!vectype)
1581    {
1582      if (dump_enabled_p ())
1583        {
1584          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585			   "Build SLP failed: unsupported data-type ");
1586          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, scalar_type);
1587          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1588        }
1589
1590      return false;
1591    }
1592
1593  nunits = TYPE_VECTOR_SUBPARTS (vectype);
1594  if (loop_vinfo)
1595    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1596  else
1597    vectorization_factor = nunits;
1598
1599  /* Calculate the unrolling factor.  */
1600  unrolling_factor = least_common_multiple (nunits, group_size) / group_size;
1601  if (unrolling_factor != 1 && !loop_vinfo)
1602    {
1603      if (dump_enabled_p ())
1604        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605			 "Build SLP failed: unrolling required in basic"
1606			 " block SLP\n");
1607
1608      return false;
1609    }
1610
1611  /* Create a node (a root of the SLP tree) for the packed grouped stores.  */
1612  scalar_stmts.create (group_size);
1613  next = stmt;
1614  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1615    {
1616      /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS.  */
1617      while (next)
1618        {
1619	  if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next))
1620	      && STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)))
1621	    scalar_stmts.safe_push (
1622		  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next)));
1623	  else
1624            scalar_stmts.safe_push (next);
1625          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
1626        }
1627    }
1628  else
1629    {
1630      /* Collect reduction statements.  */
1631      vec<gimple> reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1632      for (i = 0; reductions.iterate (i, &next); i++)
1633	scalar_stmts.safe_push (next);
1634    }
1635
1636  node = vect_create_new_slp_node (scalar_stmts);
1637
1638  loads.create (group_size);
1639
1640  /* Build the tree for the SLP instance.  */
1641  bool *matches = XALLOCAVEC (bool, group_size);
1642  unsigned npermutes = 0;
1643  if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
1644			   &max_nunits, &loads,
1645			   vectorization_factor, matches, &npermutes, NULL,
1646			   max_tree_size))
1647    {
1648      /* Calculate the unrolling factor based on the smallest type.  */
1649      if (max_nunits > nunits)
1650        unrolling_factor = least_common_multiple (max_nunits, group_size)
1651                           / group_size;
1652
1653      if (unrolling_factor != 1 && !loop_vinfo)
1654        {
1655          if (dump_enabled_p ())
1656            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657			     "Build SLP failed: unrolling required in basic"
1658			     " block SLP\n");
1659	  vect_free_slp_tree (node);
1660	  loads.release ();
1661          return false;
1662        }
1663
1664      /* Create a new SLP instance.  */
1665      new_instance = XNEW (struct _slp_instance);
1666      SLP_INSTANCE_TREE (new_instance) = node;
1667      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
1668      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
1669      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
1670      SLP_INSTANCE_LOADS (new_instance) = loads;
1671      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
1672
1673      /* Compute the load permutation.  */
1674      slp_tree load_node;
1675      bool loads_permuted = false;
1676      FOR_EACH_VEC_ELT (loads, i, load_node)
1677	{
1678	  vec<unsigned> load_permutation;
1679	  int j;
1680	  gimple load, first_stmt;
1681	  bool this_load_permuted = false;
1682	  load_permutation.create (group_size);
1683	  first_stmt = GROUP_FIRST_ELEMENT
1684	      (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (load_node)[0]));
1685	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
1686	    {
1687	      int load_place
1688		= vect_get_place_in_interleaving_chain (load, first_stmt);
1689	      gcc_assert (load_place != -1);
1690	      if (load_place != j)
1691		this_load_permuted = true;
1692	      load_permutation.safe_push (load_place);
1693	    }
1694	  if (!this_load_permuted)
1695	    {
1696	      load_permutation.release ();
1697	      continue;
1698	    }
1699	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
1700	  loads_permuted = true;
1701	}
1702
1703      if (loads_permuted)
1704        {
1705          if (!vect_supported_load_permutation_p (new_instance))
1706            {
1707              if (dump_enabled_p ())
1708                {
1709                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710				   "Build SLP failed: unsupported load "
1711				   "permutation ");
1712                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
1713                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1714                }
1715              vect_free_slp_instance (new_instance);
1716              return false;
1717            }
1718
1719          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
1720	    = vect_find_first_load_in_slp_instance (new_instance);
1721        }
1722
1723      /* Compute the costs of this SLP instance.  */
1724      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
1725			     new_instance, TYPE_VECTOR_SUBPARTS (vectype));
1726
1727      if (loop_vinfo)
1728        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
1729      else
1730        BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
1731
1732      if (dump_enabled_p ())
1733	vect_print_slp_tree (MSG_NOTE, node);
1734
1735      return true;
1736    }
1737
1738  /* Failed to SLP.  */
1739  /* Free the allocated memory.  */
1740  vect_free_slp_tree (node);
1741  loads.release ();
1742
1743  return false;
1744}
1745
1746
1747/* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
1748   trees of packed scalar stmts if SLP is possible.  */
1749
1750bool
1751vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
1752		  unsigned max_tree_size)
1753{
1754  unsigned int i;
1755  vec<gimple> grouped_stores;
1756  vec<gimple> reductions = vNULL;
1757  vec<gimple> reduc_chains = vNULL;
1758  gimple first_element;
1759  bool ok = false;
1760
1761  if (dump_enabled_p ())
1762    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_analyze_slp ===\n");
1763
1764  if (loop_vinfo)
1765    {
1766      grouped_stores = LOOP_VINFO_GROUPED_STORES (loop_vinfo);
1767      reduc_chains = LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo);
1768      reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
1769    }
1770  else
1771    grouped_stores = BB_VINFO_GROUPED_STORES (bb_vinfo);
1772
1773  /* Find SLP sequences starting from groups of grouped stores.  */
1774  FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
1775    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1776				   max_tree_size))
1777      ok = true;
1778
1779  if (bb_vinfo && !ok)
1780    {
1781      if (dump_enabled_p ())
1782        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1783			 "Failed to SLP the basic block.\n");
1784
1785      return false;
1786    }
1787
1788  if (loop_vinfo
1789      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
1790    {
1791      /* Find SLP sequences starting from reduction chains.  */
1792      FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
1793        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
1794				       max_tree_size))
1795          ok = true;
1796        else
1797          return false;
1798
1799      /* Don't try to vectorize SLP reductions if reduction chain was
1800         detected.  */
1801      return ok;
1802    }
1803
1804  /* Find SLP sequences starting from groups of reductions.  */
1805  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
1806      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
1807				    max_tree_size))
1808    ok = true;
1809
1810  return true;
1811}
1812
1813
1814/* For each possible SLP instance decide whether to SLP it and calculate overall
1815   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
1816   least one instance.  */
1817
1818bool
1819vect_make_slp_decision (loop_vec_info loop_vinfo)
1820{
1821  unsigned int i, unrolling_factor = 1;
1822  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1823  slp_instance instance;
1824  int decided_to_slp = 0;
1825
1826  if (dump_enabled_p ())
1827    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_make_slp_decision ==="
1828                     "\n");
1829
1830  FOR_EACH_VEC_ELT (slp_instances, i, instance)
1831    {
1832      /* FORNOW: SLP if you can.  */
1833      if (unrolling_factor < SLP_INSTANCE_UNROLLING_FACTOR (instance))
1834	unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (instance);
1835
1836      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
1837	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
1838	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
1839      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
1840      decided_to_slp++;
1841    }
1842
1843  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
1844
1845  if (decided_to_slp && dump_enabled_p ())
1846    dump_printf_loc (MSG_NOTE, vect_location,
1847		     "Decided to SLP %d instances. Unrolling factor %d\n",
1848		     decided_to_slp, unrolling_factor);
1849
1850  return (decided_to_slp > 0);
1851}
1852
1853
1854/* Find stmts that must be both vectorized and SLPed (since they feed stmts that
1855   can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
1856
1857static void
1858vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
1859{
1860  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
1861  imm_use_iterator imm_iter;
1862  gimple use_stmt;
1863  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
1864  slp_tree child;
1865  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
1866  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1867  int j;
1868
1869  /* Propagate hybrid down the SLP tree.  */
1870  if (stype == hybrid)
1871    ;
1872  else if (HYBRID_SLP_STMT (stmt_vinfo))
1873    stype = hybrid;
1874  else
1875    {
1876      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
1877      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
1878      /* We always get the pattern stmt here, but for immediate
1879	 uses we have to use the LHS of the original stmt.  */
1880      gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
1881      if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
1882	stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
1883      if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
1884	FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
1885	  {
1886	    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1887	      continue;
1888	    use_vinfo = vinfo_for_stmt (use_stmt);
1889	    if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
1890		&& STMT_VINFO_RELATED_STMT (use_vinfo))
1891	      use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
1892	    if (!STMT_SLP_TYPE (use_vinfo)
1893		&& (STMT_VINFO_RELEVANT (use_vinfo)
1894		    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
1895		&& !(gimple_code (use_stmt) == GIMPLE_PHI
1896		     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
1897	      stype = hybrid;
1898	  }
1899    }
1900
1901  if (stype == hybrid)
1902    STMT_SLP_TYPE (stmt_vinfo) = hybrid;
1903
1904  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
1905    vect_detect_hybrid_slp_stmts (child, i, stype);
1906}
1907
1908/* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
1909
1910static tree
1911vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
1912{
1913  walk_stmt_info *wi = (walk_stmt_info *)data;
1914  struct loop *loopp = (struct loop *)wi->info;
1915
1916  if (wi->is_lhs)
1917    return NULL_TREE;
1918
1919  if (TREE_CODE (*tp) == SSA_NAME
1920      && !SSA_NAME_IS_DEFAULT_DEF (*tp))
1921    {
1922      gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
1923      if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
1924	  && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
1925	STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
1926    }
1927
1928  return NULL_TREE;
1929}
1930
1931static tree
1932vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
1933			  walk_stmt_info *)
1934{
1935  /* If the stmt is in a SLP instance then this isn't a reason
1936     to mark use definitions in other SLP instances as hybrid.  */
1937  if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
1938    *handled = true;
1939  return NULL_TREE;
1940}
1941
1942/* Find stmts that must be both vectorized and SLPed.  */
1943
1944void
1945vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
1946{
1947  unsigned int i;
1948  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1949  slp_instance instance;
1950
1951  if (dump_enabled_p ())
1952    dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
1953                     "\n");
1954
1955  /* First walk all pattern stmt in the loop and mark defs of uses as
1956     hybrid because immediate uses in them are not recorded.  */
1957  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
1958    {
1959      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
1960      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1961	   gsi_next (&gsi))
1962	{
1963	  gimple stmt = gsi_stmt (gsi);
1964	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1965	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
1966	    {
1967	      walk_stmt_info wi;
1968	      memset (&wi, 0, sizeof (wi));
1969	      wi.info = LOOP_VINFO_LOOP (loop_vinfo);
1970	      gimple_stmt_iterator gsi2
1971		= gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
1972	      walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
1973				vect_detect_hybrid_slp_1, &wi);
1974	      walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
1975			       vect_detect_hybrid_slp_2,
1976			       vect_detect_hybrid_slp_1, &wi);
1977	    }
1978	}
1979    }
1980
1981  /* Then walk the SLP instance trees marking stmts with uses in
1982     non-SLP stmts as hybrid, also propagating hybrid down the
1983     SLP tree, collecting the above info on-the-fly.  */
1984  FOR_EACH_VEC_ELT (slp_instances, i, instance)
1985    {
1986      for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
1987	vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
1988				      i, pure_slp);
1989    }
1990}
1991
1992
1993/* Create and initialize a new bb_vec_info struct for BB, as well as
1994   stmt_vec_info structs for all the stmts in it.  */
1995
1996static bb_vec_info
1997new_bb_vec_info (basic_block bb)
1998{
1999  bb_vec_info res = NULL;
2000  gimple_stmt_iterator gsi;
2001
2002  res = (bb_vec_info) xcalloc (1, sizeof (struct _bb_vec_info));
2003  BB_VINFO_BB (res) = bb;
2004
2005  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2006    {
2007      gimple stmt = gsi_stmt (gsi);
2008      gimple_set_uid (stmt, 0);
2009      set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, NULL, res));
2010    }
2011
2012  BB_VINFO_GROUPED_STORES (res).create (10);
2013  BB_VINFO_SLP_INSTANCES (res).create (2);
2014  BB_VINFO_TARGET_COST_DATA (res) = init_cost (NULL);
2015
2016  bb->aux = res;
2017  return res;
2018}
2019
2020
2021/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2022   stmts in the basic block.  */
2023
2024static void
2025destroy_bb_vec_info (bb_vec_info bb_vinfo)
2026{
2027  vec<slp_instance> slp_instances;
2028  slp_instance instance;
2029  basic_block bb;
2030  gimple_stmt_iterator si;
2031  unsigned i;
2032
2033  if (!bb_vinfo)
2034    return;
2035
2036  bb = BB_VINFO_BB (bb_vinfo);
2037
2038  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2039    {
2040      gimple stmt = gsi_stmt (si);
2041      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2042
2043      if (stmt_info)
2044        /* Free stmt_vec_info.  */
2045        free_stmt_vec_info (stmt);
2046    }
2047
2048  vect_destroy_datarefs (NULL, bb_vinfo);
2049  free_dependence_relations (BB_VINFO_DDRS (bb_vinfo));
2050  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
2051  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2052  FOR_EACH_VEC_ELT (slp_instances, i, instance)
2053    vect_free_slp_instance (instance);
2054  BB_VINFO_SLP_INSTANCES (bb_vinfo).release ();
2055  destroy_cost_data (BB_VINFO_TARGET_COST_DATA (bb_vinfo));
2056  free (bb_vinfo);
2057  bb->aux = NULL;
2058}
2059
2060
2061/* Analyze statements contained in SLP tree node after recursively analyzing
2062   the subtree. Return TRUE if the operations are supported.  */
2063
2064static bool
2065vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
2066{
2067  bool dummy;
2068  int i;
2069  gimple stmt;
2070  slp_tree child;
2071
2072  if (!node)
2073    return true;
2074
2075  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2076    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
2077      return false;
2078
2079  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2080    {
2081      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2082      gcc_assert (stmt_info);
2083      gcc_assert (PURE_SLP_STMT (stmt_info));
2084
2085      if (!vect_analyze_stmt (stmt, &dummy, node))
2086        return false;
2087    }
2088
2089  return true;
2090}
2091
2092
2093/* Analyze statements in SLP instances of the basic block.  Return TRUE if the
2094   operations are supported. */
2095
2096static bool
2097vect_slp_analyze_operations (bb_vec_info bb_vinfo)
2098{
2099  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2100  slp_instance instance;
2101  int i;
2102
2103  for (i = 0; slp_instances.iterate (i, &instance); )
2104    {
2105      if (!vect_slp_analyze_node_operations (bb_vinfo,
2106                                             SLP_INSTANCE_TREE (instance)))
2107        {
2108 	  vect_free_slp_instance (instance);
2109          slp_instances.ordered_remove (i);
2110	}
2111      else
2112        i++;
2113    }
2114
2115  if (!slp_instances.length ())
2116    return false;
2117
2118  return true;
2119}
2120
2121
2122/* Compute the scalar cost of the SLP node NODE and its children
2123   and return it.  Do not account defs that are marked in LIFE and
2124   update LIFE according to uses of NODE.  */
2125
2126static unsigned
2127vect_bb_slp_scalar_cost (basic_block bb,
2128			 slp_tree node, vec<bool, va_heap> *life)
2129{
2130  unsigned scalar_cost = 0;
2131  unsigned i;
2132  gimple stmt;
2133  slp_tree child;
2134
2135  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
2136    {
2137      unsigned stmt_cost;
2138      ssa_op_iter op_iter;
2139      def_operand_p def_p;
2140      stmt_vec_info stmt_info;
2141
2142      if ((*life)[i])
2143	continue;
2144
2145      /* If there is a non-vectorized use of the defs then the scalar
2146         stmt is kept live in which case we do not account it or any
2147	 required defs in the SLP children in the scalar cost.  This
2148	 way we make the vectorization more costly when compared to
2149	 the scalar cost.  */
2150      FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2151	{
2152	  imm_use_iterator use_iter;
2153	  gimple use_stmt;
2154	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2155	    if (!is_gimple_debug (use_stmt)
2156		&& (gimple_code (use_stmt) == GIMPLE_PHI
2157		    || gimple_bb (use_stmt) != bb
2158		    || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
2159	      {
2160		(*life)[i] = true;
2161		BREAK_FROM_IMM_USE_STMT (use_iter);
2162	      }
2163	}
2164      if ((*life)[i])
2165	continue;
2166
2167      stmt_info = vinfo_for_stmt (stmt);
2168      if (STMT_VINFO_DATA_REF (stmt_info))
2169        {
2170          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2171            stmt_cost = vect_get_stmt_cost (scalar_load);
2172          else
2173            stmt_cost = vect_get_stmt_cost (scalar_store);
2174        }
2175      else
2176        stmt_cost = vect_get_stmt_cost (scalar_stmt);
2177
2178      scalar_cost += stmt_cost;
2179    }
2180
2181  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2182    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
2183
2184  return scalar_cost;
2185}
2186
2187/* Check if vectorization of the basic block is profitable.  */
2188
2189static bool
2190vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2191{
2192  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2193  slp_instance instance;
2194  int i, j;
2195  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2196  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2197  void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
2198  stmt_vec_info stmt_info = NULL;
2199  stmt_vector_for_cost body_cost_vec;
2200  stmt_info_for_cost *ci;
2201
2202  /* Calculate vector costs.  */
2203  FOR_EACH_VEC_ELT (slp_instances, i, instance)
2204    {
2205      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2206
2207      FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
2208	{
2209	  stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
2210	  (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
2211				stmt_info, ci->misalign, vect_body);
2212	}
2213    }
2214
2215  /* Calculate scalar cost.  */
2216  FOR_EACH_VEC_ELT (slp_instances, i, instance)
2217    {
2218      auto_vec<bool, 20> life;
2219      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2220      scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2221					      SLP_INSTANCE_TREE (instance),
2222					      &life);
2223    }
2224
2225  /* Complete the target-specific cost calculation.  */
2226  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2227	       &vec_inside_cost, &vec_epilogue_cost);
2228
2229  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2230
2231  if (dump_enabled_p ())
2232    {
2233      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2234      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
2235		   vec_inside_cost);
2236      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
2237      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
2238      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
2239    }
2240
2241  /* Vectorization is profitable if its cost is less than the cost of scalar
2242     version.  */
2243  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
2244    return false;
2245
2246  return true;
2247}
2248
2249/* Check if the basic block can be vectorized.  */
2250
2251static bb_vec_info
2252vect_slp_analyze_bb_1 (basic_block bb)
2253{
2254  bb_vec_info bb_vinfo;
2255  vec<slp_instance> slp_instances;
2256  slp_instance instance;
2257  int i;
2258  int min_vf = 2;
2259  unsigned n_stmts = 0;
2260
2261  bb_vinfo = new_bb_vec_info (bb);
2262  if (!bb_vinfo)
2263    return NULL;
2264
2265  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
2266    {
2267      if (dump_enabled_p ())
2268        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269			 "not vectorized: unhandled data-ref in basic "
2270			 "block.\n");
2271
2272      destroy_bb_vec_info (bb_vinfo);
2273      return NULL;
2274    }
2275
2276  if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2277    {
2278      if (dump_enabled_p ())
2279        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280			 "not vectorized: not enough data-refs in "
2281			 "basic block.\n");
2282
2283      destroy_bb_vec_info (bb_vinfo);
2284      return NULL;
2285    }
2286
2287  if (!vect_analyze_data_ref_accesses (NULL, bb_vinfo))
2288    {
2289     if (dump_enabled_p ())
2290       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291			"not vectorized: unhandled data access in "
2292			"basic block.\n");
2293
2294      destroy_bb_vec_info (bb_vinfo);
2295      return NULL;
2296    }
2297
2298  vect_pattern_recog (NULL, bb_vinfo);
2299
2300  if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
2301    {
2302      if (dump_enabled_p ())
2303        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304			 "not vectorized: bad data alignment in basic "
2305			 "block.\n");
2306
2307      destroy_bb_vec_info (bb_vinfo);
2308      return NULL;
2309    }
2310
2311  /* Check the SLP opportunities in the basic block, analyze and build SLP
2312     trees.  */
2313  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
2314    {
2315      if (dump_enabled_p ())
2316        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317			 "not vectorized: failed to find SLP opportunities "
2318			 "in basic block.\n");
2319
2320      destroy_bb_vec_info (bb_vinfo);
2321      return NULL;
2322    }
2323
2324  slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2325
2326  /* Mark all the statements that we want to vectorize as pure SLP and
2327     relevant.  */
2328  FOR_EACH_VEC_ELT (slp_instances, i, instance)
2329    {
2330      vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance), pure_slp, -1);
2331      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2332    }
2333
2334  /* Mark all the statements that we do not want to vectorize.  */
2335  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
2336       !gsi_end_p (gsi); gsi_next (&gsi))
2337    {
2338      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
2339      if (STMT_SLP_TYPE (vinfo) != pure_slp)
2340	STMT_VINFO_VECTORIZABLE (vinfo) = false;
2341    }
2342
2343  /* Analyze dependences.  At this point all stmts not participating in
2344     vectorization have to be marked.  Dependence analysis assumes
2345     that we either vectorize all SLP instances or none at all.  */
2346  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
2347     {
2348       if (dump_enabled_p ())
2349	 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350			  "not vectorized: unhandled data dependence "
2351			  "in basic block.\n");
2352
2353       destroy_bb_vec_info (bb_vinfo);
2354       return NULL;
2355     }
2356
2357  if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
2358    {
2359      if (dump_enabled_p ())
2360        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                         "not vectorized: unsupported alignment in basic "
2362                         "block.\n");
2363      destroy_bb_vec_info (bb_vinfo);
2364      return NULL;
2365    }
2366
2367  if (!vect_slp_analyze_operations (bb_vinfo))
2368    {
2369      if (dump_enabled_p ())
2370        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371			 "not vectorized: bad operation in basic block.\n");
2372
2373      destroy_bb_vec_info (bb_vinfo);
2374      return NULL;
2375    }
2376
2377  /* Cost model: check if the vectorization is worthwhile.  */
2378  if (!unlimited_cost_model (NULL)
2379      && !vect_bb_vectorization_profitable_p (bb_vinfo))
2380    {
2381      if (dump_enabled_p ())
2382        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383			 "not vectorized: vectorization is not "
2384			 "profitable.\n");
2385
2386      destroy_bb_vec_info (bb_vinfo);
2387      return NULL;
2388    }
2389
2390  if (dump_enabled_p ())
2391    dump_printf_loc (MSG_NOTE, vect_location,
2392		     "Basic block will be vectorized using SLP\n");
2393
2394  return bb_vinfo;
2395}
2396
2397
2398bb_vec_info
2399vect_slp_analyze_bb (basic_block bb)
2400{
2401  bb_vec_info bb_vinfo;
2402  int insns = 0;
2403  gimple_stmt_iterator gsi;
2404  unsigned int vector_sizes;
2405
2406  if (dump_enabled_p ())
2407    dump_printf_loc (MSG_NOTE, vect_location, "===vect_slp_analyze_bb===\n");
2408
2409  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2410    {
2411      gimple stmt = gsi_stmt (gsi);
2412      if (!is_gimple_debug (stmt)
2413          && !gimple_nop_p (stmt)
2414          && gimple_code (stmt) != GIMPLE_LABEL)
2415        insns++;
2416    }
2417
2418  if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2419    {
2420      if (dump_enabled_p ())
2421        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422			 "not vectorized: too many instructions in "
2423			 "basic block.\n");
2424
2425      return NULL;
2426    }
2427
2428  /* Autodetect first vector size we try.  */
2429  current_vector_size = 0;
2430  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2431
2432  while (1)
2433    {
2434      bb_vinfo = vect_slp_analyze_bb_1 (bb);
2435      if (bb_vinfo)
2436        return bb_vinfo;
2437
2438      destroy_bb_vec_info (bb_vinfo);
2439
2440      vector_sizes &= ~current_vector_size;
2441      if (vector_sizes == 0
2442          || current_vector_size == 0)
2443        return NULL;
2444
2445      /* Try the next biggest vector size.  */
2446      current_vector_size = 1 << floor_log2 (vector_sizes);
2447      if (dump_enabled_p ())
2448        dump_printf_loc (MSG_NOTE, vect_location,
2449			 "***** Re-trying analysis with "
2450			 "vector size %d\n", current_vector_size);
2451    }
2452}
2453
2454
2455/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
2456   the number of created vector stmts depends on the unrolling factor).
2457   However, the actual number of vector stmts for every SLP node depends on
2458   VF which is set later in vect_analyze_operations ().  Hence, SLP costs
2459   should be updated.  In this function we assume that the inside costs
2460   calculated in vect_model_xxx_cost are linear in ncopies.  */
2461
2462void
2463vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
2464{
2465  unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2466  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2467  slp_instance instance;
2468  stmt_vector_for_cost body_cost_vec;
2469  stmt_info_for_cost *si;
2470  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2471
2472  if (dump_enabled_p ())
2473    dump_printf_loc (MSG_NOTE, vect_location,
2474		     "=== vect_update_slp_costs_according_to_vf ===\n");
2475
2476  FOR_EACH_VEC_ELT (slp_instances, i, instance)
2477    {
2478      /* We assume that costs are linear in ncopies.  */
2479      int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
2480
2481      /* Record the instance's instructions in the target cost model.
2482	 This was delayed until here because the count of instructions
2483	 isn't known beforehand.  */
2484      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
2485
2486      FOR_EACH_VEC_ELT (body_cost_vec, j, si)
2487	(void) add_stmt_cost (data, si->count * ncopies, si->kind,
2488			      vinfo_for_stmt (si->stmt), si->misalign,
2489			      vect_body);
2490    }
2491}
2492
2493
2494/* For constant and loop invariant defs of SLP_NODE this function returns
2495   (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
2496   OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
2497   scalar stmts.  NUMBER_OF_VECTORS is the number of vector defs to create.
2498   REDUC_INDEX is the index of the reduction operand in the statements, unless
2499   it is -1.  */
2500
2501static void
2502vect_get_constant_vectors (tree op, slp_tree slp_node,
2503                           vec<tree> *vec_oprnds,
2504			   unsigned int op_num, unsigned int number_of_vectors,
2505                           int reduc_index)
2506{
2507  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2508  gimple stmt = stmts[0];
2509  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2510  unsigned nunits;
2511  tree vec_cst;
2512  tree *elts;
2513  unsigned j, number_of_places_left_in_vector;
2514  tree vector_type;
2515  tree vop;
2516  int group_size = stmts.length ();
2517  unsigned int vec_num, i;
2518  unsigned number_of_copies = 1;
2519  vec<tree> voprnds;
2520  voprnds.create (number_of_vectors);
2521  bool constant_p, is_store;
2522  tree neutral_op = NULL;
2523  enum tree_code code = gimple_expr_code (stmt);
2524  gimple def_stmt;
2525  struct loop *loop;
2526  gimple_seq ctor_seq = NULL;
2527
2528  if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
2529      && reduc_index != -1)
2530    {
2531      op_num = reduc_index - 1;
2532      op = gimple_op (stmt, reduc_index);
2533      /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
2534         we need either neutral operands or the original operands.  See
2535         get_initial_def_for_reduction() for details.  */
2536      switch (code)
2537        {
2538          case WIDEN_SUM_EXPR:
2539          case DOT_PROD_EXPR:
2540          case PLUS_EXPR:
2541          case MINUS_EXPR:
2542          case BIT_IOR_EXPR:
2543          case BIT_XOR_EXPR:
2544             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2545               neutral_op = build_real (TREE_TYPE (op), dconst0);
2546             else
2547               neutral_op = build_int_cst (TREE_TYPE (op), 0);
2548
2549             break;
2550
2551          case MULT_EXPR:
2552             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
2553               neutral_op = build_real (TREE_TYPE (op), dconst1);
2554             else
2555               neutral_op = build_int_cst (TREE_TYPE (op), 1);
2556
2557             break;
2558
2559          case BIT_AND_EXPR:
2560            neutral_op = build_int_cst (TREE_TYPE (op), -1);
2561            break;
2562
2563	  /* For MIN/MAX we don't have an easy neutral operand but
2564	     the initial values can be used fine here.  Only for
2565	     a reduction chain we have to force a neutral element.  */
2566	  case MAX_EXPR:
2567	  case MIN_EXPR:
2568	    if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
2569	      neutral_op = NULL;
2570	    else
2571	      {
2572		def_stmt = SSA_NAME_DEF_STMT (op);
2573		loop = (gimple_bb (stmt))->loop_father;
2574		neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2575						    loop_preheader_edge (loop));
2576	      }
2577	    break;
2578
2579          default:
2580            neutral_op = NULL;
2581        }
2582    }
2583
2584  if (STMT_VINFO_DATA_REF (stmt_vinfo))
2585    {
2586      is_store = true;
2587      op = gimple_assign_rhs1 (stmt);
2588    }
2589  else
2590    is_store = false;
2591
2592  gcc_assert (op);
2593
2594  if (CONSTANT_CLASS_P (op))
2595    constant_p = true;
2596  else
2597    constant_p = false;
2598
2599  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
2600  gcc_assert (vector_type);
2601  nunits = TYPE_VECTOR_SUBPARTS (vector_type);
2602
2603  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
2604     created vectors. It is greater than 1 if unrolling is performed.
2605
2606     For example, we have two scalar operands, s1 and s2 (e.g., group of
2607     strided accesses of size two), while NUNITS is four (i.e., four scalars
2608     of this type can be packed in a vector).  The output vector will contain
2609     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
2610     will be 2).
2611
2612     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
2613     containing the operands.
2614
2615     For example, NUNITS is four as before, and the group size is 8
2616     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
2617     {s5, s6, s7, s8}.  */
2618
2619  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
2620
2621  number_of_places_left_in_vector = nunits;
2622  elts = XALLOCAVEC (tree, nunits);
2623  for (j = 0; j < number_of_copies; j++)
2624    {
2625      for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
2626        {
2627          if (is_store)
2628            op = gimple_assign_rhs1 (stmt);
2629          else
2630	    {
2631	      switch (code)
2632		{
2633		  case COND_EXPR:
2634		    if (op_num == 0 || op_num == 1)
2635		      {
2636			tree cond = gimple_assign_rhs1 (stmt);
2637			op = TREE_OPERAND (cond, op_num);
2638		      }
2639		    else
2640		      {
2641			if (op_num == 2)
2642			  op = gimple_assign_rhs2 (stmt);
2643			else
2644			  op = gimple_assign_rhs3 (stmt);
2645		      }
2646		    break;
2647
2648		  case CALL_EXPR:
2649		    op = gimple_call_arg (stmt, op_num);
2650		    break;
2651
2652		  case LSHIFT_EXPR:
2653		  case RSHIFT_EXPR:
2654		  case LROTATE_EXPR:
2655		  case RROTATE_EXPR:
2656		    op = gimple_op (stmt, op_num + 1);
2657		    /* Unlike the other binary operators, shifts/rotates have
2658		       the shift count being int, instead of the same type as
2659		       the lhs, so make sure the scalar is the right type if
2660		       we are dealing with vectors of
2661		       long long/long/short/char.  */
2662		    if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
2663		      op = fold_convert (TREE_TYPE (vector_type), op);
2664		    break;
2665
2666		  default:
2667		    op = gimple_op (stmt, op_num + 1);
2668		    break;
2669		}
2670	    }
2671
2672          if (reduc_index != -1)
2673            {
2674              loop = (gimple_bb (stmt))->loop_father;
2675              def_stmt = SSA_NAME_DEF_STMT (op);
2676
2677              gcc_assert (loop);
2678
2679              /* Get the def before the loop.  In reduction chain we have only
2680                 one initial value.  */
2681              if ((j != (number_of_copies - 1)
2682                   || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
2683                       && i != 0))
2684                  && neutral_op)
2685                op = neutral_op;
2686              else
2687                op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
2688                                            loop_preheader_edge (loop));
2689            }
2690
2691          /* Create 'vect_ = {op0,op1,...,opn}'.  */
2692          number_of_places_left_in_vector--;
2693	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
2694	    {
2695	      if (CONSTANT_CLASS_P (op))
2696		{
2697		  op = fold_unary (VIEW_CONVERT_EXPR,
2698				   TREE_TYPE (vector_type), op);
2699		  gcc_assert (op && CONSTANT_CLASS_P (op));
2700		}
2701	      else
2702		{
2703		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
2704		  gimple init_stmt;
2705		  op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
2706		  init_stmt
2707		    = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
2708		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
2709		  op = new_temp;
2710		}
2711	    }
2712	  elts[number_of_places_left_in_vector] = op;
2713	  if (!CONSTANT_CLASS_P (op))
2714	    constant_p = false;
2715
2716          if (number_of_places_left_in_vector == 0)
2717            {
2718              number_of_places_left_in_vector = nunits;
2719
2720	      if (constant_p)
2721		vec_cst = build_vector (vector_type, elts);
2722	      else
2723		{
2724		  vec<constructor_elt, va_gc> *v;
2725		  unsigned k;
2726		  vec_alloc (v, nunits);
2727		  for (k = 0; k < nunits; ++k)
2728		    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
2729		  vec_cst = build_constructor (vector_type, v);
2730		}
2731              voprnds.quick_push (vect_init_vector (stmt, vec_cst,
2732						    vector_type, NULL));
2733	      if (ctor_seq != NULL)
2734		{
2735		  gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
2736		  gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
2737		  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
2738							GSI_SAME_STMT);
2739		  ctor_seq = NULL;
2740		}
2741            }
2742        }
2743    }
2744
2745  /* Since the vectors are created in the reverse order, we should invert
2746     them.  */
2747  vec_num = voprnds.length ();
2748  for (j = vec_num; j != 0; j--)
2749    {
2750      vop = voprnds[j - 1];
2751      vec_oprnds->quick_push (vop);
2752    }
2753
2754  voprnds.release ();
2755
2756  /* In case that VF is greater than the unrolling factor needed for the SLP
2757     group of stmts, NUMBER_OF_VECTORS to be created is greater than
2758     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
2759     to replicate the vectors.  */
2760  while (number_of_vectors > vec_oprnds->length ())
2761    {
2762      tree neutral_vec = NULL;
2763
2764      if (neutral_op)
2765        {
2766          if (!neutral_vec)
2767	    neutral_vec = build_vector_from_val (vector_type, neutral_op);
2768
2769          vec_oprnds->quick_push (neutral_vec);
2770        }
2771      else
2772        {
2773          for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
2774            vec_oprnds->quick_push (vop);
2775        }
2776    }
2777}
2778
2779
2780/* Get vectorized definitions from SLP_NODE that contains corresponding
2781   vectorized def-stmts.  */
2782
2783static void
2784vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
2785{
2786  tree vec_oprnd;
2787  gimple vec_def_stmt;
2788  unsigned int i;
2789
2790  gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
2791
2792  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt)
2793    {
2794      gcc_assert (vec_def_stmt);
2795      vec_oprnd = gimple_get_lhs (vec_def_stmt);
2796      vec_oprnds->quick_push (vec_oprnd);
2797    }
2798}
2799
2800
2801/* Get vectorized definitions for SLP_NODE.
2802   If the scalar definitions are loop invariants or constants, collect them and
2803   call vect_get_constant_vectors() to create vector stmts.
2804   Otherwise, the def-stmts must be already vectorized and the vectorized stmts
2805   must be stored in the corresponding child of SLP_NODE, and we call
2806   vect_get_slp_vect_defs () to retrieve them.  */
2807
2808void
2809vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
2810		   vec<vec<tree> > *vec_oprnds, int reduc_index)
2811{
2812  gimple first_stmt;
2813  int number_of_vects = 0, i;
2814  unsigned int child_index = 0;
2815  HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
2816  slp_tree child = NULL;
2817  vec<tree> vec_defs;
2818  tree oprnd;
2819  bool vectorized_defs;
2820
2821  first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2822  FOR_EACH_VEC_ELT (ops, i, oprnd)
2823    {
2824      /* For each operand we check if it has vectorized definitions in a child
2825	 node or we need to create them (for invariants and constants).  We
2826	 check if the LHS of the first stmt of the next child matches OPRND.
2827	 If it does, we found the correct child.  Otherwise, we call
2828	 vect_get_constant_vectors (), and not advance CHILD_INDEX in order
2829	 to check this child node for the next operand.  */
2830      vectorized_defs = false;
2831      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
2832        {
2833          child = SLP_TREE_CHILDREN (slp_node)[child_index];
2834
2835	  /* We have to check both pattern and original def, if available.  */
2836	  gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
2837	  gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
2838
2839	  if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
2840	      || (related
2841		  && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
2842	    {
2843	      /* The number of vector defs is determined by the number of
2844		 vector statements in the node from which we get those
2845		 statements.  */
2846	      number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
2847	      vectorized_defs = true;
2848	      child_index++;
2849	    }
2850        }
2851
2852      if (!vectorized_defs)
2853        {
2854          if (i == 0)
2855            {
2856              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
2857              /* Number of vector stmts was calculated according to LHS in
2858                 vect_schedule_slp_instance (), fix it by replacing LHS with
2859                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
2860                 details.  */
2861              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
2862                                             &rhs_size_unit);
2863              if (rhs_size_unit != lhs_size_unit)
2864                {
2865                  number_of_vects *= rhs_size_unit;
2866                  number_of_vects /= lhs_size_unit;
2867                }
2868            }
2869        }
2870
2871      /* Allocate memory for vectorized defs.  */
2872      vec_defs = vNULL;
2873      vec_defs.create (number_of_vects);
2874
2875      /* For reduction defs we call vect_get_constant_vectors (), since we are
2876         looking for initial loop invariant values.  */
2877      if (vectorized_defs && reduc_index == -1)
2878        /* The defs are already vectorized.  */
2879	vect_get_slp_vect_defs (child, &vec_defs);
2880      else
2881        /* Build vectors from scalar defs.  */
2882	vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
2883                                   number_of_vects, reduc_index);
2884
2885      vec_oprnds->quick_push (vec_defs);
2886
2887      /* For reductions, we only need initial values.  */
2888      if (reduc_index != -1)
2889        return;
2890    }
2891}
2892
2893
2894/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
2895   building a vector of type MASK_TYPE from it) and two input vectors placed in
2896   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
2897   shifting by STRIDE elements of DR_CHAIN for every copy.
2898   (STRIDE is the number of vectorized stmts for NODE divided by the number of
2899   copies).
2900   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
2901   the created stmts must be inserted.  */
2902
2903static inline void
2904vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
2905                           tree mask, int first_vec_indx, int second_vec_indx,
2906                           gimple_stmt_iterator *gsi, slp_tree node,
2907                           tree vectype, vec<tree> dr_chain,
2908                           int ncopies, int vect_stmts_counter)
2909{
2910  tree perm_dest;
2911  gimple perm_stmt = NULL;
2912  stmt_vec_info next_stmt_info;
2913  int i, stride;
2914  tree first_vec, second_vec, data_ref;
2915
2916  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
2917
2918  /* Initialize the vect stmts of NODE to properly insert the generated
2919     stmts later.  */
2920  for (i = SLP_TREE_VEC_STMTS (node).length ();
2921       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
2922    SLP_TREE_VEC_STMTS (node).quick_push (NULL);
2923
2924  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
2925  for (i = 0; i < ncopies; i++)
2926    {
2927      first_vec = dr_chain[first_vec_indx];
2928      second_vec = dr_chain[second_vec_indx];
2929
2930      /* Generate the permute statement.  */
2931      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
2932				       first_vec, second_vec, mask);
2933      data_ref = make_ssa_name (perm_dest, perm_stmt);
2934      gimple_set_lhs (perm_stmt, data_ref);
2935      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
2936
2937      /* Store the vector statement in NODE.  */
2938      SLP_TREE_VEC_STMTS (node)[stride * i + vect_stmts_counter] = perm_stmt;
2939
2940      first_vec_indx += stride;
2941      second_vec_indx += stride;
2942    }
2943
2944  /* Mark the scalar stmt as vectorized.  */
2945  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
2946  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
2947}
2948
2949
2950/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
2951   return in CURRENT_MASK_ELEMENT its equivalent in target specific
2952   representation.  Check that the mask is valid and return FALSE if not.
2953   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
2954   the next vector, i.e., the current first vector is not needed.  */
2955
2956static bool
2957vect_get_mask_element (gimple stmt, int first_mask_element, int m,
2958                       int mask_nunits, bool only_one_vec, int index,
2959		       unsigned char *mask, int *current_mask_element,
2960                       bool *need_next_vector, int *number_of_mask_fixes,
2961                       bool *mask_fixed, bool *needs_first_vector)
2962{
2963  int i;
2964
2965  /* Convert to target specific representation.  */
2966  *current_mask_element = first_mask_element + m;
2967  /* Adjust the value in case it's a mask for second and third vectors.  */
2968  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
2969
2970  if (*current_mask_element < mask_nunits)
2971    *needs_first_vector = true;
2972
2973  /* We have only one input vector to permute but the mask accesses values in
2974     the next vector as well.  */
2975  if (only_one_vec && *current_mask_element >= mask_nunits)
2976    {
2977      if (dump_enabled_p ())
2978        {
2979          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2980			   "permutation requires at least two vectors ");
2981          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2982          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2983        }
2984
2985      return false;
2986    }
2987
2988  /* The mask requires the next vector.  */
2989  while (*current_mask_element >= mask_nunits * 2)
2990    {
2991      if (*needs_first_vector || *mask_fixed)
2992        {
2993          /* We either need the first vector too or have already moved to the
2994             next vector. In both cases, this permutation needs three
2995             vectors.  */
2996          if (dump_enabled_p ())
2997            {
2998              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2999			       "permutation requires at "
3000			       "least three vectors ");
3001              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3002              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3003            }
3004
3005          return false;
3006        }
3007
3008      /* We move to the next vector, dropping the first one and working with
3009         the second and the third - we need to adjust the values of the mask
3010         accordingly.  */
3011      *current_mask_element -= mask_nunits * *number_of_mask_fixes;
3012
3013      for (i = 0; i < index; i++)
3014        mask[i] -= mask_nunits * *number_of_mask_fixes;
3015
3016      (*number_of_mask_fixes)++;
3017      *mask_fixed = true;
3018    }
3019
3020  *need_next_vector = *mask_fixed;
3021
3022  /* This was the last element of this mask. Start a new one.  */
3023  if (index == mask_nunits - 1)
3024    {
3025      *number_of_mask_fixes = 1;
3026      *mask_fixed = false;
3027      *needs_first_vector = false;
3028    }
3029
3030  return true;
3031}
3032
3033
3034/* Generate vector permute statements from a list of loads in DR_CHAIN.
3035   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3036   permute statements for the SLP node NODE of the SLP instance
3037   SLP_NODE_INSTANCE.  */
3038
3039bool
3040vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3041                              gimple_stmt_iterator *gsi, int vf,
3042                              slp_instance slp_node_instance, bool analyze_only)
3043{
3044  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3045  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3046  tree mask_element_type = NULL_TREE, mask_type;
3047  int i, j, k, nunits, vec_index = 0, scalar_index;
3048  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3049  gimple next_scalar_stmt;
3050  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3051  int first_mask_element;
3052  int index, unroll_factor, current_mask_element, ncopies;
3053  unsigned char *mask;
3054  bool only_one_vec = false, need_next_vector = false;
3055  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
3056  int number_of_mask_fixes = 1;
3057  bool mask_fixed = false;
3058  bool needs_first_vector = false;
3059  machine_mode mode;
3060
3061  mode = TYPE_MODE (vectype);
3062
3063  if (!can_vec_perm_p (mode, false, NULL))
3064    {
3065      if (dump_enabled_p ())
3066        {
3067          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3068			   "no vect permute for ");
3069          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3070          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3071        }
3072      return false;
3073    }
3074
3075  /* The generic VEC_PERM_EXPR code always uses an integral type of the
3076     same size as the vector element being permuted.  */
3077  mask_element_type = lang_hooks.types.type_for_mode
3078		(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))), 1);
3079  mask_type = get_vectype_for_scalar_type (mask_element_type);
3080  nunits = TYPE_VECTOR_SUBPARTS (vectype);
3081  mask = XALLOCAVEC (unsigned char, nunits);
3082  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3083
3084  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
3085     unrolling factor.  */
3086  orig_vec_stmts_num = group_size *
3087                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
3088  if (orig_vec_stmts_num == 1)
3089    only_one_vec = true;
3090
3091  /* Number of copies is determined by the final vectorization factor
3092     relatively to SLP_NODE_INSTANCE unrolling factor.  */
3093  ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
3094
3095  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3096    return false;
3097
3098  /* Generate permutation masks for every NODE. Number of masks for each NODE
3099     is equal to GROUP_SIZE.
3100     E.g., we have a group of three nodes with three loads from the same
3101     location in each node, and the vector size is 4. I.e., we have a
3102     a0b0c0a1b1c1... sequence and we need to create the following vectors:
3103     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3104     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3105     ...
3106
3107     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3108     The last mask is illegal since we assume two operands for permute
3109     operation, and the mask element values can't be outside that range.
3110     Hence, the last mask must be converted into {2,5,5,5}.
3111     For the first two permutations we need the first and the second input
3112     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3113     we need the second and the third vectors: {b1,c1,a2,b2} and
3114     {c2,a3,b3,c3}.  */
3115
3116    {
3117      scalar_index = 0;
3118      index = 0;
3119      vect_stmts_counter = 0;
3120      vec_index = 0;
3121      first_vec_index = vec_index++;
3122      if (only_one_vec)
3123        second_vec_index = first_vec_index;
3124      else
3125        second_vec_index =  vec_index++;
3126
3127      for (j = 0; j < unroll_factor; j++)
3128        {
3129          for (k = 0; k < group_size; k++)
3130            {
3131	      i = SLP_TREE_LOAD_PERMUTATION (node)[k];
3132              first_mask_element = i + j * group_size;
3133              if (!vect_get_mask_element (stmt, first_mask_element, 0,
3134					  nunits, only_one_vec, index,
3135					  mask, &current_mask_element,
3136					  &need_next_vector,
3137					  &number_of_mask_fixes, &mask_fixed,
3138					  &needs_first_vector))
3139		return false;
3140	      gcc_assert (current_mask_element < 2 * nunits);
3141	      mask[index++] = current_mask_element;
3142
3143              if (index == nunits)
3144                {
3145		  index = 0;
3146		  if (!can_vec_perm_p (mode, false, mask))
3147		    {
3148		      if (dump_enabled_p ())
3149			{
3150			  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3151					   vect_location,
3152					   "unsupported vect permute { ");
3153			  for (i = 0; i < nunits; ++i)
3154			    dump_printf (MSG_MISSED_OPTIMIZATION, "%d ",
3155					 mask[i]);
3156			  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3157			}
3158		      return false;
3159		    }
3160
3161                  if (!analyze_only)
3162                    {
3163		      int l;
3164		      tree mask_vec, *mask_elts;
3165		      mask_elts = XALLOCAVEC (tree, nunits);
3166		      for (l = 0; l < nunits; ++l)
3167			mask_elts[l] = build_int_cst (mask_element_type,
3168						      mask[l]);
3169		      mask_vec = build_vector (mask_type, mask_elts);
3170
3171		      if (need_next_vector)
3172                        {
3173                          first_vec_index = second_vec_index;
3174                          second_vec_index = vec_index;
3175                        }
3176
3177                      next_scalar_stmt
3178			  = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
3179
3180                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
3181                               mask_vec, first_vec_index, second_vec_index,
3182			       gsi, node, vectype, dr_chain,
3183			       ncopies, vect_stmts_counter++);
3184                    }
3185                }
3186            }
3187        }
3188    }
3189
3190  return true;
3191}
3192
3193
3194
3195/* Vectorize SLP instance tree in postorder.  */
3196
3197static bool
3198vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3199                            unsigned int vectorization_factor)
3200{
3201  gimple stmt;
3202  bool grouped_store, is_store;
3203  gimple_stmt_iterator si;
3204  stmt_vec_info stmt_info;
3205  unsigned int vec_stmts_size, nunits, group_size;
3206  tree vectype;
3207  int i;
3208  slp_tree child;
3209
3210  if (!node)
3211    return false;
3212
3213  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3214    vect_schedule_slp_instance (child, instance, vectorization_factor);
3215
3216  stmt = SLP_TREE_SCALAR_STMTS (node)[0];
3217  stmt_info = vinfo_for_stmt (stmt);
3218
3219  /* VECTYPE is the type of the destination.  */
3220  vectype = STMT_VINFO_VECTYPE (stmt_info);
3221  nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (vectype);
3222  group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3223
3224  /* For each SLP instance calculate number of vector stmts to be created
3225     for the scalar stmts in each node of the SLP tree.  Number of vector
3226     elements in one vector iteration is the number of scalar elements in
3227     one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
3228     size.  */
3229  vec_stmts_size = (vectorization_factor * group_size) / nunits;
3230
3231  if (!SLP_TREE_VEC_STMTS (node).exists ())
3232    {
3233      SLP_TREE_VEC_STMTS (node).create (vec_stmts_size);
3234      SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
3235    }
3236
3237  if (dump_enabled_p ())
3238    {
3239      dump_printf_loc (MSG_NOTE,vect_location,
3240		       "------>vectorizing SLP node starting from: ");
3241      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3242      dump_printf (MSG_NOTE, "\n");
3243    }
3244
3245  /* Loads should be inserted before the first load.  */
3246  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
3247      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
3248      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
3249      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
3250    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
3251  else if (is_pattern_stmt_p (stmt_info))
3252    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
3253  else
3254    si = gsi_for_stmt (stmt);
3255
3256  /* Stores should be inserted just before the last store.  */
3257  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
3258      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
3259    {
3260      gimple last_store = vect_find_last_store_in_slp_instance (instance);
3261      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
3262       last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
3263      si = gsi_for_stmt (last_store);
3264    }
3265
3266  /* Mark the first element of the reduction chain as reduction to properly
3267     transform the node.  In the analysis phase only the last element of the
3268     chain is marked as reduction.  */
3269  if (GROUP_FIRST_ELEMENT (stmt_info) && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3270      && GROUP_FIRST_ELEMENT (stmt_info) == stmt)
3271    {
3272      STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3273      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3274    }
3275
3276  is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
3277  return is_store;
3278}
3279
3280/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
3281   For loop vectorization this is done in vectorizable_call, but for SLP
3282   it needs to be deferred until end of vect_schedule_slp, because multiple
3283   SLP instances may refer to the same scalar stmt.  */
3284
3285static void
3286vect_remove_slp_scalar_calls (slp_tree node)
3287{
3288  gimple stmt, new_stmt;
3289  gimple_stmt_iterator gsi;
3290  int i;
3291  slp_tree child;
3292  tree lhs;
3293  stmt_vec_info stmt_info;
3294
3295  if (!node)
3296    return;
3297
3298  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3299    vect_remove_slp_scalar_calls (child);
3300
3301  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
3302    {
3303      if (!is_gimple_call (stmt) || gimple_bb (stmt) == NULL)
3304	continue;
3305      stmt_info = vinfo_for_stmt (stmt);
3306      if (stmt_info == NULL
3307	  || is_pattern_stmt_p (stmt_info)
3308	  || !PURE_SLP_STMT (stmt_info))
3309	continue;
3310      lhs = gimple_call_lhs (stmt);
3311      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
3312      set_vinfo_for_stmt (new_stmt, stmt_info);
3313      set_vinfo_for_stmt (stmt, NULL);
3314      STMT_VINFO_STMT (stmt_info) = new_stmt;
3315      gsi = gsi_for_stmt (stmt);
3316      gsi_replace (&gsi, new_stmt, false);
3317      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
3318    }
3319}
3320
3321/* Generate vector code for all SLP instances in the loop/basic block.  */
3322
3323bool
3324vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
3325{
3326  vec<slp_instance> slp_instances;
3327  slp_instance instance;
3328  unsigned int i, vf;
3329  bool is_store = false;
3330
3331  if (loop_vinfo)
3332    {
3333      slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
3334      vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3335    }
3336  else
3337    {
3338      slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3339      vf = 1;
3340    }
3341
3342  FOR_EACH_VEC_ELT (slp_instances, i, instance)
3343    {
3344      /* Schedule the tree of INSTANCE.  */
3345      is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
3346                                             instance, vf);
3347      if (dump_enabled_p ())
3348	dump_printf_loc (MSG_NOTE, vect_location,
3349                         "vectorizing stmts using SLP.\n");
3350    }
3351
3352  FOR_EACH_VEC_ELT (slp_instances, i, instance)
3353    {
3354      slp_tree root = SLP_INSTANCE_TREE (instance);
3355      gimple store;
3356      unsigned int j;
3357      gimple_stmt_iterator gsi;
3358
3359      /* Remove scalar call stmts.  Do not do this for basic-block
3360	 vectorization as not all uses may be vectorized.
3361	 ???  Why should this be necessary?  DCE should be able to
3362	 remove the stmts itself.
3363	 ???  For BB vectorization we can as well remove scalar
3364	 stmts starting from the SLP tree root if they have no
3365	 uses.  */
3366      if (loop_vinfo)
3367	vect_remove_slp_scalar_calls (root);
3368
3369      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store)
3370                  && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
3371        {
3372          if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
3373            break;
3374
3375         if (is_pattern_stmt_p (vinfo_for_stmt (store)))
3376           store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (store));
3377          /* Free the attached stmt_vec_info and remove the stmt.  */
3378          gsi = gsi_for_stmt (store);
3379	  unlink_stmt_vdef (store);
3380          gsi_remove (&gsi, true);
3381	  release_defs (store);
3382          free_stmt_vec_info (store);
3383        }
3384    }
3385
3386  return is_store;
3387}
3388
3389
3390/* Vectorize the basic block.  */
3391
3392void
3393vect_slp_transform_bb (basic_block bb)
3394{
3395  bb_vec_info bb_vinfo = vec_info_for_bb (bb);
3396  gimple_stmt_iterator si;
3397
3398  gcc_assert (bb_vinfo);
3399
3400  if (dump_enabled_p ())
3401    dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB\n");
3402
3403  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
3404    {
3405      gimple stmt = gsi_stmt (si);
3406      stmt_vec_info stmt_info;
3407
3408      if (dump_enabled_p ())
3409        {
3410          dump_printf_loc (MSG_NOTE, vect_location,
3411                           "------>SLPing statement: ");
3412          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3413          dump_printf (MSG_NOTE, "\n");
3414        }
3415
3416      stmt_info = vinfo_for_stmt (stmt);
3417      gcc_assert (stmt_info);
3418
3419      /* Schedule all the SLP instances when the first SLP stmt is reached.  */
3420      if (STMT_SLP_TYPE (stmt_info))
3421        {
3422          vect_schedule_slp (NULL, bb_vinfo);
3423          break;
3424        }
3425    }
3426
3427  if (dump_enabled_p ())
3428    dump_printf_loc (MSG_NOTE, vect_location,
3429		     "BASIC BLOCK VECTORIZED\n");
3430
3431  destroy_bb_vec_info (bb_vinfo);
3432}
3433