1/* Loop Vectorization
2   Copyright (C) 2003-2015 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4   Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "dumpfile.h"
26#include "tm.h"
27#include "hash-set.h"
28#include "machmode.h"
29#include "vec.h"
30#include "double-int.h"
31#include "input.h"
32#include "alias.h"
33#include "symtab.h"
34#include "wide-int.h"
35#include "inchash.h"
36#include "tree.h"
37#include "fold-const.h"
38#include "stor-layout.h"
39#include "predict.h"
40#include "hard-reg-set.h"
41#include "function.h"
42#include "dominance.h"
43#include "cfg.h"
44#include "cfganal.h"
45#include "basic-block.h"
46#include "gimple-pretty-print.h"
47#include "tree-ssa-alias.h"
48#include "internal-fn.h"
49#include "gimple-expr.h"
50#include "is-a.h"
51#include "gimple.h"
52#include "gimplify.h"
53#include "gimple-iterator.h"
54#include "gimplify-me.h"
55#include "gimple-ssa.h"
56#include "tree-phinodes.h"
57#include "ssa-iterators.h"
58#include "stringpool.h"
59#include "tree-ssanames.h"
60#include "tree-ssa-loop-ivopts.h"
61#include "tree-ssa-loop-manip.h"
62#include "tree-ssa-loop-niter.h"
63#include "tree-pass.h"
64#include "cfgloop.h"
65#include "hashtab.h"
66#include "rtl.h"
67#include "flags.h"
68#include "statistics.h"
69#include "real.h"
70#include "fixed-value.h"
71#include "insn-config.h"
72#include "expmed.h"
73#include "dojump.h"
74#include "explow.h"
75#include "calls.h"
76#include "emit-rtl.h"
77#include "varasm.h"
78#include "stmt.h"
79#include "expr.h"
80#include "recog.h"
81#include "insn-codes.h"
82#include "optabs.h"
83#include "params.h"
84#include "diagnostic-core.h"
85#include "tree-chrec.h"
86#include "tree-scalar-evolution.h"
87#include "tree-vectorizer.h"
88#include "target.h"
89
90/* Loop Vectorization Pass.
91
92   This pass tries to vectorize loops.
93
94   For example, the vectorizer transforms the following simple loop:
95
96        short a[N]; short b[N]; short c[N]; int i;
97
98        for (i=0; i<N; i++){
99          a[i] = b[i] + c[i];
100        }
101
102   as if it were manually vectorized by rewriting the source code into:
103
104        typedef int __attribute__((mode(V8HI))) v8hi;
105        short a[N];  short b[N]; short c[N];   int i;
106        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107        v8hi va, vb, vc;
108
109        for (i=0; i<N/8; i++){
110          vb = pb[i];
111          vc = pc[i];
112          va = vb + vc;
113          pa[i] = va;
114        }
115
116        The main entry to this pass is vectorize_loops(), in which
117   the vectorizer applies a set of analyses on a given set of loops,
118   followed by the actual vectorization transformation for the loops that
119   have successfully passed the analysis phase.
120        Throughout this pass we make a distinction between two types of
121   data: scalars (which are represented by SSA_NAMES), and memory references
122   ("data-refs").  These two types of data require different handling both
123   during analysis and transformation. The types of data-refs that the
124   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
125   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
126   accesses are required to have a simple (consecutive) access pattern.
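
        For instance (an illustration, not an exhaustive list), in the loop
   above the accesses "a[i]", "b[i]" and "c[i]" are data-refs with such a
   simple consecutive pattern, whereas an access like "b[2*i]" would not be.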
127
128   Analysis phase:
129   ===============
130        The driver for the analysis phase is vect_analyze_loop().
131   It applies a set of analyses, some of which rely on the scalar evolution
132   analyzer (scev) developed by Sebastian Pop.
133
134        During the analysis phase the vectorizer records some information
135   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136   loop, as well as general information about the loop as a whole, which is
137   recorded in a "loop_vec_info" struct attached to each loop.
138
139   Transformation phase:
140   =====================
141        The loop transformation phase scans all the stmts in the loop, and
142   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143   the loop that needs to be vectorized.  It inserts the vector code sequence
144   just before the scalar stmt S, and records a pointer to the vector code
145   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146   attached to S).  This pointer will be used for the vectorization of following
147   stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148   otherwise, we rely on dead code elimination for removing it.
149
150        For example, say stmt S1 was vectorized into stmt VS1:
151
152   VS1: vb = px[i];
153   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154   S2:  a = b;
155
156   To vectorize stmt S2, the vectorizer first finds the stmt that defines
157   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
159   resulting sequence would be:
160
161   VS1: vb = px[i];
162   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163   VS2: va = vb;
164   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
165
166        Operands that are not SSA_NAMEs are data-refs that appear in
167   load/store operations (like 'x[i]' in S1), and are handled differently.
168
169   Target modeling:
170   =================
171        Currently the only target-specific information that is used is the
172   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173   Targets that can support different sizes of vectors will, for now, need
174   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
175   flexibility will be added in the future.
176
177        Since we only vectorize operations whose vector form can be
178   expressed using existing tree codes, to verify that an operation is
179   supported, the vectorizer checks the relevant optab at the relevant
180   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
181   the value found is CODE_FOR_nothing, then there's no target support, and
182   we can't vectorize the stmt.
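
        For illustration only (a sketch, not code from this pass), checking
   target support for the V8HI addition in the example above amounts to
   something like:

        if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
          return false;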
183
184   For additional information on this project see:
185   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
186*/
187
188static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
189
190/* Function vect_determine_vectorization_factor
191
192   Determine the vectorization factor (VF).  VF is the number of data elements
193   that are operated upon in parallel in a single iteration of the vectorized
194   loop.  For example, when vectorizing a loop that operates on 4-byte elements
195   on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
196   elements can fit in a single vector register.
197
198   We currently support vectorization of loops in which all types operated upon
199   are of the same size.  Therefore this function currently sets VF according to
200   the size of the types operated upon, and fails if there are multiple sizes
201   in the loop.
202
203   VF is also the factor by which the loop iterations are strip-mined, e.g.:
204   original loop:
205        for (i=0; i<N; i++){
206          a[i] = b[i] + c[i];
207        }
208
209   vectorized loop:
210        for (i=0; i<N; i+=VF){
211          a[i:VF] = b[i:VF] + c[i:VF];
212        }
213*/
214
215static bool
216vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
217{
218  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220  int nbbs = loop->num_nodes;
221  unsigned int vectorization_factor = 0;
222  tree scalar_type;
223  gphi *phi;
224  tree vectype;
225  unsigned int nunits;
226  stmt_vec_info stmt_info;
227  int i;
228  HOST_WIDE_INT dummy;
229  gimple stmt, pattern_stmt = NULL;
230  gimple_seq pattern_def_seq = NULL;
231  gimple_stmt_iterator pattern_def_si = gsi_none ();
232  bool analyze_pattern_stmt = false;
233
234  if (dump_enabled_p ())
235    dump_printf_loc (MSG_NOTE, vect_location,
236                     "=== vect_determine_vectorization_factor ===\n");
237
238  for (i = 0; i < nbbs; i++)
239    {
240      basic_block bb = bbs[i];
241
242      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243	   gsi_next (&si))
244	{
245	  phi = si.phi ();
246	  stmt_info = vinfo_for_stmt (phi);
247	  if (dump_enabled_p ())
248	    {
249	      dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251	      dump_printf (MSG_NOTE, "\n");
252	    }
253
254	  gcc_assert (stmt_info);
255
256	  if (STMT_VINFO_RELEVANT_P (stmt_info))
257            {
258	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259              scalar_type = TREE_TYPE (PHI_RESULT (phi));
260
261	      if (dump_enabled_p ())
262		{
263		  dump_printf_loc (MSG_NOTE, vect_location,
264                                   "get vectype for scalar type:  ");
265		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266                  dump_printf (MSG_NOTE, "\n");
267		}
268
269	      vectype = get_vectype_for_scalar_type (scalar_type);
270	      if (!vectype)
271		{
272		  if (dump_enabled_p ())
273		    {
274		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275                                       "not vectorized: unsupported "
276                                       "data-type ");
277		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278                                         scalar_type);
279                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
280		    }
281		  return false;
282		}
283	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
284
285	      if (dump_enabled_p ())
286		{
287		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289                  dump_printf (MSG_NOTE, "\n");
290		}
291
292	      nunits = TYPE_VECTOR_SUBPARTS (vectype);
293	      if (dump_enabled_p ())
294		dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295                                 nunits);
296
297	      if (!vectorization_factor
298		  || (nunits > vectorization_factor))
299		vectorization_factor = nunits;
300	    }
301	}
302
303      for (gimple_stmt_iterator si = gsi_start_bb (bb);
304	   !gsi_end_p (si) || analyze_pattern_stmt;)
305        {
306          tree vf_vectype;
307
308          if (analyze_pattern_stmt)
309	    stmt = pattern_stmt;
310          else
311            stmt = gsi_stmt (si);
312
313          stmt_info = vinfo_for_stmt (stmt);
314
315	  if (dump_enabled_p ())
316	    {
317	      dump_printf_loc (MSG_NOTE, vect_location,
318                               "==> examining statement: ");
319	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320              dump_printf (MSG_NOTE, "\n");
321	    }
322
323	  gcc_assert (stmt_info);
324
325	  /* Skip stmts which do not need to be vectorized.  */
326	  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327	       && !STMT_VINFO_LIVE_P (stmt_info))
328	      || gimple_clobber_p (stmt))
329            {
330              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
334                {
335                  stmt = pattern_stmt;
336                  stmt_info = vinfo_for_stmt (pattern_stmt);
337                  if (dump_enabled_p ())
338                    {
339                      dump_printf_loc (MSG_NOTE, vect_location,
340                                       "==> examining pattern statement: ");
341                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342                      dump_printf (MSG_NOTE, "\n");
343                    }
344                }
345              else
346	        {
347	          if (dump_enabled_p ())
348	            dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349                  gsi_next (&si);
350	          continue;
351                }
352	    }
353          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357            analyze_pattern_stmt = true;
358
359	  /* If a pattern statement has def stmts, analyze them too.  */
360	  if (is_pattern_stmt_p (stmt_info))
361	    {
362	      if (pattern_def_seq == NULL)
363		{
364		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365		  pattern_def_si = gsi_start (pattern_def_seq);
366		}
367	      else if (!gsi_end_p (pattern_def_si))
368		gsi_next (&pattern_def_si);
369	      if (pattern_def_seq != NULL)
370		{
371		  gimple pattern_def_stmt = NULL;
372		  stmt_vec_info pattern_def_stmt_info = NULL;
373
374		  while (!gsi_end_p (pattern_def_si))
375		    {
376		      pattern_def_stmt = gsi_stmt (pattern_def_si);
377		      pattern_def_stmt_info
378			= vinfo_for_stmt (pattern_def_stmt);
379		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381			break;
382		      gsi_next (&pattern_def_si);
383		    }
384
385		  if (!gsi_end_p (pattern_def_si))
386		    {
387		      if (dump_enabled_p ())
388			{
389			  dump_printf_loc (MSG_NOTE, vect_location,
390                                           "==> examining pattern def stmt: ");
391			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392                                            pattern_def_stmt, 0);
393                          dump_printf (MSG_NOTE, "\n");
394			}
395
396		      stmt = pattern_def_stmt;
397		      stmt_info = pattern_def_stmt_info;
398		    }
399		  else
400		    {
401		      pattern_def_si = gsi_none ();
402		      analyze_pattern_stmt = false;
403		    }
404		}
405	      else
406		analyze_pattern_stmt = false;
407	    }
408
409	  if (gimple_get_lhs (stmt) == NULL_TREE
410	      /* MASK_STORE has no lhs, but is ok.  */
411	      && (!is_gimple_call (stmt)
412		  || !gimple_call_internal_p (stmt)
413		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
414	    {
415	      if (is_gimple_call (stmt))
416		{
417		  /* Ignore calls with no lhs.  These must be calls to
418		     #pragma omp simd functions, and the vectorization factor
419		     they really need can't be determined until
420		     vectorizable_simd_clone_call.  */
421		  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
422		    {
423		      pattern_def_seq = NULL;
424		      gsi_next (&si);
425		    }
426		  continue;
427		}
428	      if (dump_enabled_p ())
429		{
430	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431                                   "not vectorized: irregular stmt.");
432		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
433                                    0);
434                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
435		}
436	      return false;
437	    }
438
439	  if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
440	    {
441	      if (dump_enabled_p ())
442	        {
443	          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444                                   "not vectorized: vector stmt in loop:");
445	          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
447	        }
448	      return false;
449	    }
450
451	  if (STMT_VINFO_VECTYPE (stmt_info))
452	    {
453	      /* The only case in which a vectype has already been set is for stmts
454	         that contain a dataref, or for "pattern-stmts" (stmts
455		 generated by the vectorizer to represent/replace a certain
456		 idiom).  */
457	      gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458			  || is_pattern_stmt_p (stmt_info)
459			  || !gsi_end_p (pattern_def_si));
460	      vectype = STMT_VINFO_VECTYPE (stmt_info);
461	    }
462	  else
463	    {
464	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465	      if (is_gimple_call (stmt)
466		  && gimple_call_internal_p (stmt)
467		  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469	      else
470		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471	      if (dump_enabled_p ())
472		{
473		  dump_printf_loc (MSG_NOTE, vect_location,
474                                   "get vectype for scalar type:  ");
475		  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476                  dump_printf (MSG_NOTE, "\n");
477		}
478	      vectype = get_vectype_for_scalar_type (scalar_type);
479	      if (!vectype)
480		{
481		  if (dump_enabled_p ())
482		    {
483		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484                                       "not vectorized: unsupported "
485                                       "data-type ");
486		      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487                                         scalar_type);
488                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
489		    }
490		  return false;
491		}
492
493	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
494
495	      if (dump_enabled_p ())
496		{
497		  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499                  dump_printf (MSG_NOTE, "\n");
500		}
501            }
502
503	  /* The vectorization factor is determined by the smallest
504	     scalar type (or the largest vector size, but we only
505	     support one vector size per loop).  */
506	  scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507						       &dummy);
508	  if (dump_enabled_p ())
509	    {
510	      dump_printf_loc (MSG_NOTE, vect_location,
511                               "get vectype for scalar type:  ");
512	      dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513              dump_printf (MSG_NOTE, "\n");
514	    }
515	  vf_vectype = get_vectype_for_scalar_type (scalar_type);
516	  if (!vf_vectype)
517	    {
518	      if (dump_enabled_p ())
519		{
520		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521                                   "not vectorized: unsupported data-type ");
522		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523                                     scalar_type);
524                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525		}
526	      return false;
527	    }
528
529	  if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530	       != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531	    {
532	      if (dump_enabled_p ())
533		{
534		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535                                   "not vectorized: different sized vector "
536                                   "types in statement, ");
537		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538                                     vectype);
539		  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540		  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541                                     vf_vectype);
542                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543		}
544	      return false;
545	    }
546
547	  if (dump_enabled_p ())
548	    {
549	      dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550	      dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551              dump_printf (MSG_NOTE, "\n");
552	    }
553
554	  nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555	  if (dump_enabled_p ())
556	    dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557	  if (!vectorization_factor
558	      || (nunits > vectorization_factor))
559	    vectorization_factor = nunits;
560
561	  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562	    {
563	      pattern_def_seq = NULL;
564	      gsi_next (&si);
565	    }
566        }
567    }
568
569  /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
570  if (dump_enabled_p ())
571    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572                     vectorization_factor);
573  if (vectorization_factor <= 1)
574    {
575      if (dump_enabled_p ())
576        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577                         "not vectorized: unsupported data-type\n");
578      return false;
579    }
580  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581
582  return true;
583}
584
585
586/* Function vect_is_simple_iv_evolution.
587
588   FORNOW: A simple evolution of an induction variable in the loop is
589   considered a polynomial evolution.  */
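
/* For instance (illustrative only): for the IV "i" of
   "for (i = 0; i < n; i++)" in loop number 1, ACCESS_FN is the chrec
   {0, +, 1}_1, from which this function extracts *INIT = 0 and *STEP = 1.
   An evolution whose step is itself a chrec, e.g. {0, +, {1, +, 1}_1}_1,
   is not considered simple.  */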
590
591static bool
592vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593                             tree * step)
594{
595  tree init_expr;
596  tree step_expr;
597  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598  basic_block bb;
599
600  /* When there is no evolution in this loop, the evolution function
601     is not "simple".  */
602  if (evolution_part == NULL_TREE)
603    return false;
604
605  /* When the evolution is a polynomial of degree >= 2
606     the evolution function is not "simple".  */
607  if (tree_is_chrec (evolution_part))
608    return false;
609
610  step_expr = evolution_part;
611  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
612
613  if (dump_enabled_p ())
614    {
615      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617      dump_printf (MSG_NOTE, ",  init: ");
618      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619      dump_printf (MSG_NOTE, "\n");
620    }
621
622  *init = init_expr;
623  *step = step_expr;
624
625  if (TREE_CODE (step_expr) != INTEGER_CST
626      && (TREE_CODE (step_expr) != SSA_NAME
627	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631		  || !flag_associative_math)))
632      && (TREE_CODE (step_expr) != REAL_CST
633	  || !flag_associative_math))
634    {
635      if (dump_enabled_p ())
636        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637                         "step unknown.\n");
638      return false;
639    }
640
641  return true;
642}
643
644/* Function vect_analyze_scalar_cycles_1.
645
646   Examine the cross iteration def-use cycles of scalar variables
647   in LOOP.  LOOP_VINFO represents the loop that is now being
648   considered for vectorization (can be LOOP, or an outer-loop
649   enclosing LOOP).  */
650
651static void
652vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
653{
654  basic_block bb = loop->header;
655  tree init, step;
656  auto_vec<gimple, 64> worklist;
657  gphi_iterator gsi;
658  bool double_reduc;
659
660  if (dump_enabled_p ())
661    dump_printf_loc (MSG_NOTE, vect_location,
662                     "=== vect_analyze_scalar_cycles ===\n");
663
664  /* First - identify all inductions.  Reduction detection assumes that all the
665     inductions have been identified; therefore, this order must not be
666     changed.  */
667  for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
668    {
669      gphi *phi = gsi.phi ();
670      tree access_fn = NULL;
671      tree def = PHI_RESULT (phi);
672      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
673
674      if (dump_enabled_p ())
675	{
676	  dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677	  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678          dump_printf (MSG_NOTE, "\n");
679	}
680
681      /* Skip virtual phi's.  The data dependences that are associated with
682         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
683      if (virtual_operand_p (def))
684	continue;
685
686      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
687
688      /* Analyze the evolution function.  */
689      access_fn = analyze_scalar_evolution (loop, def);
690      if (access_fn)
691	{
692	  STRIP_NOPS (access_fn);
693	  if (dump_enabled_p ())
694	    {
695	      dump_printf_loc (MSG_NOTE, vect_location,
696                               "Access function of PHI: ");
697	      dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698              dump_printf (MSG_NOTE, "\n");
699	    }
700	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701	    = evolution_part_in_loop_num (access_fn, loop->num);
702	}
703
704      if (!access_fn
705	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707	      && TREE_CODE (step) != INTEGER_CST))
708	{
709	  worklist.safe_push (phi);
710	  continue;
711	}
712
713      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
714
715      if (dump_enabled_p ())
716	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
718    }
719
720
721  /* Second - identify all reductions and nested cycles.  */
722  while (worklist.length () > 0)
723    {
724      gimple phi = worklist.pop ();
725      tree def = PHI_RESULT (phi);
726      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727      gimple reduc_stmt;
728      bool nested_cycle;
729
730      if (dump_enabled_p ())
731        {
732          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734          dump_printf (MSG_NOTE, "\n");
735        }
736
737      gcc_assert (!virtual_operand_p (def)
738		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
739
740      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742						&double_reduc);
743      if (reduc_stmt)
744        {
745          if (double_reduc)
746            {
747              if (dump_enabled_p ())
748                dump_printf_loc (MSG_NOTE, vect_location,
749				 "Detected double reduction.\n");
750
751              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753                                                    vect_double_reduction_def;
754            }
755          else
756            {
757              if (nested_cycle)
758                {
759                  if (dump_enabled_p ())
760                    dump_printf_loc (MSG_NOTE, vect_location,
761				     "Detected vectorizable nested cycle.\n");
762
763                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765                                                             vect_nested_cycle;
766                }
767              else
768                {
769                  if (dump_enabled_p ())
770                    dump_printf_loc (MSG_NOTE, vect_location,
771				     "Detected reduction.\n");
772
773                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775                                                           vect_reduction_def;
776                  /* Store the reduction cycles for possible vectorization in
777                     loop-aware SLP.  */
778                  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
779                }
780            }
781        }
782      else
783        if (dump_enabled_p ())
784          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785			   "Unknown def-use cycle pattern.\n");
786    }
787}
788
789
790/* Function vect_analyze_scalar_cycles.
791
792   Examine the cross-iteration def-use cycles of scalar variables by
793   analyzing the loop-header PHIs of scalar variables.  Classify each
794   cycle as one of the following: invariant, induction, reduction, unknown.
795   We do that for the loop represented by LOOP_VINFO, and also for its
796   inner loop, if one exists.
797   Examples for scalar cycles:
798
799   Example1: reduction:
800
801              loop1:
802              for (i=0; i<N; i++)
803                 sum += a[i];
804
805   Example2: induction:
806
807              loop2:
808              for (i=0; i<N; i++)
809                 a[i] = i;  */
810
811static void
812vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
813{
814  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
815
816  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
817
818  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819     Reductions in such an inner loop therefore have different properties than
820     the reductions in the nest that gets vectorized:
821     1. When vectorized, they are executed in the same order as in the original
822        scalar loop, so we can't change the order of computation when
823        vectorizing them.
824     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825        current checks are too strict.  */
826
827  if (loop->inner)
828    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
829}
830
831
832/* Function vect_get_loop_niters.
833
834   Determine how many iterations the loop executes and place that number
835   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
836   in NUMBER_OF_ITERATIONSM1.
837
838   Return the loop exit condition.  */
839
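/* For example (illustrative): for a loop such as "for (i = 0; i < 100; i++)"
   the latch executes 99 times, so *NUMBER_OF_ITERATIONSM1 is 99 and
   *NUMBER_OF_ITERATIONS (the number of header executions) is 100.  */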
840
841static gcond *
842vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843		      tree *number_of_iterationsm1)
844{
845  tree niters;
846
847  if (dump_enabled_p ())
848    dump_printf_loc (MSG_NOTE, vect_location,
849		     "=== get_loop_niters ===\n");
850
851  niters = number_of_latch_executions (loop);
852  *number_of_iterationsm1 = niters;
853
854  /* We want the number of loop header executions which is the number
855     of latch executions plus one.
856     ???  For UINT_MAX latch executions this number overflows to zero
857     for loops like do { n++; } while (n != 0);  */
858  if (niters && !chrec_contains_undetermined (niters))
859    niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860			  build_int_cst (TREE_TYPE (niters), 1));
861  *number_of_iterations = niters;
862
863  return get_loop_exit_condition (loop);
864}
865
866
867/* Function bb_in_loop_p
868
869   Used as predicate for dfs order traversal of the loop bbs.  */
870
871static bool
872bb_in_loop_p (const_basic_block bb, const void *data)
873{
874  const struct loop *const loop = (const struct loop *)data;
875  if (flow_bb_inside_loop_p (loop, bb))
876    return true;
877  return false;
878}
879
880
881/* Function new_loop_vec_info.
882
883   Create and initialize a new loop_vec_info struct for LOOP, as well as
884   stmt_vec_info structs for all the stmts in LOOP.  */
885
886static loop_vec_info
887new_loop_vec_info (struct loop *loop)
888{
889  loop_vec_info res;
890  basic_block *bbs;
891  gimple_stmt_iterator si;
892  unsigned int i, nbbs;
893
894  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895  LOOP_VINFO_LOOP (res) = loop;
896
897  bbs = get_loop_body (loop);
898
899  /* Create/Update stmt_info for all stmts in the loop.  */
900  for (i = 0; i < loop->num_nodes; i++)
901    {
902      basic_block bb = bbs[i];
903
904      /* BBs in a nested inner-loop will have been already processed (because
905         we will have called vect_analyze_loop_form for any nested inner-loop).
906         Therefore, for stmts in an inner-loop we just want to update the
907         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908         loop_info of the outer-loop we are currently considering to vectorize
909         (instead of the loop_info of the inner-loop).
910         For stmts in other BBs we need to create a stmt_info from scratch.  */
911      if (bb->loop_father != loop)
912        {
913          /* Inner-loop bb.  */
914          gcc_assert (loop->inner && bb->loop_father == loop->inner);
915          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
916            {
917              gimple phi = gsi_stmt (si);
918              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919              loop_vec_info inner_loop_vinfo =
920                STMT_VINFO_LOOP_VINFO (stmt_info);
921              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
923            }
924          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
925           {
926              gimple stmt = gsi_stmt (si);
927              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928              loop_vec_info inner_loop_vinfo =
929                 STMT_VINFO_LOOP_VINFO (stmt_info);
930              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
932           }
933        }
934      else
935        {
936          /* bb in current nest.  */
937          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
938            {
939              gimple phi = gsi_stmt (si);
940              gimple_set_uid (phi, 0);
941              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
942            }
943
944          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
945            {
946              gimple stmt = gsi_stmt (si);
947              gimple_set_uid (stmt, 0);
948              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
949            }
950        }
951    }
952
953  /* CHECKME: We want to visit all BBs before their successors (except for
954     latch blocks, for which this assertion wouldn't hold).  In the simple
955     case of the loop forms we allow, a dfs order of the BBs would be the same
956     as reversed postorder traversal, so we are safe.  */
957
958   free (bbs);
959   bbs = XCNEWVEC (basic_block, loop->num_nodes);
960   nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961                              bbs, loop->num_nodes, loop);
962   gcc_assert (nbbs == loop->num_nodes);
963
964  LOOP_VINFO_BBS (res) = bbs;
965  LOOP_VINFO_NITERSM1 (res) = NULL;
966  LOOP_VINFO_NITERS (res) = NULL;
967  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969  LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972  LOOP_VINFO_VECT_FACTOR (res) = 0;
973  LOOP_VINFO_LOOP_NEST (res).create (3);
974  LOOP_VINFO_DATAREFS (res).create (10);
975  LOOP_VINFO_DDRS (res).create (10 * 10);
976  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977  LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979  LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980	     PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981  LOOP_VINFO_GROUPED_STORES (res).create (10);
982  LOOP_VINFO_REDUCTIONS (res).create (10);
983  LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984  LOOP_VINFO_SLP_INSTANCES (res).create (10);
985  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
990
991  return res;
992}
993
994
995/* Function destroy_loop_vec_info.
996
997   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998   stmts in the loop.  */
999
1000void
1001destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1002{
1003  struct loop *loop;
1004  basic_block *bbs;
1005  int nbbs;
1006  gimple_stmt_iterator si;
1007  int j;
1008  vec<slp_instance> slp_instances;
1009  slp_instance instance;
1010  bool swapped;
1011
1012  if (!loop_vinfo)
1013    return;
1014
1015  loop = LOOP_VINFO_LOOP (loop_vinfo);
1016
1017  bbs = LOOP_VINFO_BBS (loop_vinfo);
1018  nbbs = clean_stmts ? loop->num_nodes : 0;
1019  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1020
1021  for (j = 0; j < nbbs; j++)
1022    {
1023      basic_block bb = bbs[j];
1024      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025        free_stmt_vec_info (gsi_stmt (si));
1026
1027      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1028        {
1029          gimple stmt = gsi_stmt (si);
1030
1031	  /* We may have broken canonical form by moving a constant
1032	     into RHS1 of a commutative op.  Fix such occurrences.  */
1033	  if (swapped && is_gimple_assign (stmt))
1034	    {
1035	      enum tree_code code = gimple_assign_rhs_code (stmt);
1036
1037	      if ((code == PLUS_EXPR
1038		   || code == POINTER_PLUS_EXPR
1039		   || code == MULT_EXPR)
1040		  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041		swap_ssa_operands (stmt,
1042				   gimple_assign_rhs1_ptr (stmt),
1043				   gimple_assign_rhs2_ptr (stmt));
1044	    }
1045
1046	  /* Free stmt_vec_info.  */
1047	  free_stmt_vec_info (stmt);
1048          gsi_next (&si);
1049        }
1050    }
1051
1052  free (LOOP_VINFO_BBS (loop_vinfo));
1053  vect_destroy_datarefs (loop_vinfo, NULL);
1054  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055  LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059  FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060    vect_free_slp_instance (instance);
1061
1062  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063  LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064  LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1066
1067  delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068  LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1069
1070  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1071
1072  free (loop_vinfo);
1073  loop->aux = NULL;
1074}
1075
1076
1077/* Function vect_analyze_loop_1.
1078
1079   Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080   for it. The different analyses will record information in the
1081   loop_vec_info struct.  This is a subset of the analyses applied in
1082   vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083   that is now considered for (outer-loop) vectorization.  */
1084
1085static loop_vec_info
1086vect_analyze_loop_1 (struct loop *loop)
1087{
1088  loop_vec_info loop_vinfo;
1089
1090  if (dump_enabled_p ())
1091    dump_printf_loc (MSG_NOTE, vect_location,
1092		     "===== analyze_loop_nest_1 =====\n");
1093
1094  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
1095
1096  loop_vinfo = vect_analyze_loop_form (loop);
1097  if (!loop_vinfo)
1098    {
1099      if (dump_enabled_p ())
1100        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101			 "bad inner-loop form.\n");
1102      return NULL;
1103    }
1104
1105  return loop_vinfo;
1106}
1107
1108
1109/* Function vect_analyze_loop_form.
1110
1111   Verify that certain CFG restrictions hold, including:
1112   - the loop has a pre-header
1113   - the loop has a single entry and exit
1114   - the loop exit condition is simple enough, and the number of iterations
1115     can be analyzed (a countable loop).  */
1116
1117loop_vec_info
1118vect_analyze_loop_form (struct loop *loop)
1119{
1120  loop_vec_info loop_vinfo;
1121  gcond *loop_cond;
1122  tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123  loop_vec_info inner_loop_vinfo = NULL;
1124
1125  if (dump_enabled_p ())
1126    dump_printf_loc (MSG_NOTE, vect_location,
1127		     "=== vect_analyze_loop_form ===\n");
1128
1129  /* Different restrictions apply when we are considering an inner-most loop,
1130     vs. an outer (nested) loop.
1131     (FORNOW. May want to relax some of these restrictions in the future).  */
1132
1133  if (!loop->inner)
1134    {
1135      /* Inner-most loop.  We currently require that the number of BBs is
1136	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1137	 look like this:
1138
1139                        (pre-header)
1140                           |
1141                          header <--------+
1142                           | |            |
1143                           | +--> latch --+
1144                           |
1145                        (exit-bb)  */
1146
1147      if (loop->num_nodes != 2)
1148        {
1149          if (dump_enabled_p ())
1150            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151			     "not vectorized: control flow in loop.\n");
1152          return NULL;
1153        }
1154
1155      if (empty_block_p (loop->header))
1156	{
1157	  if (dump_enabled_p ())
1158	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159			     "not vectorized: empty loop.\n");
1160	  return NULL;
1161	}
1162    }
1163  else
1164    {
1165      struct loop *innerloop = loop->inner;
1166      edge entryedge;
1167
1168      /* Nested loop. We currently require that the loop is doubly-nested,
1169	 contains a single inner loop, and the number of BBs is exactly 5.
1170	 Vectorizable outer-loops look like this:
1171
1172			(pre-header)
1173			   |
1174			  header <---+
1175			   |         |
1176		          inner-loop |
1177			   |         |
1178			  tail ------+
1179			   |
1180		        (exit-bb)
1181
1182	 The inner-loop has the properties expected of inner-most loops
1183	 as described above.  */
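
      /* For example (illustrative), a doubly-nested loop such as the
	 following matches the CFG shape sketched above (header, inner-loop,
	 tail), with the inner loop executed sequentially inside each outer
	 iteration:

	    for (i = 0; i < N; i++)
	      {
		s = 0;
		for (j = 0; j < M; j++)
		  s += a[i][j];
		b[i] = s;
	      }  */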
1184
1185      if ((loop->inner)->inner || (loop->inner)->next)
1186	{
1187	  if (dump_enabled_p ())
1188	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189			     "not vectorized: multiple nested loops.\n");
1190	  return NULL;
1191	}
1192
1193      /* Analyze the inner-loop.  */
1194      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195      if (!inner_loop_vinfo)
1196	{
1197	  if (dump_enabled_p ())
1198            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199			     "not vectorized: Bad inner loop.\n");
1200	  return NULL;
1201	}
1202
1203      if (!expr_invariant_in_loop_p (loop,
1204					LOOP_VINFO_NITERS (inner_loop_vinfo)))
1205	{
1206	  if (dump_enabled_p ())
1207	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208			     "not vectorized: inner-loop count not"
1209                             " invariant.\n");
1210	  destroy_loop_vec_info (inner_loop_vinfo, true);
1211	  return NULL;
1212	}
1213
1214      if (loop->num_nodes != 5)
1215        {
1216	  if (dump_enabled_p ())
1217	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218			     "not vectorized: control flow in loop.\n");
1219	  destroy_loop_vec_info (inner_loop_vinfo, true);
1220	  return NULL;
1221        }
1222
1223      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224      entryedge = EDGE_PRED (innerloop->header, 0);
1225      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226	entryedge = EDGE_PRED (innerloop->header, 1);
1227
1228      if (entryedge->src != loop->header
1229	  || !single_exit (innerloop)
1230	  || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1231	{
1232	  if (dump_enabled_p ())
1233	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234			     "not vectorized: unsupported outerloop form.\n");
1235	  destroy_loop_vec_info (inner_loop_vinfo, true);
1236	  return NULL;
1237	}
1238
1239      if (dump_enabled_p ())
1240        dump_printf_loc (MSG_NOTE, vect_location,
1241			 "Considering outer-loop vectorization.\n");
1242    }
1243
1244  if (!single_exit (loop)
1245      || EDGE_COUNT (loop->header->preds) != 2)
1246    {
1247      if (dump_enabled_p ())
1248        {
1249          if (!single_exit (loop))
1250	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251			     "not vectorized: multiple exits.\n");
1252          else if (EDGE_COUNT (loop->header->preds) != 2)
1253	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254			     "not vectorized: too many incoming edges.\n");
1255        }
1256      if (inner_loop_vinfo)
1257	destroy_loop_vec_info (inner_loop_vinfo, true);
1258      return NULL;
1259    }
1260
1261  /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262     that the loop is represented as a do-while (with a proper if-guard
1263     before the loop if needed), where the loop header contains all the
1264     executable statements, and the latch is empty.  */
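  /* E.g. (illustrative), a loop written as "for (i = 0; i < n; i++) body;"
     is expected here in the form
     "if (n > 0) { i = 0; do { body; i++; } while (i < n); }".  */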
1265  if (!empty_block_p (loop->latch)
1266      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267    {
1268      if (dump_enabled_p ())
1269	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270			 "not vectorized: latch block not empty.\n");
1271      if (inner_loop_vinfo)
1272	destroy_loop_vec_info (inner_loop_vinfo, true);
1273      return NULL;
1274    }
1275
1276  /* Make sure there exists a single-predecessor exit bb:  */
1277  if (!single_pred_p (single_exit (loop)->dest))
1278    {
1279      edge e = single_exit (loop);
1280      if (!(e->flags & EDGE_ABNORMAL))
1281	{
1282	  split_loop_exit_edge (e);
1283	  if (dump_enabled_p ())
1284	    dump_printf (MSG_NOTE, "split exit edge.\n");
1285	}
1286      else
1287	{
1288	  if (dump_enabled_p ())
1289	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290			     "not vectorized: abnormal loop exit edge.\n");
1291	  if (inner_loop_vinfo)
1292	    destroy_loop_vec_info (inner_loop_vinfo, true);
1293	  return NULL;
1294	}
1295    }
1296
1297  loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298				    &number_of_iterationsm1);
1299  if (!loop_cond)
1300    {
1301      if (dump_enabled_p ())
1302	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303			 "not vectorized: complicated exit condition.\n");
1304      if (inner_loop_vinfo)
1305	destroy_loop_vec_info (inner_loop_vinfo, true);
1306      return NULL;
1307    }
1308
1309  if (!number_of_iterations
1310      || chrec_contains_undetermined (number_of_iterations))
1311    {
1312      if (dump_enabled_p ())
1313	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314			 "not vectorized: number of iterations cannot be "
1315			 "computed.\n");
1316      if (inner_loop_vinfo)
1317	destroy_loop_vec_info (inner_loop_vinfo, true);
1318      return NULL;
1319    }
1320
1321  if (integer_zerop (number_of_iterations))
1322    {
1323      if (dump_enabled_p ())
1324	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325			 "not vectorized: number of iterations = 0.\n");
1326      if (inner_loop_vinfo)
1327        destroy_loop_vec_info (inner_loop_vinfo, true);
1328      return NULL;
1329    }
1330
1331  loop_vinfo = new_loop_vec_info (loop);
1332  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1335
1336  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1337    {
1338      if (dump_enabled_p ())
1339        {
1340          dump_printf_loc (MSG_NOTE, vect_location,
1341			   "Symbolic number of iterations is ");
1342	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343          dump_printf (MSG_NOTE, "\n");
1344        }
1345    }
1346
1347  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1348
1349  /* CHECKME: May want to keep it around in the future.  */
1350  if (inner_loop_vinfo)
1351    destroy_loop_vec_info (inner_loop_vinfo, false);
1352
1353  gcc_assert (!loop->aux);
1354  loop->aux = loop_vinfo;
1355  return loop_vinfo;
1356}
1357
1358
1359/* Function vect_analyze_loop_operations.
1360
1361   Scan the loop stmts and make sure they are all vectorizable.  */
1362
1363static bool
1364vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1365{
1366  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368  int nbbs = loop->num_nodes;
1369  unsigned int vectorization_factor = 0;
1370  int i;
1371  stmt_vec_info stmt_info;
1372  bool need_to_vectorize = false;
1373  int min_profitable_iters;
1374  int min_scalar_loop_bound;
1375  unsigned int th;
1376  bool only_slp_in_loop = true, ok;
1377  HOST_WIDE_INT max_niter;
1378  HOST_WIDE_INT estimated_niter;
1379  int min_profitable_estimate;
1380
1381  if (dump_enabled_p ())
1382    dump_printf_loc (MSG_NOTE, vect_location,
1383		     "=== vect_analyze_loop_operations ===\n");
1384
1385  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387  if (slp)
1388    {
1389      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390	 the vectorization factor of the loop is the unrolling factor required by
1391	 the SLP instances.  If that unrolling factor is 1, we say that we
1392	 perform pure SLP on the loop: cross-iteration parallelism is not
1393	 exploited.  */
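      /* As an illustration: if the loop vectorization factor computed earlier
	 is 4 and the SLP instances require an unrolling factor of 2, the
	 combined factor below is least_common_multiple (4, 2) = 4.  */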
1394      for (i = 0; i < nbbs; i++)
1395	{
1396	  basic_block bb = bbs[i];
1397	  for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398	       gsi_next (&si))
1399	    {
1400	      gimple stmt = gsi_stmt (si);
1401	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402	      gcc_assert (stmt_info);
1403	      if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404		   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405		  && !PURE_SLP_STMT (stmt_info))
1406		/* STMT needs both SLP and loop-based vectorization.  */
1407		only_slp_in_loop = false;
1408	    }
1409	}
1410
1411      if (only_slp_in_loop)
1412	vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413      else
1414	vectorization_factor = least_common_multiple (vectorization_factor,
1415				LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1416
1417      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1418      if (dump_enabled_p ())
1419	dump_printf_loc (MSG_NOTE, vect_location,
1420			 "Updating vectorization factor to %d\n",
1421			 vectorization_factor);
1422    }
1423
1424  for (i = 0; i < nbbs; i++)
1425    {
1426      basic_block bb = bbs[i];
1427
1428      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1429	   gsi_next (&si))
1430        {
1431          gphi *phi = si.phi ();
1432          ok = true;
1433
1434          stmt_info = vinfo_for_stmt (phi);
1435          if (dump_enabled_p ())
1436            {
1437              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1438              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1439              dump_printf (MSG_NOTE, "\n");
1440            }
1441
1442          /* Inner-loop loop-closed exit phi in outer-loop vectorization
1443             (i.e., a phi in the tail of the outer-loop).  */
1444          if (! is_loop_header_bb_p (bb))
1445            {
1446              /* FORNOW: we currently don't support the case that these phis
1447                 are not used in the outer loop (unless it is a double reduction,
1448                 i.e., this phi is vect_reduction_def), because this case
1449                 requires us to actually do something here.  */
1450              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1451                   || STMT_VINFO_LIVE_P (stmt_info))
1452                  && STMT_VINFO_DEF_TYPE (stmt_info)
1453                     != vect_double_reduction_def)
1454                {
1455                  if (dump_enabled_p ())
1456		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457				     "Unsupported loop-closed phi in "
1458				     "outer-loop.\n");
1459                  return false;
1460                }
1461
1462              /* If PHI is used in the outer loop, we check that its operand
1463                 is defined in the inner loop.  */
1464              if (STMT_VINFO_RELEVANT_P (stmt_info))
1465                {
1466                  tree phi_op;
1467                  gimple op_def_stmt;
1468
1469                  if (gimple_phi_num_args (phi) != 1)
1470                    return false;
1471
1472                  phi_op = PHI_ARG_DEF (phi, 0);
1473                  if (TREE_CODE (phi_op) != SSA_NAME)
1474                    return false;
1475
1476                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1477		  if (gimple_nop_p (op_def_stmt)
1478		      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1479		      || !vinfo_for_stmt (op_def_stmt))
1480                    return false;
1481
1482                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1483                        != vect_used_in_outer
1484                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1485                           != vect_used_in_outer_by_reduction)
1486                    return false;
1487                }
1488
1489              continue;
1490            }
1491
1492          gcc_assert (stmt_info);
1493
1494          if (STMT_VINFO_LIVE_P (stmt_info))
1495            {
1496              /* FORNOW: not yet supported.  */
1497              if (dump_enabled_p ())
1498		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499				 "not vectorized: value used after loop.\n");
1500              return false;
1501            }
1502
1503          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1504              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1505            {
1506              /* A scalar-dependence cycle that we don't support.  */
1507              if (dump_enabled_p ())
1508		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509				 "not vectorized: scalar dependence cycle.\n");
1510              return false;
1511            }
1512
1513          if (STMT_VINFO_RELEVANT_P (stmt_info))
1514            {
1515              need_to_vectorize = true;
1516              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1517                ok = vectorizable_induction (phi, NULL, NULL);
1518            }
1519
1520          if (!ok)
1521            {
1522              if (dump_enabled_p ())
1523                {
1524		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525				   "not vectorized: relevant phi not "
1526				   "supported: ");
1527                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1528                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1529                }
1530	      return false;
1531            }
1532        }
1533
1534      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1535	   gsi_next (&si))
1536        {
1537          gimple stmt = gsi_stmt (si);
1538	  if (!gimple_clobber_p (stmt)
1539	      && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1540	    return false;
1541        }
1542    } /* bbs */
1543
1544  /* All operations in the loop are either irrelevant (they deal with loop
1545     control, or are dead), or only used outside the loop and can be moved
1546     out of the loop (e.g. invariants, inductions).  The loop can be
1547     optimized away by scalar optimizations.  We're better off not
1548     touching this loop.  */
1549  if (!need_to_vectorize)
1550    {
1551      if (dump_enabled_p ())
1552        dump_printf_loc (MSG_NOTE, vect_location,
1553			 "All the computation can be taken out of the loop.\n");
1554      if (dump_enabled_p ())
1555	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556			 "not vectorized: redundant loop. no profit to "
1557			 "vectorize.\n");
1558      return false;
1559    }
1560
1561  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1562    dump_printf_loc (MSG_NOTE, vect_location,
1563		     "vectorization_factor = %d, niters = "
1564		     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1565		     LOOP_VINFO_INT_NITERS (loop_vinfo));
1566
1567  if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1568       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1569      || ((max_niter = max_stmt_executions_int (loop)) != -1
1570	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1571    {
1572      if (dump_enabled_p ())
1573	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574			 "not vectorized: iteration count too small.\n");
1575      if (dump_enabled_p ())
1576	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577			 "not vectorized: iteration count smaller than "
1578			 "vectorization factor.\n");
1579      return false;
1580    }
1581
1582  /* Analyze cost.  Decide if worth while to vectorize.  */
1583
1584  /* Once VF is set, SLP costs should be updated since the number of created
1585     vector stmts depends on VF.  */
1586  vect_update_slp_costs_according_to_vf (loop_vinfo);
1587
1588  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1589				      &min_profitable_estimate);
1590  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1591
1592  if (min_profitable_iters < 0)
1593    {
1594      if (dump_enabled_p ())
1595	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596			 "not vectorized: vectorization not profitable.\n");
1597      if (dump_enabled_p ())
1598	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599			 "not vectorized: vector version will never be "
1600			 "profitable.\n");
1601      return false;
1602    }
1603
1604  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1605                            * vectorization_factor) - 1);
1606
1607
1608  /* Use the cost model only if it is more conservative than user specified
1609     threshold.  */
1610
1611  th = (unsigned) min_scalar_loop_bound;
1612  if (min_profitable_iters
1613      && (!min_scalar_loop_bound
1614          || min_profitable_iters > min_scalar_loop_bound))
1615    th = (unsigned) min_profitable_iters;
1616
1617  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
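  /* Illustrative example (numbers are not from the sources): with
     "--param min-vect-loop-bound=3" and a vectorization factor of 4,
     min_scalar_loop_bound is 3 * 4 - 1 = 11; if the cost model computed
     min_profitable_iters == 20, the larger (more conservative) value 20
     becomes the threshold TH.  */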
1618
1619  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1620      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1621    {
1622      if (dump_enabled_p ())
1623	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624			 "not vectorized: vectorization not profitable.\n");
1625      if (dump_enabled_p ())
1626        dump_printf_loc (MSG_NOTE, vect_location,
1627			 "not vectorized: iteration count smaller than user "
1628			 "specified loop bound parameter or minimum profitable "
1629			 "iterations (whichever is more conservative).\n");
1630      return false;
1631    }
1632
1633  if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1634      && ((unsigned HOST_WIDE_INT) estimated_niter
1635          <= MAX (th, (unsigned)min_profitable_estimate)))
1636    {
1637      if (dump_enabled_p ())
1638	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639			 "not vectorized: estimated iteration count too "
1640                         "small.\n");
1641      if (dump_enabled_p ())
1642        dump_printf_loc (MSG_NOTE, vect_location,
1643			 "not vectorized: estimated iteration count smaller "
1644                         "than specified loop bound parameter or minimum "
1645                         "profitable iterations (whichever is more "
1646                         "conservative).\n");
1647      return false;
1648    }
1649
1650  return true;
1651}
1652
1653
1654/* Function vect_analyze_loop_2.
1655
1656   Apply a set of analyses on LOOP, and create a loop_vec_info struct
1657   for it.  The different analyses will record information in the
1658   loop_vec_info struct.  */
1659static bool
1660vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1661{
1662  bool ok, slp = false;
1663  int max_vf = MAX_VECTORIZATION_FACTOR;
1664  int min_vf = 2;
1665  unsigned int th;
1666  unsigned int n_stmts = 0;
1667
1668  /* Find all data references in the loop (which correspond to vdefs/vuses)
1669     and analyze their evolution in the loop.  Also adjust the minimal
1670     vectorization factor according to the loads and stores.
1671
1672     FORNOW: Handle only simple array references whose alignment
1673     can be forced, and aligned pointer-references.  */
1674
1675  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1676  if (!ok)
1677    {
1678      if (dump_enabled_p ())
1679	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680			 "bad data references.\n");
1681      return false;
1682    }
1683
1684  /* Classify all cross-iteration scalar data-flow cycles.
1685     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1686
1687  vect_analyze_scalar_cycles (loop_vinfo);
1688
1689  vect_pattern_recog (loop_vinfo, NULL);
1690
1691  /* Analyze the access patterns of the data-refs in the loop (consecutive,
1692     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1693
1694  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1695  if (!ok)
1696    {
1697      if (dump_enabled_p ())
1698	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699			 "bad data access.\n");
1700      return false;
1701    }
1702
1703  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1704
1705  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1706  if (!ok)
1707    {
1708      if (dump_enabled_p ())
1709	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710			 "unexpected pattern.\n");
1711      return false;
1712    }
1713
1714  /* Analyze data dependences between the data-refs in the loop
1715     and adjust the maximum vectorization factor according to
1716     the dependences.
1717     FORNOW: fail at the first data dependence that we encounter.  */
1718
1719  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1720  if (!ok
1721      || max_vf < min_vf)
1722    {
1723      if (dump_enabled_p ())
1724	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725			     "bad data dependence.\n");
1726      return false;
1727    }
1728
1729  ok = vect_determine_vectorization_factor (loop_vinfo);
1730  if (!ok)
1731    {
1732      if (dump_enabled_p ())
1733	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734			 "can't determine vectorization factor.\n");
1735      return false;
1736    }
1737  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1738    {
1739      if (dump_enabled_p ())
1740	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741			 "bad data dependence.\n");
1742      return false;
1743    }
1744
1745  /* Analyze the alignment of the data-refs in the loop.
1746     Fail if a data reference is found that cannot be vectorized.  */
1747
1748  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1749  if (!ok)
1750    {
1751      if (dump_enabled_p ())
1752	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753			 "bad data alignment.\n");
1754      return false;
1755    }
1756
1757  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1758     It is important to call pruning after vect_analyze_data_ref_accesses,
1759     since we use grouping information gathered by interleaving analysis.  */
1760  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1761  if (!ok)
1762    {
1763      if (dump_enabled_p ())
1764	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765			 "number of versioning for alias "
1766			 "run-time tests exceeds %d "
1767			 "(--param vect-max-version-for-alias-checks)\n",
1768			 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1769      return false;
1770    }
1771
1772  /* This pass will decide on using loop versioning and/or loop peeling in
1773     order to enhance the alignment of data references in the loop.  */
1774
1775  ok = vect_enhance_data_refs_alignment (loop_vinfo);
1776  if (!ok)
1777    {
1778      if (dump_enabled_p ())
1779	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780			 "bad data alignment.\n");
1781      return false;
1782    }
1783
1784  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1785  ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1786  if (ok)
1787    {
1788      /* Decide which of the possible SLP instances to vectorize with SLP.  */
1789      slp = vect_make_slp_decision (loop_vinfo);
1790
1791      /* Find stmts that need to be both vectorized and SLPed.  */
1792      vect_detect_hybrid_slp (loop_vinfo);
1793    }
1794  else
1795    return false;
1796
1797  /* Scan all the operations in the loop and make sure they are
1798     vectorizable.  */
1799
1800  ok = vect_analyze_loop_operations (loop_vinfo, slp);
1801  if (!ok)
1802    {
1803      if (dump_enabled_p ())
1804	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805			 "bad operation or unsupported loop bound.\n");
1806      return false;
1807    }
1808
1809  /* Decide whether we need to create an epilogue loop to handle
1810     remaining scalar iterations.  */
1811  th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1812        / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1813       * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1814
1815  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1816      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1817    {
1818      if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1819		   - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1820	  < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1821	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1822    }
1823  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1824	   || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1825	       < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1826               /* In case of versioning, check if the maximum number of
1827                  iterations is greater than th.  If they are identical,
1828                  the epilogue is unnecessary.  */
1829	       && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1830	            && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1831                   || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1832		        (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1833    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1834
1835  /* If an epilogue loop is required make sure we can create one.  */
1836  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1837      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1838    {
1839      if (dump_enabled_p ())
1840        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1841      if (!vect_can_advance_ivs_p (loop_vinfo)
1842	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1843					   single_exit (LOOP_VINFO_LOOP
1844							 (loop_vinfo))))
1845        {
1846          if (dump_enabled_p ())
1847	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848			     "not vectorized: can't create required "
1849			     "epilog loop\n");
1850          return false;
1851        }
1852    }
1853
1854  return true;
1855}
1856
1857/* Function vect_analyze_loop.
1858
1859   Apply a set of analyses on LOOP, and create a loop_vec_info struct
1860   for it.  The different analyses will record information in the
1861   loop_vec_info struct.  */
1862loop_vec_info
1863vect_analyze_loop (struct loop *loop)
1864{
1865  loop_vec_info loop_vinfo;
1866  unsigned int vector_sizes;
1867
1868  /* Autodetect first vector size we try.  */
1869  current_vector_size = 0;
1870  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1871
1872  if (dump_enabled_p ())
1873    dump_printf_loc (MSG_NOTE, vect_location,
1874		     "===== analyze_loop_nest =====\n");
1875
1876  if (loop_outer (loop)
1877      && loop_vec_info_for_loop (loop_outer (loop))
1878      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1879    {
1880      if (dump_enabled_p ())
1881	dump_printf_loc (MSG_NOTE, vect_location,
1882			 "outer-loop already vectorized.\n");
1883      return NULL;
1884    }
1885
1886  while (1)
1887    {
1888      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1889      loop_vinfo = vect_analyze_loop_form (loop);
1890      if (!loop_vinfo)
1891	{
1892	  if (dump_enabled_p ())
1893	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894			     "bad loop form.\n");
1895	  return NULL;
1896	}
1897
1898      if (vect_analyze_loop_2 (loop_vinfo))
1899	{
1900	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1901
1902	  return loop_vinfo;
1903	}
1904
1905      destroy_loop_vec_info (loop_vinfo, true);
1906
1907      vector_sizes &= ~current_vector_size;
1908      if (vector_sizes == 0
1909	  || current_vector_size == 0)
1910	return NULL;
1911
1912      /* Try the next biggest vector size.  */
1913      current_vector_size = 1 << floor_log2 (vector_sizes);
1914      if (dump_enabled_p ())
1915	dump_printf_loc (MSG_NOTE, vect_location,
1916			 "***** Re-trying analysis with "
1917			 "vector size %d\n", current_vector_size);
1918    }
1919}
1920
1921
1922/* Function reduction_code_for_scalar_code
1923
1924   Input:
1925   CODE - tree_code of a reduction operation.
1926
1927   Output:
1928   REDUC_CODE - the corresponding tree-code to be used to reduce the
1929      vector of partial results into a single scalar result, or ERROR_MARK
1930      if the operation is a supported reduction operation, but does not have
1931      such a tree-code.
1932
1933   Return FALSE if CODE currently cannot be vectorized as reduction.  */
1934
1935static bool
1936reduction_code_for_scalar_code (enum tree_code code,
1937                                enum tree_code *reduc_code)
1938{
1939  switch (code)
1940    {
1941      case MAX_EXPR:
1942        *reduc_code = REDUC_MAX_EXPR;
1943        return true;
1944
1945      case MIN_EXPR:
1946        *reduc_code = REDUC_MIN_EXPR;
1947        return true;
1948
1949      case PLUS_EXPR:
1950        *reduc_code = REDUC_PLUS_EXPR;
1951        return true;
1952
1953      case MULT_EXPR:
1954      case MINUS_EXPR:
1955      case BIT_IOR_EXPR:
1956      case BIT_XOR_EXPR:
1957      case BIT_AND_EXPR:
1958        *reduc_code = ERROR_MARK;
1959        return true;
1960
1961      default:
1962       return false;
1963    }
1964}
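
/* For example (illustrative): a loop computing s = s > a[i] ? s : a[i]
   is a MAX_EXPR reduction; the vector loop keeps a vector of partial
   maxima and REDUC_MAX_EXPR collapses it into the final scalar result.
   For MULT_EXPR and the bitwise codes above no reduc_* tree code exists,
   so ERROR_MARK is returned and the epilogue instead reduces the vector
   step by step (e.g. with whole-vector shifts or element extraction).  */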
1965
1966
1967/* Error reporting helper for vect_is_simple_reduction_1 below.  The GIMPLE
1968   statement STMT is printed, preceded by the message MSG.  */
1969
1970static void
1971report_vect_op (int msg_type, gimple stmt, const char *msg)
1972{
1973  dump_printf_loc (msg_type, vect_location, "%s", msg);
1974  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1975  dump_printf (msg_type, "\n");
1976}
1977
1978
1979/* Detect SLP reduction of the form:
1980
1981   #a1 = phi <a5, a0>
1982   a2 = operation (a1)
1983   a3 = operation (a2)
1984   a4 = operation (a3)
1985   a5 = operation (a4)
1986
1987   #a = phi <a5>
1988
1989   PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1990   FIRST_STMT is the first reduction stmt in the chain
1991   (a2 = operation (a1)).
1992
1993   Return TRUE if a reduction chain was detected.  */
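
/* A source-level loop that typically gives rise to such a chain might look
   like (illustrative sketch):

     s = ...;
     for (i = 0; i < N; i++)
       s = s + b[4*i] + b[4*i+1] + b[4*i+2] + b[4*i+3];

   where after gimplification each "+" is a separate statement (a2..a5 in
   the pattern above) and s is the reduction phi (a1).  */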
1994
1995static bool
1996vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1997{
1998  struct loop *loop = (gimple_bb (phi))->loop_father;
1999  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2000  enum tree_code code;
2001  gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2002  stmt_vec_info use_stmt_info, current_stmt_info;
2003  tree lhs;
2004  imm_use_iterator imm_iter;
2005  use_operand_p use_p;
2006  int nloop_uses, size = 0, n_out_of_loop_uses;
2007  bool found = false;
2008
2009  if (loop != vect_loop)
2010    return false;
2011
2012  lhs = PHI_RESULT (phi);
2013  code = gimple_assign_rhs_code (first_stmt);
2014  while (1)
2015    {
2016      nloop_uses = 0;
2017      n_out_of_loop_uses = 0;
2018      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2019        {
2020	  gimple use_stmt = USE_STMT (use_p);
2021	  if (is_gimple_debug (use_stmt))
2022	    continue;
2023
2024          /* Check if we got back to the reduction phi.  */
2025	  if (use_stmt == phi)
2026            {
2027	      loop_use_stmt = use_stmt;
2028              found = true;
2029              break;
2030            }
2031
2032          if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2033            {
2034              if (vinfo_for_stmt (use_stmt)
2035                  && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2036                {
2037                  loop_use_stmt = use_stmt;
2038                  nloop_uses++;
2039                }
2040            }
2041           else
2042             n_out_of_loop_uses++;
2043
2044           /* There can be either a single use in the loop or two uses in
2045              phi nodes.  */
2046           if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2047             return false;
2048        }
2049
2050      if (found)
2051        break;
2052
2053      /* We reached a statement with no loop uses.  */
2054      if (nloop_uses == 0)
2055	return false;
2056
2057      /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2058      if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2059        return false;
2060
2061      if (!is_gimple_assign (loop_use_stmt)
2062	  || code != gimple_assign_rhs_code (loop_use_stmt)
2063	  || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2064        return false;
2065
2066      /* Insert USE_STMT into reduction chain.  */
2067      use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2068      if (current_stmt)
2069        {
2070          current_stmt_info = vinfo_for_stmt (current_stmt);
2071	  GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2072          GROUP_FIRST_ELEMENT (use_stmt_info)
2073            = GROUP_FIRST_ELEMENT (current_stmt_info);
2074        }
2075      else
2076	GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2077
2078      lhs = gimple_assign_lhs (loop_use_stmt);
2079      current_stmt = loop_use_stmt;
2080      size++;
2081   }
2082
2083  if (!found || loop_use_stmt != phi || size < 2)
2084    return false;
2085
2086  /* Swap the operands, if needed, to make the reduction operand be the second
2087     operand.  */
2088  lhs = PHI_RESULT (phi);
2089  next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2090  while (next_stmt)
2091    {
2092      if (gimple_assign_rhs2 (next_stmt) == lhs)
2093	{
2094	  tree op = gimple_assign_rhs1 (next_stmt);
2095          gimple def_stmt = NULL;
2096
2097          if (TREE_CODE (op) == SSA_NAME)
2098            def_stmt = SSA_NAME_DEF_STMT (op);
2099
2100	  /* Check that the other def is either defined in the loop
2101	     ("vect_internal_def"), or it's an induction (defined by a
2102	     loop-header phi-node).  */
2103          if (def_stmt
2104              && gimple_bb (def_stmt)
2105	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2106              && (is_gimple_assign (def_stmt)
2107                  || is_gimple_call (def_stmt)
2108                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109                           == vect_induction_def
2110                  || (gimple_code (def_stmt) == GIMPLE_PHI
2111                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2112                                  == vect_internal_def
2113                      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2114	    {
2115	      lhs = gimple_assign_lhs (next_stmt);
2116	      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2117 	      continue;
2118	    }
2119
2120	  return false;
2121	}
2122      else
2123	{
2124          tree op = gimple_assign_rhs2 (next_stmt);
2125          gimple def_stmt = NULL;
2126
2127          if (TREE_CODE (op) == SSA_NAME)
2128            def_stmt = SSA_NAME_DEF_STMT (op);
2129
2130          /* Check that the other def is either defined in the loop
2131            ("vect_internal_def"), or it's an induction (defined by a
2132            loop-header phi-node).  */
2133          if (def_stmt
2134              && gimple_bb (def_stmt)
2135	      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2136              && (is_gimple_assign (def_stmt)
2137                  || is_gimple_call (def_stmt)
2138                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2139                              == vect_induction_def
2140                  || (gimple_code (def_stmt) == GIMPLE_PHI
2141                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142                                  == vect_internal_def
2143                      && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2144  	    {
2145	      if (dump_enabled_p ())
2146		{
2147		  dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2148		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2149                  dump_printf (MSG_NOTE, "\n");
2150		}
2151
2152	      swap_ssa_operands (next_stmt,
2153	 		         gimple_assign_rhs1_ptr (next_stmt),
2154                                 gimple_assign_rhs2_ptr (next_stmt));
2155	      update_stmt (next_stmt);
2156
2157	      if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2158		LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2159	    }
2160	  else
2161	    return false;
2162        }
2163
2164      lhs = gimple_assign_lhs (next_stmt);
2165      next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2166    }
2167
2168  /* Save the chain for further analysis in SLP detection.  */
2169  first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2170  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2171  GROUP_SIZE (vinfo_for_stmt (first)) = size;
2172
2173  return true;
2174}
2175
2176
2177/* Function vect_is_simple_reduction_1
2178
2179   (1) Detect a cross-iteration def-use cycle that represents a simple
2180   reduction computation.  We look for the following pattern:
2181
2182   loop_header:
2183     a1 = phi < a0, a2 >
2184     a3 = ...
2185     a2 = operation (a3, a1)
2186
2187   or
2188
2189   a3 = ...
2190   loop_header:
2191     a1 = phi < a0, a2 >
2192     a2 = operation (a3, a1)
2193
2194   such that:
2195   1. operation is commutative and associative and it is safe to
2196      change the order of the computation (if CHECK_REDUCTION is true)
2197   2. no uses for a2 in the loop (a2 is used out of the loop)
2198   3. no uses of a1 in the loop besides the reduction operation
2199   4. no uses of a1 outside the loop.
2200
2201   Conditions 1,4 are tested here.
2202   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2203
2204   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2205   nested cycles, if CHECK_REDUCTION is false.
2206
2207   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2208   reductions:
2209
2210     a1 = phi < a0, a2 >
2211     inner loop (def of a3)
2212     a2 = phi < a3 >
2213
2214   If MODIFY is true it also tries to rework the code in-place to enable
2215   detection of more reduction patterns.  For the time being we rewrite
2216   "res -= RHS" into "res += -RHS" when it seems worthwhile.
2217*/
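
/* For instance (illustrative), the classic

     for (i = 0; i < N; i++)
       sum += a[i];

   matches pattern (1): "a1" is the loop-header phi for sum, "a3" is the
   load of a[i], and "a2 = a3 + a1" is the reduction statement returned
   by this function.  */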
2218
2219static gimple
2220vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2221			    bool check_reduction, bool *double_reduc,
2222			    bool modify)
2223{
2224  struct loop *loop = (gimple_bb (phi))->loop_father;
2225  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2226  edge latch_e = loop_latch_edge (loop);
2227  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2228  gimple def_stmt, def1 = NULL, def2 = NULL;
2229  enum tree_code orig_code, code;
2230  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2231  tree type;
2232  int nloop_uses;
2233  tree name;
2234  imm_use_iterator imm_iter;
2235  use_operand_p use_p;
2236  bool phi_def;
2237
2238  *double_reduc = false;
2239
2240  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2241     otherwise, we assume outer loop vectorization.  */
2242  gcc_assert ((check_reduction && loop == vect_loop)
2243              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2244
2245  name = PHI_RESULT (phi);
2246  /* ???  If there are no uses of the PHI result the inner loop reduction
2247     won't be detected as possibly double-reduction by vectorizable_reduction
2248     because that tries to walk the PHI arg from the preheader edge which
2249     can be constant.  See PR60382.  */
2250  if (has_zero_uses (name))
2251    return NULL;
2252  nloop_uses = 0;
2253  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2254    {
2255      gimple use_stmt = USE_STMT (use_p);
2256      if (is_gimple_debug (use_stmt))
2257	continue;
2258
2259      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2260        {
2261          if (dump_enabled_p ())
2262	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263			     "intermediate value used outside loop.\n");
2264
2265          return NULL;
2266        }
2267
2268      if (vinfo_for_stmt (use_stmt)
2269	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2270        nloop_uses++;
2271      if (nloop_uses > 1)
2272        {
2273          if (dump_enabled_p ())
2274	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275			     "reduction used in loop.\n");
2276          return NULL;
2277        }
2278    }
2279
2280  if (TREE_CODE (loop_arg) != SSA_NAME)
2281    {
2282      if (dump_enabled_p ())
2283	{
2284	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285			   "reduction: not ssa_name: ");
2286	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2287          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288	}
2289      return NULL;
2290    }
2291
2292  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2293  if (!def_stmt)
2294    {
2295      if (dump_enabled_p ())
2296	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297			 "reduction: no def_stmt.\n");
2298      return NULL;
2299    }
2300
2301  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302    {
2303      if (dump_enabled_p ())
2304        {
2305          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2306          dump_printf (MSG_NOTE, "\n");
2307        }
2308      return NULL;
2309    }
2310
2311  if (is_gimple_assign (def_stmt))
2312    {
2313      name = gimple_assign_lhs (def_stmt);
2314      phi_def = false;
2315    }
2316  else
2317    {
2318      name = PHI_RESULT (def_stmt);
2319      phi_def = true;
2320    }
2321
2322  nloop_uses = 0;
2323  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324    {
2325      gimple use_stmt = USE_STMT (use_p);
2326      if (is_gimple_debug (use_stmt))
2327	continue;
2328      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2329	  && vinfo_for_stmt (use_stmt)
2330	  && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2331	nloop_uses++;
2332      if (nloop_uses > 1)
2333	{
2334	  if (dump_enabled_p ())
2335	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336			     "reduction used in loop.\n");
2337	  return NULL;
2338	}
2339    }
2340
2341  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2342     defined in the inner loop.  */
2343  if (phi_def)
2344    {
2345      op1 = PHI_ARG_DEF (def_stmt, 0);
2346
2347      if (gimple_phi_num_args (def_stmt) != 1
2348          || TREE_CODE (op1) != SSA_NAME)
2349        {
2350          if (dump_enabled_p ())
2351	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352			     "unsupported phi node definition.\n");
2353
2354          return NULL;
2355        }
2356
2357      def1 = SSA_NAME_DEF_STMT (op1);
2358      if (gimple_bb (def1)
2359	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2360          && loop->inner
2361          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2362          && is_gimple_assign (def1))
2363        {
2364          if (dump_enabled_p ())
2365            report_vect_op (MSG_NOTE, def_stmt,
2366			    "detected double reduction: ");
2367
2368          *double_reduc = true;
2369          return def_stmt;
2370        }
2371
2372      return NULL;
2373    }
2374
2375  code = orig_code = gimple_assign_rhs_code (def_stmt);
2376
2377  /* We can handle "res -= x[i]", which is non-associative, by
2378     simply rewriting it into "res += -x[i]".  Avoid changing the
2379     gimple statement during the first simple checks and only do the
2380     rewrite if we're allowed to change code at all.  */
2381  if (code == MINUS_EXPR
2382      && modify
2383      && (op1 = gimple_assign_rhs1 (def_stmt))
2384      && TREE_CODE (op1) == SSA_NAME
2385      && SSA_NAME_DEF_STMT (op1) == phi)
2386    code = PLUS_EXPR;
2387
2388  if (check_reduction
2389      && (!commutative_tree_code (code) || !associative_tree_code (code)))
2390    {
2391      if (dump_enabled_p ())
2392        report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2393			"reduction: not commutative/associative: ");
2394      return NULL;
2395    }
2396
2397  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2398    {
2399      if (code != COND_EXPR)
2400        {
2401	  if (dump_enabled_p ())
2402	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403			    "reduction: not binary operation: ");
2404
2405          return NULL;
2406        }
2407
2408      op3 = gimple_assign_rhs1 (def_stmt);
2409      if (COMPARISON_CLASS_P (op3))
2410        {
2411          op4 = TREE_OPERAND (op3, 1);
2412          op3 = TREE_OPERAND (op3, 0);
2413        }
2414
2415      op1 = gimple_assign_rhs2 (def_stmt);
2416      op2 = gimple_assign_rhs3 (def_stmt);
2417
2418      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2419        {
2420          if (dump_enabled_p ())
2421            report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422			    "reduction: uses not ssa_names: ");
2423
2424          return NULL;
2425        }
2426    }
2427  else
2428    {
2429      op1 = gimple_assign_rhs1 (def_stmt);
2430      op2 = gimple_assign_rhs2 (def_stmt);
2431
2432      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2433        {
2434          if (dump_enabled_p ())
2435	    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2436			    "reduction: uses not ssa_names: ");
2437
2438          return NULL;
2439        }
2440   }
2441
2442  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2443  if ((TREE_CODE (op1) == SSA_NAME
2444       && !types_compatible_p (type,TREE_TYPE (op1)))
2445      || (TREE_CODE (op2) == SSA_NAME
2446          && !types_compatible_p (type, TREE_TYPE (op2)))
2447      || (op3 && TREE_CODE (op3) == SSA_NAME
2448          && !types_compatible_p (type, TREE_TYPE (op3)))
2449      || (op4 && TREE_CODE (op4) == SSA_NAME
2450          && !types_compatible_p (type, TREE_TYPE (op4))))
2451    {
2452      if (dump_enabled_p ())
2453        {
2454          dump_printf_loc (MSG_NOTE, vect_location,
2455			   "reduction: multiple types: operation type: ");
2456          dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2457          dump_printf (MSG_NOTE, ", operands types: ");
2458          dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459			     TREE_TYPE (op1));
2460          dump_printf (MSG_NOTE, ",");
2461          dump_generic_expr (MSG_NOTE, TDF_SLIM,
2462			     TREE_TYPE (op2));
2463          if (op3)
2464            {
2465              dump_printf (MSG_NOTE, ",");
2466              dump_generic_expr (MSG_NOTE, TDF_SLIM,
2467				 TREE_TYPE (op3));
2468            }
2469
2470          if (op4)
2471            {
2472              dump_printf (MSG_NOTE, ",");
2473              dump_generic_expr (MSG_NOTE, TDF_SLIM,
2474				 TREE_TYPE (op4));
2475            }
2476          dump_printf (MSG_NOTE, "\n");
2477        }
2478
2479      return NULL;
2480    }
2481
2482  /* Check that it's ok to change the order of the computation.
2483     Generally, when vectorizing a reduction we change the order of the
2484     computation.  This may change the behavior of the program in some
2485     cases, so we need to check that this is ok.  One exception is when
2486     vectorizing an outer-loop: the inner-loop is executed sequentially,
2487     and therefore vectorizing reductions in the inner-loop during
2488     outer-loop vectorization is safe.  */
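
  /* Illustrative example of why the order matters for floating point:
     with IEEE single precision, (x + 1e20f) - 1e20f evaluates to 0.0f
     while x + (1e20f - 1e20f) evaluates to x, so reassociating a float
     summation is only allowed under -fassociative-math (checked below).  */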
2489
2490  /* CHECKME: check for !flag_finite_math_only too?  */
2491  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2492      && check_reduction)
2493    {
2494      /* Changing the order of operations changes the semantics.  */
2495      if (dump_enabled_p ())
2496	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2497			"reduction: unsafe fp math optimization: ");
2498      return NULL;
2499    }
2500  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2501	   && check_reduction)
2502    {
2503      /* Changing the order of operations changes the semantics.  */
2504      if (dump_enabled_p ())
2505	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2506			"reduction: unsafe int math optimization: ");
2507      return NULL;
2508    }
2509  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2510    {
2511      /* Changing the order of operations changes the semantics.  */
2512      if (dump_enabled_p ())
2513	report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2514			"reduction: unsafe fixed-point math optimization: ");
2515      return NULL;
2516    }
2517
2518  /* If we detected "res -= x[i]" earlier, rewrite it into
2519     "res += -x[i]" now.  If this turns out to be useless reassoc
2520     will clean it up again.  */
2521  if (orig_code == MINUS_EXPR)
2522    {
2523      tree rhs = gimple_assign_rhs2 (def_stmt);
2524      tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2525      gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2526      gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2527      set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2528							  loop_info, NULL));
2529      gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2530      gimple_assign_set_rhs2 (def_stmt, negrhs);
2531      gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2532      update_stmt (def_stmt);
2533    }
2534
2535  /* Reduction is safe. We're dealing with one of the following:
2536     1) integer arithmetic and no trapv
2537     2) floating point arithmetic, and special flags permit this optimization
2538     3) nested cycle (i.e., outer loop vectorization).  */
2539  if (TREE_CODE (op1) == SSA_NAME)
2540    def1 = SSA_NAME_DEF_STMT (op1);
2541
2542  if (TREE_CODE (op2) == SSA_NAME)
2543    def2 = SSA_NAME_DEF_STMT (op2);
2544
2545  if (code != COND_EXPR
2546      && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2547    {
2548      if (dump_enabled_p ())
2549	report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2550      return NULL;
2551    }
2552
2553  /* Check that one def is the reduction def, defined by PHI,
2554     the other def is either defined in the loop ("vect_internal_def"),
2555     or it's an induction (defined by a loop-header phi-node).  */
2556
2557  if (def2 && def2 == phi
2558      && (code == COND_EXPR
2559	  || !def1 || gimple_nop_p (def1)
2560	  || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2561          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2562              && (is_gimple_assign (def1)
2563		  || is_gimple_call (def1)
2564  	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565                      == vect_induction_def
2566   	          || (gimple_code (def1) == GIMPLE_PHI
2567	              && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2568                          == vect_internal_def
2569 	              && !is_loop_header_bb_p (gimple_bb (def1)))))))
2570    {
2571      if (dump_enabled_p ())
2572	report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2573      return def_stmt;
2574    }
2575
2576  if (def1 && def1 == phi
2577      && (code == COND_EXPR
2578	  || !def2 || gimple_nop_p (def2)
2579	  || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2580          || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2581 	      && (is_gimple_assign (def2)
2582		  || is_gimple_call (def2)
2583	          || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584                      == vect_induction_def
2585 	          || (gimple_code (def2) == GIMPLE_PHI
2586		      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2587                          == vect_internal_def
2588		      && !is_loop_header_bb_p (gimple_bb (def2)))))))
2589    {
2590      if (check_reduction)
2591        {
2592          /* Swap operands (just for simplicity - so that the rest of the code
2593	     can assume that the reduction variable is always the last (second)
2594	     argument).  */
2595          if (dump_enabled_p ())
2596	    report_vect_op (MSG_NOTE, def_stmt,
2597	  	            "detected reduction: need to swap operands: ");
2598
2599          swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2600 			     gimple_assign_rhs2_ptr (def_stmt));
2601
2602	  if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2603	    LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2604        }
2605      else
2606        {
2607          if (dump_enabled_p ())
2608            report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2609        }
2610
2611      return def_stmt;
2612    }
2613
2614  /* Try to find SLP reduction chain.  */
2615  if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2616    {
2617      if (dump_enabled_p ())
2618        report_vect_op (MSG_NOTE, def_stmt,
2619			"reduction: detected reduction chain: ");
2620
2621      return def_stmt;
2622    }
2623
2624  if (dump_enabled_p ())
2625    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2626		    "reduction: unknown pattern: ");
2627
2628  return NULL;
2629}
2630
2631/* Wrapper around vect_is_simple_reduction_1, that won't modify code
2632   in-place.  Arguments as there.  */
2633
2634static gimple
2635vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2636                          bool check_reduction, bool *double_reduc)
2637{
2638  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2639				     double_reduc, false);
2640}
2641
2642/* Wrapper around vect_is_simple_reduction_1, which will modify code
2643   in-place if it enables detection of more reductions.  Arguments
2644   as there.  */
2645
2646gimple
2647vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2648                          bool check_reduction, bool *double_reduc)
2649{
2650  return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2651				     double_reduc, true);
2652}
2653
2654/* Calculate the cost of one scalar iteration of the loop.  */
2655int
2656vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo,
2657				       stmt_vector_for_cost *scalar_cost_vec)
2658{
2659  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2660  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2661  int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2662  int innerloop_iters, i;
2663
2664  /* Count statements in scalar loop.  Using this as scalar cost for a single
2665     iteration for now.
2666
2667     TODO: Add outer loop support.
2668
2669     TODO: Consider assigning different costs to different scalar
2670     statements.  */
2671
2672  /* FORNOW.  */
2673  innerloop_iters = 1;
2674  if (loop->inner)
2675    innerloop_iters = 50; /* FIXME */
2676
2677  for (i = 0; i < nbbs; i++)
2678    {
2679      gimple_stmt_iterator si;
2680      basic_block bb = bbs[i];
2681
2682      if (bb->loop_father == loop->inner)
2683        factor = innerloop_iters;
2684      else
2685        factor = 1;
2686
2687      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2688        {
2689          gimple stmt = gsi_stmt (si);
2690          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2691
2692          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2693            continue;
2694
2695          /* Skip stmts that are not vectorized inside the loop.  */
2696          if (stmt_info
2697              && !STMT_VINFO_RELEVANT_P (stmt_info)
2698              && (!STMT_VINFO_LIVE_P (stmt_info)
2699                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2700	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2701            continue;
2702
2703	  vect_cost_for_stmt kind;
2704          if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2705            {
2706              if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2707               kind = scalar_load;
2708             else
2709               kind = scalar_store;
2710            }
2711          else
2712            kind = scalar_stmt;
2713
2714	  scalar_single_iter_cost
2715	    += record_stmt_cost (scalar_cost_vec, factor, kind,
2716				 NULL, 0, vect_prologue);
2717        }
2718    }
2719  return scalar_single_iter_cost;
2720}
2721
2722/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2723int
2724vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2725                             int *peel_iters_epilogue,
2726                             stmt_vector_for_cost *scalar_cost_vec,
2727			     stmt_vector_for_cost *prologue_cost_vec,
2728			     stmt_vector_for_cost *epilogue_cost_vec)
2729{
2730  int retval = 0;
2731  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2732
2733  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2734    {
2735      *peel_iters_epilogue = vf/2;
2736      if (dump_enabled_p ())
2737        dump_printf_loc (MSG_NOTE, vect_location,
2738			 "cost model: epilogue peel iters set to vf/2 "
2739			 "because loop iterations are unknown.\n");
2740
2741      /* If peeled iterations are known but the number of scalar loop
2742         iterations is unknown, count a taken branch per peeled loop.  */
2743      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2744				 NULL, 0, vect_prologue);
2745      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2746				 NULL, 0, vect_epilogue);
2747    }
2748  else
2749    {
2750      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2751      peel_iters_prologue = niters < peel_iters_prologue ?
2752                            niters : peel_iters_prologue;
2753      *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2754      /* If we need to peel for gaps, but no peeling is required, we have to
2755	 peel VF iterations.  */
2756      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2757        *peel_iters_epilogue = vf;
2758    }
2759
2760  stmt_info_for_cost *si;
2761  int j;
2762  if (peel_iters_prologue)
2763    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2764      retval += record_stmt_cost (prologue_cost_vec,
2765				  si->count * peel_iters_prologue,
2766				  si->kind, NULL, si->misalign,
2767				  vect_prologue);
2768  if (*peel_iters_epilogue)
2769    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2770      retval += record_stmt_cost (epilogue_cost_vec,
2771				  si->count * *peel_iters_epilogue,
2772				  si->kind, NULL, si->misalign,
2773				  vect_epilogue);
2774
2775  return retval;
2776}
2777
2778/* Function vect_estimate_min_profitable_iters
2779
2780   Return the number of iterations required for the vector version of the
2781   loop to be profitable relative to the cost of the scalar version of the
2782   loop.  */
2783
2784static void
2785vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2786				    int *ret_min_profitable_niters,
2787				    int *ret_min_profitable_estimate)
2788{
2789  int min_profitable_iters;
2790  int min_profitable_estimate;
2791  int peel_iters_prologue;
2792  int peel_iters_epilogue;
2793  unsigned vec_inside_cost = 0;
2794  int vec_outside_cost = 0;
2795  unsigned vec_prologue_cost = 0;
2796  unsigned vec_epilogue_cost = 0;
2797  int scalar_single_iter_cost = 0;
2798  int scalar_outside_cost = 0;
2799  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2801  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2802
2803  /* Cost model disabled.  */
2804  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2805    {
2806      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2807      *ret_min_profitable_niters = 0;
2808      *ret_min_profitable_estimate = 0;
2809      return;
2810    }
2811
2812  /* Requires loop versioning tests to handle misalignment.  */
2813  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2814    {
2815      /*  FIXME: Make cost depend on complexity of individual check.  */
2816      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2817      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2818			    vect_prologue);
2819      dump_printf (MSG_NOTE,
2820                   "cost model: Adding cost of checks for loop "
2821                   "versioning to treat misalignment.\n");
2822    }
2823
2824  /* Requires loop versioning with alias checks.  */
2825  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2826    {
2827      /*  FIXME: Make cost depend on complexity of individual check.  */
2828      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
2829      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2830			    vect_prologue);
2831      dump_printf (MSG_NOTE,
2832                   "cost model: Adding cost of checks for loop "
2833                   "versioning for aliasing.\n");
2834    }
2835
2836  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2837      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2838    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2839			  vect_prologue);
2840
2841  /* Count statements in scalar loop.  Using this as scalar cost for a single
2842     iteration for now.
2843
2844     TODO: Add outer loop support.
2845
2846     TODO: Consider assigning different costs to different scalar
2847     statements.  */
2848
2849  auto_vec<stmt_info_for_cost> scalar_cost_vec;
2850  scalar_single_iter_cost
2851     = vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
2852
2853  /* Add additional cost for the peeled instructions in prologue and epilogue
2854     loop.
2855
2856     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2857     at compile-time - we assume it's vf/2 (the worst would be vf-1).
2858
2859     TODO: Build an expression that represents peel_iters for prologue and
2860     epilogue to be used in a run-time test.  */
2861
2862  if (npeel < 0)
2863    {
2864      peel_iters_prologue = vf/2;
2865      dump_printf (MSG_NOTE, "cost model: "
2866                   "prologue peel iters set to vf/2.\n");
2867
2868      /* If peeling for alignment is unknown, loop bound of main loop becomes
2869         unknown.  */
2870      peel_iters_epilogue = vf/2;
2871      dump_printf (MSG_NOTE, "cost model: "
2872                   "epilogue peel iters set to vf/2 because "
2873                   "peeling for alignment is unknown.\n");
2874
2875      /* If peeled iterations are unknown, count a taken branch and a not taken
2876         branch per peeled loop. Even if scalar loop iterations are known,
2877         vector iterations are not known since peeled prologue iterations are
2878         not known. Hence guards remain the same.  */
2879      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2880			    NULL, 0, vect_prologue);
2881      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2882			    NULL, 0, vect_prologue);
2883      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2884			    NULL, 0, vect_epilogue);
2885      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2886			    NULL, 0, vect_epilogue);
2887      stmt_info_for_cost *si;
2888      int j;
2889      FOR_EACH_VEC_ELT (scalar_cost_vec, j, si)
2890	{
2891	  struct _stmt_vec_info *stmt_info
2892	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2893	  (void) add_stmt_cost (target_cost_data,
2894				si->count * peel_iters_prologue,
2895				si->kind, stmt_info, si->misalign,
2896				vect_prologue);
2897	  (void) add_stmt_cost (target_cost_data,
2898				si->count * peel_iters_epilogue,
2899				si->kind, stmt_info, si->misalign,
2900				vect_epilogue);
2901	}
2902    }
2903  else
2904    {
2905      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2906      stmt_info_for_cost *si;
2907      int j;
2908      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2909
2910      prologue_cost_vec.create (2);
2911      epilogue_cost_vec.create (2);
2912      peel_iters_prologue = npeel;
2913
2914      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2915					  &peel_iters_epilogue,
2916					  &scalar_cost_vec,
2917					  &prologue_cost_vec,
2918					  &epilogue_cost_vec);
2919
2920      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2921	{
2922	  struct _stmt_vec_info *stmt_info
2923	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2924	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2925				si->misalign, vect_prologue);
2926	}
2927
2928      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2929	{
2930	  struct _stmt_vec_info *stmt_info
2931	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2932	  (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2933				si->misalign, vect_epilogue);
2934	}
2935
2936      prologue_cost_vec.release ();
2937      epilogue_cost_vec.release ();
2938    }
2939
2940  /* FORNOW: The scalar outside cost is incremented in one of the
2941     following ways:
2942
2943     1. The vectorizer checks for alignment and aliasing and generates
2944     a condition that allows dynamic vectorization.  A cost model
2945     check is ANDED with the versioning condition.  Hence scalar code
2946     path now has the added cost of the versioning check.
2947
2948       if (cost > th & versioning_check)
2949         jmp to vector code
2950
2951     Hence run-time scalar is incremented by not-taken branch cost.
2952
2953     2. The vectorizer then checks if a prologue is required.  If the
2954     cost model check was not done before during versioning, it has to
2955     be done before the prologue check.
2956
2957       if (cost <= th)
2958         prologue = scalar_iters
2959       if (prologue == 0)
2960         jmp to vector code
2961       else
2962         execute prologue
2963       if (prologue == num_iters)
2964	 go to exit
2965
2966     Hence the run-time scalar cost is incremented by a taken branch,
2967     plus a not-taken branch, plus a taken branch cost.
2968
2969     3. The vectorizer then checks if an epilogue is required.  If the
2970     cost model check was not done before during prologue check, it
2971     has to be done with the epilogue check.
2972
2973       if (prologue == 0)
2974         jmp to vector code
2975       else
2976         execute prologue
2977       if (prologue == num_iters)
2978	 go to exit
2979       vector code:
2980         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2981           jmp to epilogue
2982
2983     Hence the run-time scalar cost should be incremented by 2 taken
2984     branches.
2985
2986     TODO: The back end may reorder the BBS's differently and reverse
2987     conditions/branch directions.  Change the estimates below to
2988     something more reasonable.  */
2989
2990  /* If the number of iterations is known and we do not do versioning, we can
2991     decide whether to vectorize at compile time.  Hence the scalar version
2992     does not carry cost model guard costs.  */
2993  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2994      || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2995      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2996    {
2997      /* Cost model check occurs at versioning.  */
2998      if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2999          || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3000	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3001      else
3002	{
3003	  /* Cost model check occurs at prologue generation.  */
3004	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3005	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3006	      + vect_get_stmt_cost (cond_branch_not_taken);
3007	  /* Cost model check occurs at epilogue generation.  */
3008	  else
3009	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3010	}
3011    }
3012
3013  /* Complete the target-specific cost calculations.  */
3014  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3015	       &vec_inside_cost, &vec_epilogue_cost);
3016
3017  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3018
3019  if (dump_enabled_p ())
3020    {
3021      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3022      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3023                   vec_inside_cost);
3024      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3025                   vec_prologue_cost);
3026      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3027                   vec_epilogue_cost);
3028      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3029                   scalar_single_iter_cost);
3030      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3031                   scalar_outside_cost);
3032      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3033                   vec_outside_cost);
3034      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3035                   peel_iters_prologue);
3036      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3037                   peel_iters_epilogue);
3038    }
3039
3040  /* Calculate number of iterations required to make the vector version
3041     profitable, relative to the loop bodies only.  The following condition
3042     must hold true:
3043     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3044     where
3045     SIC = scalar iteration cost, VIC = vector iteration cost,
3046     VOC = vector outside cost, VF = vectorization factor,
3047     PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3048     SOC = scalar outside cost for run time cost model check.  */
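
  /* Worked example with purely illustrative numbers: SIC = 4, VIC = 12,
     VOC = 30, SOC = 6, VF = 4 and no peeling (PL_ITERS = EP_ITERS = 0)
     give

       min_profitable_iters = ((30 - 6) * 4 - 0 - 0) / (4 * 4 - 12) = 24,

     i.e. the vector body starts paying off after roughly 24 iterations;
     the code below then bumps the value by one when the comparison holds
     with equality, clamps it to at least VF, and finally decrements it
     to form the runtime guard "niters <= th".  */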
3049
3050  if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3051    {
3052      if (vec_outside_cost <= 0)
3053        min_profitable_iters = 1;
3054      else
3055        {
3056          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3057				  - vec_inside_cost * peel_iters_prologue
3058                                  - vec_inside_cost * peel_iters_epilogue)
3059                                 / ((scalar_single_iter_cost * vf)
3060                                    - vec_inside_cost);
3061
3062          if ((scalar_single_iter_cost * vf * min_profitable_iters)
3063              <= (((int) vec_inside_cost * min_profitable_iters)
3064                  + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3065            min_profitable_iters++;
3066        }
3067    }
3068  /* vector version will never be profitable.  */
3069  else
3070    {
3071      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3072	warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3073		    "did not happen for a simd loop");
3074
3075      if (dump_enabled_p ())
3076        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3077			 "cost model: the vector iteration cost = %d "
3078			 "divided by the scalar iteration cost = %d "
3079			 "is greater than or equal to the vectorization factor = %d"
3080                         ".\n",
3081			 vec_inside_cost, scalar_single_iter_cost, vf);
3082      *ret_min_profitable_niters = -1;
3083      *ret_min_profitable_estimate = -1;
3084      return;
3085    }
3086
3087  dump_printf (MSG_NOTE,
3088	       "  Calculated minimum iters for profitability: %d\n",
3089	       min_profitable_iters);
3090
3091  min_profitable_iters =
3092	min_profitable_iters < vf ? vf : min_profitable_iters;
3093
3094  /* Because the condition we create is:
3095     if (niters <= min_profitable_iters)
3096       then skip the vectorized loop.  */
3097  min_profitable_iters--;
3098
3099  if (dump_enabled_p ())
3100    dump_printf_loc (MSG_NOTE, vect_location,
3101                     "  Runtime profitability threshold = %d\n",
3102                     min_profitable_iters);
3103
3104  *ret_min_profitable_niters = min_profitable_iters;
3105
  /* Calculate the number of iterations required to make the vector version
     profitable relative to the loop bodies only, this time as a static
     (compile-time) estimate.

     The non-vectorized variant costs SIC * niters and must win over the
     vector variant at the expected loop trip count, i.e. the following
     condition must hold true:
     SIC * niters > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC + SOC  */
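
/* Solving in the same way as for the runtime threshold above (again an
   illustrative rearrangement, valid under the SIC * VF > VIC condition
   already established):

     niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which is the expression evaluated below for MIN_PROFITABLE_ESTIMATE.  */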
3112
3113  if (vec_outside_cost <= 0)
3114    min_profitable_estimate = 1;
3115  else
3116    {
3117      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3118				 - vec_inside_cost * peel_iters_prologue
3119				 - vec_inside_cost * peel_iters_epilogue)
3120				 / ((scalar_single_iter_cost * vf)
3121				   - vec_inside_cost);
3122    }
  min_profitable_estimate--;
3124  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3125  if (dump_enabled_p ())
3126    dump_printf_loc (MSG_NOTE, vect_location,
3127                     "  Static estimate profitability threshold = %d\n",
                      min_profitable_estimate);
3129
3130  *ret_min_profitable_estimate = min_profitable_estimate;
3131}
3132
3133/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3134   vector elements (not bits) for a vector of mode MODE.  */
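/* For example, for a hypothetical 4-element vector mode and OFFSET 2 this
   produces SEL = {2, 3, 4, 5}: in vec_perm terms the selector picks elements
   2 and 3 of the first input followed by elements 0 and 1 of the second
   input, i.e. a shift down by two elements when the second input is a zero
   vector.  */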
3135static void
3136calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3137			      unsigned char *sel)
3138{
3139  unsigned int i, nelt = GET_MODE_NUNITS (mode);
3140
3141  for (i = 0; i < nelt; i++)
3142    sel[i] = (i + offset) & (2*nelt - 1);
3143}
3144
3145/* Checks whether the target supports whole-vector shifts for vectors of mode
3146   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3147   it supports vec_perm_const with masks for all necessary shift amounts.  */
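/* For instance, on a hypothetical 8-element mode the loop below checks
   selectors for shifts by 4, 2 and 1 elements, which matches the set of
   shift amounts used by the shift-based reduction epilogue.  */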
3148static bool
3149have_whole_vector_shift (enum machine_mode mode)
3150{
3151  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3152    return true;
3153
3154  if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3155    return false;
3156
3157  unsigned int i, nelt = GET_MODE_NUNITS (mode);
3158  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3159
3160  for (i = nelt/2; i >= 1; i/=2)
3161    {
3162      calc_vec_perm_mask_for_shift (mode, i, sel);
3163      if (!can_vec_perm_p (mode, false, sel))
3164	return false;
3165    }
3166  return true;
3167}
3168
/* TODO: The vect_model_*_cost functions are closely coupled with the
   corresponding vectorizable_* functions.  Redesign this to avoid
   maintenance issues.  */
3171
3172/* Function vect_model_reduction_cost.
3173
3174   Models cost for a reduction operation, including the vector ops
3175   generated within the strip-mine loop, the initial definition before
3176   the loop, and the epilogue code that must be generated.  */
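
/* As a rough illustration (cost units are target-defined and the numbers
   here are hypothetical): for a 4-element vector add reduction this charges
   NCOPIES vector_stmt costs in the loop body, one scalar_to_vec cost in the
   prologue for broadcasting the initial value, and in the epilogue either
   one vector_stmt plus one vec_to_scalar when a direct reduction operation
   (REDUC_CODE) is available, 2 * log2(4) = 4 vector_stmt plus one
   vec_to_scalar when whole-vector shifts are used, or 4 + 3 = 7 vector_stmt
   when it falls back to element extracts and scalar ops.  */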
3177
3178static bool
3179vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3180			   int ncopies)
3181{
3182  int prologue_cost = 0, epilogue_cost = 0;
3183  enum tree_code code;
3184  optab optab;
3185  tree vectype;
3186  gimple stmt, orig_stmt;
3187  tree reduction_op;
3188  machine_mode mode;
3189  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3190  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3191  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3192
3193  /* Cost of reduction op inside loop.  */
3194  unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3195					stmt_info, 0, vect_body);
3196  stmt = STMT_VINFO_STMT (stmt_info);
3197
3198  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3199    {
3200    case GIMPLE_SINGLE_RHS:
3201      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3202      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3203      break;
3204    case GIMPLE_UNARY_RHS:
3205      reduction_op = gimple_assign_rhs1 (stmt);
3206      break;
3207    case GIMPLE_BINARY_RHS:
3208      reduction_op = gimple_assign_rhs2 (stmt);
3209      break;
3210    case GIMPLE_TERNARY_RHS:
3211      reduction_op = gimple_assign_rhs3 (stmt);
3212      break;
3213    default:
3214      gcc_unreachable ();
3215    }
3216
3217  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3218  if (!vectype)
3219    {
3220      if (dump_enabled_p ())
3221        {
3222	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223			   "unsupported data-type ");
3224          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3225			     TREE_TYPE (reduction_op));
3226          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3227        }
3228      return false;
3229   }
3230
3231  mode = TYPE_MODE (vectype);
3232  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3233
3234  if (!orig_stmt)
3235    orig_stmt = STMT_VINFO_STMT (stmt_info);
3236
3237  code = gimple_assign_rhs_code (orig_stmt);
3238
3239  /* Add in cost for initial definition.  */
3240  prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3241				  stmt_info, 0, vect_prologue);
3242
3243  /* Determine cost of epilogue code.
3244
3245     We have a reduction operator that will reduce the vector in one statement.
3246     Also requires scalar extract.  */
3247
3248  if (!nested_in_vect_loop_p (loop, orig_stmt))
3249    {
3250      if (reduc_code != ERROR_MARK)
3251	{
3252	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3253					  stmt_info, 0, vect_epilogue);
3254	  epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3255					  stmt_info, 0, vect_epilogue);
3256	}
3257      else
3258	{
3259	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3260	  tree bitsize =
3261	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3262	  int element_bitsize = tree_to_uhwi (bitsize);
3263	  int nelements = vec_size_in_bits / element_bitsize;
3264
3265	  optab = optab_for_tree_code (code, vectype, optab_default);
3266
3267	  /* We have a whole vector shift available.  */
3268	  if (VECTOR_MODE_P (mode)
3269	      && optab_handler (optab, mode) != CODE_FOR_nothing
3270	      && have_whole_vector_shift (mode))
3271	    {
3272	      /* Final reduction via vector shifts and the reduction operator.
3273		 Also requires scalar extract.  */
3274	      epilogue_cost += add_stmt_cost (target_cost_data,
3275					      exact_log2 (nelements) * 2,
3276					      vector_stmt, stmt_info, 0,
3277					      vect_epilogue);
3278	      epilogue_cost += add_stmt_cost (target_cost_data, 1,
3279					      vec_to_scalar, stmt_info, 0,
3280					      vect_epilogue);
3281	    }
3282	  else
3283	    /* Use extracts and reduction op for final reduction.  For N
3284	       elements, we have N extracts and N-1 reduction ops.  */
3285	    epilogue_cost += add_stmt_cost (target_cost_data,
3286					    nelements + nelements - 1,
3287					    vector_stmt, stmt_info, 0,
3288					    vect_epilogue);
3289	}
3290    }
3291
3292  if (dump_enabled_p ())
3293    dump_printf (MSG_NOTE,
3294                 "vect_model_reduction_cost: inside_cost = %d, "
3295                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3296                 prologue_cost, epilogue_cost);
3297
3298  return true;
3299}
3300
3301
3302/* Function vect_model_induction_cost.
3303
3304   Models cost for induction operations.  */
3305
3306static void
3307vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3308{
3309  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3310  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3311  unsigned inside_cost, prologue_cost;
3312
3313  /* loop cost for vec_loop.  */
3314  inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3315			       stmt_info, 0, vect_body);
3316
3317  /* prologue cost for vec_init and vec_step.  */
3318  prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3319				 stmt_info, 0, vect_prologue);
3320
3321  if (dump_enabled_p ())
3322    dump_printf_loc (MSG_NOTE, vect_location,
3323                     "vect_model_induction_cost: inside_cost = %d, "
3324                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
3325}
3326
3327
3328/* Function get_initial_def_for_induction
3329
   Input:
   IV_PHI - the phi node of the original scalar induction variable in the
            loop being vectorized; its initial value and evolution (step)
            determine the vector that is returned.
3333
3334   Output:
3335   Return a vector variable, initialized with the first VF values of
3336   the induction variable.  E.g., for an iv with IV_PHI='X' and
3337   evolution S, for a vector of 4 units, we want to return:
3338   [X, X + S, X + 2*S, X + 3*S].  */
3339
3340static tree
3341get_initial_def_for_induction (gimple iv_phi)
3342{
3343  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3344  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3345  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3346  tree vectype;
3347  int nunits;
3348  edge pe = loop_preheader_edge (loop);
3349  struct loop *iv_loop;
3350  basic_block new_bb;
3351  tree new_vec, vec_init, vec_step, t;
3352  tree new_var;
3353  tree new_name;
3354  gimple init_stmt, new_stmt;
3355  gphi *induction_phi;
3356  tree induc_def, vec_def, vec_dest;
3357  tree init_expr, step_expr;
3358  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3359  int i;
3360  int ncopies;
3361  tree expr;
3362  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3363  bool nested_in_vect_loop = false;
3364  gimple_seq stmts = NULL;
3365  imm_use_iterator imm_iter;
3366  use_operand_p use_p;
3367  gimple exit_phi;
3368  edge latch_e;
3369  tree loop_arg;
3370  gimple_stmt_iterator si;
3371  basic_block bb = gimple_bb (iv_phi);
3372  tree stepvectype;
3373  tree resvectype;
3374
3375  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3376  if (nested_in_vect_loop_p (loop, iv_phi))
3377    {
3378      nested_in_vect_loop = true;
3379      iv_loop = loop->inner;
3380    }
3381  else
3382    iv_loop = loop;
3383  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3384
3385  latch_e = loop_latch_edge (iv_loop);
3386  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3387
3388  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3389  gcc_assert (step_expr != NULL_TREE);
3390
3391  pe = loop_preheader_edge (iv_loop);
3392  init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3393				     loop_preheader_edge (iv_loop));
3394
3395  vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3396  resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3397  gcc_assert (vectype);
3398  nunits = TYPE_VECTOR_SUBPARTS (vectype);
3399  ncopies = vf / nunits;
3400
3401  gcc_assert (phi_info);
3402  gcc_assert (ncopies >= 1);
3403
3404  /* Convert the step to the desired type.  */
3405  step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3406						  step_expr),
3407				    &stmts, true, NULL_TREE);
3408  if (stmts)
3409    {
3410      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3411      gcc_assert (!new_bb);
3412    }
3413
3414  /* Find the first insertion point in the BB.  */
3415  si = gsi_after_labels (bb);
3416
3417  /* Create the vector that holds the initial_value of the induction.  */
3418  if (nested_in_vect_loop)
3419    {
3420      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3421	 been created during vectorization of previous stmts.  We obtain it
3422	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3423      vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3424      /* If the initial value is not of proper type, convert it.  */
3425      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3426	{
3427	  new_stmt
3428	    = gimple_build_assign (vect_get_new_vect_var (vectype,
3429							  vect_simple_var,
3430							  "vec_iv_"),
3431				   VIEW_CONVERT_EXPR,
3432				   build1 (VIEW_CONVERT_EXPR, vectype,
3433					   vec_init));
3434	  vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3435	  gimple_assign_set_lhs (new_stmt, vec_init);
3436	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3437						 new_stmt);
3438	  gcc_assert (!new_bb);
3439	  set_vinfo_for_stmt (new_stmt,
3440			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3441	}
3442    }
3443  else
3444    {
3445      vec<constructor_elt, va_gc> *v;
3446
3447      /* iv_loop is the loop to be vectorized. Create:
3448	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3449      new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3450				       vect_scalar_var, "var_");
3451      new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3452						     init_expr),
3453				       &stmts, false, new_var);
3454      if (stmts)
3455	{
3456	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3457	  gcc_assert (!new_bb);
3458	}
3459
3460      vec_alloc (v, nunits);
3461      bool constant_p = is_gimple_min_invariant (new_name);
3462      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3463      for (i = 1; i < nunits; i++)
3464	{
3465	  /* Create: new_name_i = new_name + step_expr  */
3466	  new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3467				  new_name, step_expr);
3468	  if (!is_gimple_min_invariant (new_name))
3469	    {
3470	      init_stmt = gimple_build_assign (new_var, new_name);
3471	      new_name = make_ssa_name (new_var, init_stmt);
3472	      gimple_assign_set_lhs (init_stmt, new_name);
3473	      new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3474	      gcc_assert (!new_bb);
3475	      if (dump_enabled_p ())
3476		{
3477		  dump_printf_loc (MSG_NOTE, vect_location,
3478				   "created new init_stmt: ");
3479		  dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3480                  dump_printf (MSG_NOTE, "\n");
3481		}
3482	      constant_p = false;
3483	    }
3484	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3485	}
3486      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3487      if (constant_p)
3488	new_vec = build_vector_from_ctor (vectype, v);
3489      else
3490	new_vec = build_constructor (vectype, v);
3491      vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3492    }
3493
3494
3495  /* Create the vector that holds the step of the induction.  */
3496  if (nested_in_vect_loop)
3497    /* iv_loop is nested in the loop to be vectorized. Generate:
3498       vec_step = [S, S, S, S]  */
3499    new_name = step_expr;
3500  else
3501    {
3502      /* iv_loop is the loop to be vectorized. Generate:
3503	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3504      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3505	{
3506	  expr = build_int_cst (integer_type_node, vf);
3507	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3508	}
3509      else
3510	expr = build_int_cst (TREE_TYPE (step_expr), vf);
3511      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3512			      expr, step_expr);
3513      if (TREE_CODE (step_expr) == SSA_NAME)
3514	new_name = vect_init_vector (iv_phi, new_name,
3515				     TREE_TYPE (step_expr), NULL);
3516    }
3517
3518  t = unshare_expr (new_name);
3519  gcc_assert (CONSTANT_CLASS_P (new_name)
3520	      || TREE_CODE (new_name) == SSA_NAME);
3521  stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3522  gcc_assert (stepvectype);
3523  new_vec = build_vector_from_val (stepvectype, t);
3524  vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3525
3526
3527  /* Create the following def-use cycle:
3528     loop prolog:
3529         vec_init = ...
3530	 vec_step = ...
3531     loop:
3532         vec_iv = PHI <vec_init, vec_loop>
3533         ...
3534         STMT
3535         ...
3536         vec_loop = vec_iv + vec_step;  */
3537
3538  /* Create the induction-phi that defines the induction-operand.  */
3539  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3540  induction_phi = create_phi_node (vec_dest, iv_loop->header);
3541  set_vinfo_for_stmt (induction_phi,
3542		      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3543  induc_def = PHI_RESULT (induction_phi);
3544
3545  /* Create the iv update inside the loop  */
3546  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3547  vec_def = make_ssa_name (vec_dest, new_stmt);
3548  gimple_assign_set_lhs (new_stmt, vec_def);
3549  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3550  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3551                                                   NULL));
3552
3553  /* Set the arguments of the phi node:  */
3554  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3555  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3556	       UNKNOWN_LOCATION);
3557
3558
  /* If the vectorization factor (VF) is bigger than the number of elements
     that we can fit in a vectype (nunits), we have to generate more than one
     vector stmt, i.e., we need to "unroll" the vector stmt by a factor of
     VF/nunits.  For more details see the documentation in
     vectorizable_operation.  */
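  /* For example, with a hypothetical VF of 8 and a 4-element vectype,
     ncopies is 2: besides the update created above, one more copy is
     generated below, each copy advancing the previous one by a step vector
     of [nunits*S, nunits*S, nunits*S, nunits*S].  */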
3564
3565  if (ncopies > 1)
3566    {
3567      stmt_vec_info prev_stmt_vinfo;
3568      /* FORNOW. This restriction should be relaxed.  */
3569      gcc_assert (!nested_in_vect_loop);
3570
3571      /* Create the vector that holds the step of the induction.  */
3572      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3573	{
3574	  expr = build_int_cst (integer_type_node, nunits);
3575	  expr = fold_convert (TREE_TYPE (step_expr), expr);
3576	}
3577      else
3578	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3579      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3580			      expr, step_expr);
3581      if (TREE_CODE (step_expr) == SSA_NAME)
3582	new_name = vect_init_vector (iv_phi, new_name,
3583				     TREE_TYPE (step_expr), NULL);
3584      t = unshare_expr (new_name);
3585      gcc_assert (CONSTANT_CLASS_P (new_name)
3586		  || TREE_CODE (new_name) == SSA_NAME);
3587      new_vec = build_vector_from_val (stepvectype, t);
3588      vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3589
3590      vec_def = induc_def;
3591      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3592      for (i = 1; i < ncopies; i++)
3593	{
3594	  /* vec_i = vec_prev + vec_step  */
3595	  new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3596					  vec_def, vec_step);
3597	  vec_def = make_ssa_name (vec_dest, new_stmt);
3598	  gimple_assign_set_lhs (new_stmt, vec_def);
3599
3600	  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3601	  if (!useless_type_conversion_p (resvectype, vectype))
3602	    {
3603	      new_stmt
3604		= gimple_build_assign
3605			(vect_get_new_vect_var (resvectype, vect_simple_var,
3606						"vec_iv_"),
3607			 VIEW_CONVERT_EXPR,
3608			 build1 (VIEW_CONVERT_EXPR, resvectype,
3609				 gimple_assign_lhs (new_stmt)));
3610	      gimple_assign_set_lhs (new_stmt,
3611				     make_ssa_name
3612				       (gimple_assign_lhs (new_stmt), new_stmt));
3613	      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3614	    }
3615	  set_vinfo_for_stmt (new_stmt,
3616			      new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3617	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3618	  prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3619	}
3620    }
3621
3622  if (nested_in_vect_loop)
3623    {
3624      /* Find the loop-closed exit-phi of the induction, and record
3625         the final vector of induction results:  */
3626      exit_phi = NULL;
3627      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3628        {
3629	  gimple use_stmt = USE_STMT (use_p);
3630	  if (is_gimple_debug (use_stmt))
3631	    continue;
3632
3633	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3634	    {
3635	      exit_phi = use_stmt;
3636	      break;
3637	    }
3638        }
3639      if (exit_phi)
3640	{
3641	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3642	  /* FORNOW. Currently not supporting the case that an inner-loop induction
3643	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
3644	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3645		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
3646
3647	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3648	  if (dump_enabled_p ())
3649	    {
3650	      dump_printf_loc (MSG_NOTE, vect_location,
3651			       "vector of inductions after inner-loop:");
3652	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3653              dump_printf (MSG_NOTE, "\n");
3654	    }
3655	}
3656    }
3657
3658
3659  if (dump_enabled_p ())
3660    {
3661      dump_printf_loc (MSG_NOTE, vect_location,
3662		       "transform induction: created def-use cycle: ");
3663      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3664      dump_printf (MSG_NOTE, "\n");
3665      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3666			SSA_NAME_DEF_STMT (vec_def), 0);
3667      dump_printf (MSG_NOTE, "\n");
3668    }
3669
3670  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3671  if (!useless_type_conversion_p (resvectype, vectype))
3672    {
3673      new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3674							     vect_simple_var,
3675							     "vec_iv_"),
3676				      VIEW_CONVERT_EXPR,
3677				      build1 (VIEW_CONVERT_EXPR, resvectype,
3678					      induc_def));
3679      induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3680      gimple_assign_set_lhs (new_stmt, induc_def);
3681      si = gsi_after_labels (bb);
3682      gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3683      set_vinfo_for_stmt (new_stmt,
3684			  new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3685      STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3686	= STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3687    }
3688
3689  return induc_def;
3690}
3691
3692
3693/* Function get_initial_def_for_reduction
3694
3695   Input:
3696   STMT - a stmt that performs a reduction operation in the loop.
3697   INIT_VAL - the initial value of the reduction variable
3698
3699   Output:
3700   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3701        of the reduction (used for adjusting the epilog - see below).
3702   Return a vector variable, initialized according to the operation that STMT
3703        performs. This vector will be used as the initial value of the
3704        vector of partial results.
3705
3706   Option1 (adjust in epilog): Initialize the vector as follows:
3707     add/bit or/xor:    [0,0,...,0,0]
3708     mult/bit and:      [1,1,...,1,1]
3709     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3710   and when necessary (e.g. add/mult case) let the caller know
3711   that it needs to adjust the result by init_val.
3712
3713   Option2: Initialize the vector as follows:
3714     add/bit or/xor:    [init_val,0,0,...,0]
3715     mult/bit and:      [init_val,1,1,...,1]
3716     min/max/cond_expr: [init_val,init_val,...,init_val]
3717   and no adjustments are needed.
3718
3719   For example, for the following code:
3720
3721   s = init_val;
3722   for (i=0;i<n;i++)
3723     s = s + a[i];
3724
3725   STMT is 's = s + a[i]', and the reduction variable is 's'.
3726   For a vector of 4 units, we want to return either [0,0,0,init_val],
3727   or [0,0,0,0] and let the caller know that it needs to adjust
3728   the result at the end by 'init_val'.
3729
   FORNOW, when ADJUSTMENT_DEF is not NULL we use the 'adjust in epilog'
   scheme (Option1), because this way the initialization vector is simpler
   (same element in all entries); otherwise we use Option2.
3733
3734   A cost model should help decide between these two schemes.  */
3735
3736tree
3737get_initial_def_for_reduction (gimple stmt, tree init_val,
3738                               tree *adjustment_def)
3739{
3740  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3741  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3742  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3743  tree scalar_type = TREE_TYPE (init_val);
3744  tree vectype = get_vectype_for_scalar_type (scalar_type);
3745  int nunits;
3746  enum tree_code code = gimple_assign_rhs_code (stmt);
3747  tree def_for_init;
3748  tree init_def;
3749  tree *elts;
3750  int i;
3751  bool nested_in_vect_loop = false;
3752  tree init_value;
3753  REAL_VALUE_TYPE real_init_val = dconst0;
3754  int int_init_val = 0;
3755  gimple def_stmt = NULL;
3756
3757  gcc_assert (vectype);
3758  nunits = TYPE_VECTOR_SUBPARTS (vectype);
3759
3760  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3761	      || SCALAR_FLOAT_TYPE_P (scalar_type));
3762
3763  if (nested_in_vect_loop_p (loop, stmt))
3764    nested_in_vect_loop = true;
3765  else
3766    gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3767
3768  /* In case of double reduction we only create a vector variable to be put
3769     in the reduction phi node.  The actual statement creation is done in
3770     vect_create_epilog_for_reduction.  */
3771  if (adjustment_def && nested_in_vect_loop
3772      && TREE_CODE (init_val) == SSA_NAME
3773      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3774      && gimple_code (def_stmt) == GIMPLE_PHI
3775      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3776      && vinfo_for_stmt (def_stmt)
3777      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3778          == vect_double_reduction_def)
3779    {
3780      *adjustment_def = NULL;
3781      return vect_create_destination_var (init_val, vectype);
3782    }
3783
3784  if (TREE_CONSTANT (init_val))
3785    {
3786      if (SCALAR_FLOAT_TYPE_P (scalar_type))
3787        init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3788      else
3789        init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3790    }
3791  else
3792    init_value = init_val;
3793
  /* In case of a nested reduction do not use an adjustment def, as that
     case is not handled correctly by the epilogue generation when ncopies
     is not one.  */
3797  if (adjustment_def && nested_in_vect_loop)
3798    {
3799      *adjustment_def = NULL;
3800      return vect_get_vec_def_for_operand (init_val, stmt, NULL);
3801    }
3802
3803  switch (code)
3804    {
3805      case WIDEN_SUM_EXPR:
3806      case DOT_PROD_EXPR:
3807      case SAD_EXPR:
3808      case PLUS_EXPR:
3809      case MINUS_EXPR:
3810      case BIT_IOR_EXPR:
3811      case BIT_XOR_EXPR:
3812      case MULT_EXPR:
3813      case BIT_AND_EXPR:
        /* ADJUSTMENT_DEF is NULL when called from
           vect_create_epilog_for_reduction to vectorize double reduction.  */
3816        if (adjustment_def)
3817	  *adjustment_def = init_val;
3818
3819        if (code == MULT_EXPR)
3820          {
3821            real_init_val = dconst1;
3822            int_init_val = 1;
3823          }
3824
3825        if (code == BIT_AND_EXPR)
3826          int_init_val = -1;
3827
3828        if (SCALAR_FLOAT_TYPE_P (scalar_type))
3829          def_for_init = build_real (scalar_type, real_init_val);
3830        else
3831          def_for_init = build_int_cst (scalar_type, int_init_val);
3832
        /* Create a vector of '0' or '1' for all elements except the first.  */
3834	elts = XALLOCAVEC (tree, nunits);
3835        for (i = nunits - 2; i >= 0; --i)
3836	  elts[i + 1] = def_for_init;
3837
3838        /* Option1: the first element is '0' or '1' as well.  */
3839        if (adjustment_def)
3840          {
3841	    elts[0] = def_for_init;
3842            init_def = build_vector (vectype, elts);
3843            break;
3844          }
3845
3846        /* Option2: the first element is INIT_VAL.  */
3847	elts[0] = init_val;
3848        if (TREE_CONSTANT (init_val))
3849          init_def = build_vector (vectype, elts);
3850        else
3851	  {
3852	    vec<constructor_elt, va_gc> *v;
3853	    vec_alloc (v, nunits);
3854	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3855	    for (i = 1; i < nunits; ++i)
3856	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3857	    init_def = build_constructor (vectype, v);
3858	  }
3859
3860        break;
3861
3862      case MIN_EXPR:
3863      case MAX_EXPR:
3864      case COND_EXPR:
3865        if (adjustment_def)
3866          {
3867            *adjustment_def = NULL_TREE;
3868            init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3869            break;
3870          }
3871
3872	init_def = build_vector_from_val (vectype, init_value);
3873        break;
3874
3875      default:
3876        gcc_unreachable ();
3877    }
3878
3879  return init_def;
3880}
3881
3882/* Function vect_create_epilog_for_reduction
3883
3884   Create code at the loop-epilog to finalize the result of a reduction
3885   computation.
3886
3887   VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3888     reduction statements.
3889   STMT is the scalar reduction stmt that is being vectorized.
3890   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3891     number of elements that we can fit in a vectype (nunits).  In this case
3892     we have to generate more than one vector stmt - i.e - we need to "unroll"
3893     the vector stmt by a factor VF/nunits.  For more details see documentation
3894     in vectorizable_operation.
3895   REDUC_CODE is the tree-code for the epilog reduction.
3896   REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3897     computation.
3898   REDUC_INDEX is the index of the operand in the right hand side of the
3899     statement that is defined by REDUCTION_PHI.
3900   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3901   SLP_NODE is an SLP node containing a group of reduction statements. The
3902     first one in this group is STMT.
3903
3904   This function:
3905   1. Creates the reduction def-use cycles: sets the arguments for
3906      REDUCTION_PHIS:
3907      The loop-entry argument is the vectorized initial-value of the reduction.
3908      The loop-latch argument is taken from VECT_DEFS - the vector of partial
3909      sums.
3910   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3911      by applying the operation specified by REDUC_CODE if available, or by
3912      other means (whole-vector shifts or a scalar loop).
3913      The function also creates a new phi node at the loop exit to preserve
3914      loop-closed form, as illustrated below.
3915
3916     The flow at the entry to this function:
3917
3918        loop:
3919          vec_def = phi <null, null>            # REDUCTION_PHI
3920          VECT_DEF = vector_stmt                # vectorized form of STMT
3921          s_loop = scalar_stmt                  # (scalar) STMT
3922        loop_exit:
3923          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3924          use <s_out0>
3925          use <s_out0>
3926
3927     The above is transformed by this function into:
3928
3929        loop:
3930          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3931          VECT_DEF = vector_stmt                # vectorized form of STMT
3932          s_loop = scalar_stmt                  # (scalar) STMT
3933        loop_exit:
3934          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3935          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3936          v_out2 = reduce <v_out1>
3937          s_out3 = extract_field <v_out2, 0>
3938          s_out4 = adjust_result <s_out3>
3939          use <s_out4>
3940          use <s_out4>
3941*/
3942
3943static void
3944vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3945				  int ncopies, enum tree_code reduc_code,
3946				  vec<gimple> reduction_phis,
3947                                  int reduc_index, bool double_reduc,
3948                                  slp_tree slp_node)
3949{
3950  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3951  stmt_vec_info prev_phi_info;
3952  tree vectype;
3953  machine_mode mode;
3954  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3955  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3956  basic_block exit_bb;
3957  tree scalar_dest;
3958  tree scalar_type;
3959  gimple new_phi = NULL, phi;
3960  gimple_stmt_iterator exit_gsi;
3961  tree vec_dest;
3962  tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3963  gimple epilog_stmt = NULL;
3964  enum tree_code code = gimple_assign_rhs_code (stmt);
3965  gimple exit_phi;
3966  tree bitsize;
3967  tree adjustment_def = NULL;
3968  tree vec_initial_def = NULL;
3969  tree reduction_op, expr, def;
3970  tree orig_name, scalar_result;
3971  imm_use_iterator imm_iter, phi_imm_iter;
3972  use_operand_p use_p, phi_use_p;
3973  gimple use_stmt, orig_stmt, reduction_phi = NULL;
3974  bool nested_in_vect_loop = false;
3975  auto_vec<gimple> new_phis;
3976  auto_vec<gimple> inner_phis;
3977  enum vect_def_type dt = vect_unknown_def_type;
3978  int j, i;
3979  auto_vec<tree> scalar_results;
3980  unsigned int group_size = 1, k, ratio;
3981  auto_vec<tree> vec_initial_defs;
3982  auto_vec<gimple> phis;
3983  bool slp_reduc = false;
3984  tree new_phi_result;
3985  gimple inner_phi = NULL;
3986
3987  if (slp_node)
3988    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3989
3990  if (nested_in_vect_loop_p (loop, stmt))
3991    {
3992      outer_loop = loop;
3993      loop = loop->inner;
3994      nested_in_vect_loop = true;
3995      gcc_assert (!slp_node);
3996    }
3997
3998  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3999    {
4000    case GIMPLE_SINGLE_RHS:
4001      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
4002		  == ternary_op);
4003      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
4004      break;
4005    case GIMPLE_UNARY_RHS:
4006      reduction_op = gimple_assign_rhs1 (stmt);
4007      break;
4008    case GIMPLE_BINARY_RHS:
4009      reduction_op = reduc_index ?
4010                     gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
4011      break;
4012    case GIMPLE_TERNARY_RHS:
4013      reduction_op = gimple_op (stmt, reduc_index + 1);
4014      break;
4015    default:
4016      gcc_unreachable ();
4017    }
4018
4019  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4020  gcc_assert (vectype);
4021  mode = TYPE_MODE (vectype);
4022
4023  /* 1. Create the reduction def-use cycle:
4024     Set the arguments of REDUCTION_PHIS, i.e., transform
4025
4026        loop:
4027          vec_def = phi <null, null>            # REDUCTION_PHI
4028          VECT_DEF = vector_stmt                # vectorized form of STMT
4029          ...
4030
4031     into:
4032
4033        loop:
4034          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4035          VECT_DEF = vector_stmt                # vectorized form of STMT
4036          ...
4037
4038     (in case of SLP, do it for all the phis). */
4039
4040  /* Get the loop-entry arguments.  */
4041  enum vect_def_type initial_def_dt = vect_unknown_def_type;
4042  if (slp_node)
4043    vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4044                       NULL, slp_node, reduc_index);
4045  else
4046    {
4047      /* Get at the scalar def before the loop, that defines the initial value
4048	 of the reduction variable.  */
4049      gimple def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4050      tree initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4051						loop_preheader_edge (loop));
4052      vect_is_simple_use (initial_def, NULL, loop_vinfo, NULL,
4053			  &def_stmt, &initial_def, &initial_def_dt);
      /* For a reduction, vect_get_vec_def_for_operand returns the vector
         initial definition built from the scalar def before the loop (the
         initial value of the reduction variable) and fills ADJUSTMENT_DEF
         when the epilogue needs to adjust the final result.  */
4057      vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4058                                                      &adjustment_def);
4059      vec_initial_defs.create (1);
4060      vec_initial_defs.quick_push (vec_initial_def);
4061    }
4062
4063  /* Set phi nodes arguments.  */
4064  FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4065    {
4066      tree vec_init_def, def;
4067      gimple_seq stmts;
4068      vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4069					   true, NULL_TREE);
4070      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4071      def = vect_defs[i];
4072      for (j = 0; j < ncopies; j++)
4073        {
4074	  if (j != 0)
4075	    {
4076	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4077	      if (nested_in_vect_loop)
4078		vec_init_def
4079		  = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4080						    vec_init_def);
4081	    }
4082
4083          /* Set the loop-entry arg of the reduction-phi.  */
4084          add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4085		       loop_preheader_edge (loop), UNKNOWN_LOCATION);
4086
4087          /* Set the loop-latch arg for the reduction-phi.  */
4088          if (j > 0)
4089            def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4090
4091          add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4092		       UNKNOWN_LOCATION);
4093
4094          if (dump_enabled_p ())
4095            {
4096              dump_printf_loc (MSG_NOTE, vect_location,
4097			       "transform reduction: created def-use cycle: ");
4098              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4099              dump_printf (MSG_NOTE, "\n");
4100              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4101              dump_printf (MSG_NOTE, "\n");
4102            }
4103        }
4104    }
4105
4106  /* 2. Create epilog code.
4107        The reduction epilog code operates across the elements of the vector
4108        of partial results computed by the vectorized loop.
4109        The reduction epilog code consists of:
4110
4111        step 1: compute the scalar result in a vector (v_out2)
4112        step 2: extract the scalar result (s_out3) from the vector (v_out2)
4113        step 3: adjust the scalar result (s_out3) if needed.
4114
        Step 1 can be accomplished using one of the following three schemes:
4116          (scheme 1) using reduc_code, if available.
4117          (scheme 2) using whole-vector shifts, if available.
4118          (scheme 3) using a scalar loop. In this case steps 1+2 above are
4119                     combined.
4120
4121          The overall epilog code looks like this:
4122
4123          s_out0 = phi <s_loop>         # original EXIT_PHI
4124          v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4125          v_out2 = reduce <v_out1>              # step 1
4126          s_out3 = extract_field <v_out2, 0>    # step 2
4127          s_out4 = adjust_result <s_out3>       # step 3
4128
4129          (step 3 is optional, and steps 1 and 2 may be combined).
4130          Lastly, the uses of s_out0 are replaced by s_out4.  */
4131
4132
4133  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4134         v_out1 = phi <VECT_DEF>
4135         Store them in NEW_PHIS.  */
4136
4137  exit_bb = single_exit (loop)->dest;
4138  prev_phi_info = NULL;
4139  new_phis.create (vect_defs.length ());
4140  FOR_EACH_VEC_ELT (vect_defs, i, def)
4141    {
4142      for (j = 0; j < ncopies; j++)
4143        {
4144	  tree new_def = copy_ssa_name (def);
4145          phi = create_phi_node (new_def, exit_bb);
4146          set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4147          if (j == 0)
4148            new_phis.quick_push (phi);
4149          else
4150	    {
4151	      def = vect_get_vec_def_for_stmt_copy (dt, def);
4152	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4153	    }
4154
4155          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4156          prev_phi_info = vinfo_for_stmt (phi);
4157        }
4158    }
4159
4160  /* The epilogue is created for the outer-loop, i.e., for the loop being
4161     vectorized.  Create exit phis for the outer loop.  */
4162  if (double_reduc)
4163    {
4164      loop = outer_loop;
4165      exit_bb = single_exit (loop)->dest;
4166      inner_phis.create (vect_defs.length ());
4167      FOR_EACH_VEC_ELT (new_phis, i, phi)
4168	{
4169	  tree new_result = copy_ssa_name (PHI_RESULT (phi));
4170	  gphi *outer_phi = create_phi_node (new_result, exit_bb);
4171	  SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4172			   PHI_RESULT (phi));
4173	  set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4174							    loop_vinfo, NULL));
4175	  inner_phis.quick_push (phi);
4176	  new_phis[i] = outer_phi;
4177	  prev_phi_info = vinfo_for_stmt (outer_phi);
4178          while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4179            {
4180	      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4181	      new_result = copy_ssa_name (PHI_RESULT (phi));
4182	      outer_phi = create_phi_node (new_result, exit_bb);
4183	      SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4184			       PHI_RESULT (phi));
4185	      set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4186							loop_vinfo, NULL));
4187	      STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4188	      prev_phi_info = vinfo_for_stmt (outer_phi);
4189	    }
4190	}
4191    }
4192
4193  exit_gsi = gsi_after_labels (exit_bb);
4194
4195  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4196         (i.e. when reduc_code is not available) and in the final adjustment
4197	 code (if needed).  Also get the original scalar reduction variable as
4198         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4199         represents a reduction pattern), the tree-code and scalar-def are
4200         taken from the original stmt that the pattern-stmt (STMT) replaces.
4201         Otherwise (it is a regular reduction) - the tree-code and scalar-def
4202         are taken from STMT.  */
4203
4204  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4205  if (!orig_stmt)
4206    {
4207      /* Regular reduction  */
4208      orig_stmt = stmt;
4209    }
4210  else
4211    {
4212      /* Reduction pattern  */
4213      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4214      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4215      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4216    }
4217
4218  code = gimple_assign_rhs_code (orig_stmt);
4219  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4220     partial results are added and not subtracted.  */
4221  if (code == MINUS_EXPR)
4222    code = PLUS_EXPR;
4223
4224  scalar_dest = gimple_assign_lhs (orig_stmt);
4225  scalar_type = TREE_TYPE (scalar_dest);
4226  scalar_results.create (group_size);
4227  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4228  bitsize = TYPE_SIZE (scalar_type);
4229
4230  /* In case this is a reduction in an inner-loop while vectorizing an outer
4231     loop - we don't need to extract a single scalar result at the end of the
4232     inner-loop (unless it is double reduction, i.e., the use of reduction is
4233     outside the outer-loop).  The final vector of partial results will be used
4234     in the vectorized outer-loop, or reduced to a scalar result at the end of
4235     the outer-loop.  */
4236  if (nested_in_vect_loop && !double_reduc)
4237    goto vect_finalize_reduction;
4238
4239  /* SLP reduction without reduction chain, e.g.,
4240     # a1 = phi <a2, a0>
4241     # b1 = phi <b2, b0>
4242     a2 = operation (a1)
4243     b2 = operation (b1)  */
4244  slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4245
4246  /* In case of reduction chain, e.g.,
4247     # a1 = phi <a3, a0>
4248     a2 = operation (a1)
4249     a3 = operation (a2),
4250
4251     we may end up with more than one vector result.  Here we reduce them to
4252     one vector.  */
4253  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4254    {
4255      tree first_vect = PHI_RESULT (new_phis[0]);
4256      tree tmp;
4257      gassign *new_vec_stmt = NULL;
4258
4259      vec_dest = vect_create_destination_var (scalar_dest, vectype);
4260      for (k = 1; k < new_phis.length (); k++)
4261        {
4262          gimple next_phi = new_phis[k];
4263          tree second_vect = PHI_RESULT (next_phi);
4264
4265          tmp = build2 (code, vectype,  first_vect, second_vect);
4266          new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4267          first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4268          gimple_assign_set_lhs (new_vec_stmt, first_vect);
4269          gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4270        }
4271
4272      new_phi_result = first_vect;
4273      if (new_vec_stmt)
4274        {
4275          new_phis.truncate (0);
4276          new_phis.safe_push (new_vec_stmt);
4277        }
4278    }
4279  else
4280    new_phi_result = PHI_RESULT (new_phis[0]);
4281
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  In SLP we simply need to extract all the elements from the
         vector (without reducing them), so we use the scalar extracts of
         scheme 3.  */
4285  if (reduc_code != ERROR_MARK && !slp_reduc)
4286    {
4287      tree tmp;
4288      tree vec_elem_type;
4289
4290      /*** Case 1:  Create:
4291           v_out2 = reduc_expr <v_out1>  */
4292
4293      if (dump_enabled_p ())
4294        dump_printf_loc (MSG_NOTE, vect_location,
4295			 "Reduce using direct vector reduction.\n");
4296
4297      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4298      if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4299	{
4300          tree tmp_dest =
4301	      vect_create_destination_var (scalar_dest, vec_elem_type);
4302	  tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4303	  epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4304	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4305	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4306	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4307
4308	  tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4309	}
4310      else
4311	tmp = build1 (reduc_code, scalar_type, new_phi_result);
4312      epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4313      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4314      gimple_assign_set_lhs (epilog_stmt, new_temp);
4315      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4316      scalar_results.safe_push (new_temp);
4317    }
4318  else
4319    {
4320      bool reduce_with_shift = have_whole_vector_shift (mode);
4321      int element_bitsize = tree_to_uhwi (bitsize);
4322      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4323      tree vec_temp;
4324
4325      /* Regardless of whether we have a whole vector shift, if we're
4326         emulating the operation via tree-vect-generic, we don't want
4327         to use it.  Only the first round of the reduction is likely
4328         to still be profitable via emulation.  */
4329      /* ??? It might be better to emit a reduction tree code here, so that
4330         tree-vect-generic can expand the first round via bit tricks.  */
4331      if (!VECTOR_MODE_P (mode))
4332        reduce_with_shift = false;
4333      else
4334        {
4335          optab optab = optab_for_tree_code (code, vectype, optab_default);
4336          if (optab_handler (optab, mode) == CODE_FOR_nothing)
4337            reduce_with_shift = false;
4338        }
4339
4340      if (reduce_with_shift && !slp_reduc)
4341        {
4342          int nelements = vec_size_in_bits / element_bitsize;
4343          unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4344
4345          int elt_offset;
4346
4347          tree zero_vec = build_zero_cst (vectype);
4348          /*** Case 2: Create:
4349             for (offset = nelements/2; offset >= 1; offset/=2)
4350                {
4351                  Create:  va' = vec_shift <va, offset>
4352                  Create:  va = vop <va, va'>
4353                }  */
4354
4355          tree rhs;
4356
4357          if (dump_enabled_p ())
4358            dump_printf_loc (MSG_NOTE, vect_location,
4359			     "Reduce using vector shifts\n");
4360
4361          vec_dest = vect_create_destination_var (scalar_dest, vectype);
4362          new_temp = new_phi_result;
4363          for (elt_offset = nelements / 2;
4364               elt_offset >= 1;
4365               elt_offset /= 2)
4366            {
4367              calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4368              tree mask = vect_gen_perm_mask_any (vectype, sel);
4369	      epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4370						 new_temp, zero_vec, mask);
4371              new_name = make_ssa_name (vec_dest, epilog_stmt);
4372              gimple_assign_set_lhs (epilog_stmt, new_name);
4373              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4374
4375	      epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4376						 new_temp);
4377              new_temp = make_ssa_name (vec_dest, epilog_stmt);
4378              gimple_assign_set_lhs (epilog_stmt, new_temp);
4379              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4380            }
4381
4382	  /* 2.4  Extract the final scalar result.  Create:
4383	     s_out3 = extract_field <v_out2, bitpos>  */
4384
4385	  if (dump_enabled_p ())
4386	    dump_printf_loc (MSG_NOTE, vect_location,
4387			     "extract scalar result\n");
4388
4389	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4390			bitsize, bitsize_zero_node);
4391	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4392	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4393	  gimple_assign_set_lhs (epilog_stmt, new_temp);
4394	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4395	  scalar_results.safe_push (new_temp);
4396        }
4397      else
4398        {
4399          /*** Case 3: Create:
4400             s = extract_field <v_out2, 0>
4401             for (offset = element_size;
4402                  offset < vector_size;
4403                  offset += element_size;)
4404               {
4405                 Create:  s' = extract_field <v_out2, offset>
4406                 Create:  s = op <s, s'>  // For non SLP cases
4407               }  */
4408
4409          if (dump_enabled_p ())
4410            dump_printf_loc (MSG_NOTE, vect_location,
4411			     "Reduce using scalar code.\n");
4412
4413          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4414          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4415            {
4416              int bit_offset;
4417              if (gimple_code (new_phi) == GIMPLE_PHI)
4418                vec_temp = PHI_RESULT (new_phi);
4419              else
4420                vec_temp = gimple_assign_lhs (new_phi);
4421              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4422                            bitsize_zero_node);
4423              epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4424              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4425              gimple_assign_set_lhs (epilog_stmt, new_temp);
4426              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4427
              /* In SLP we don't need to apply the reduction operation, so we
                 just collect the s' values in SCALAR_RESULTS.  */
4430              if (slp_reduc)
4431                scalar_results.safe_push (new_temp);
4432
4433              for (bit_offset = element_bitsize;
4434                   bit_offset < vec_size_in_bits;
4435                   bit_offset += element_bitsize)
4436                {
4437                  tree bitpos = bitsize_int (bit_offset);
4438                  tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4439                                     bitsize, bitpos);
4440
4441                  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4442                  new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4443                  gimple_assign_set_lhs (epilog_stmt, new_name);
4444                  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4445
4446                  if (slp_reduc)
4447                    {
                      /* In SLP we don't need to apply the reduction
                         operation, so we just collect the s' values in
                         SCALAR_RESULTS.  */
4450                      new_temp = new_name;
4451                      scalar_results.safe_push (new_name);
4452                    }
4453                  else
4454                    {
4455		      epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4456							 new_name, new_temp);
4457                      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4458                      gimple_assign_set_lhs (epilog_stmt, new_temp);
4459                      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4460                    }
4461                }
4462            }
4463
          /* The only case where we need to reduce scalar results in SLP is
             unrolling.  If the size of SCALAR_RESULTS is greater than
             GROUP_SIZE, we reduce them by combining elements modulo
             GROUP_SIZE.  */
4468          if (slp_reduc)
4469            {
4470              tree res, first_res, new_res;
4471              gimple new_stmt;
4472
4473              /* Reduce multiple scalar results in case of SLP unrolling.  */
4474              for (j = group_size; scalar_results.iterate (j, &res);
4475                   j++)
4476                {
4477                  first_res = scalar_results[j % group_size];
4478		  new_stmt = gimple_build_assign (new_scalar_dest, code,
4479						  first_res, res);
4480                  new_res = make_ssa_name (new_scalar_dest, new_stmt);
4481                  gimple_assign_set_lhs (new_stmt, new_res);
4482                  gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4483                  scalar_results[j % group_size] = new_res;
4484                }
4485            }
4486          else
4487            /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4488            scalar_results.safe_push (new_temp);
4489        }
4490    }
4491
4492vect_finalize_reduction:
4493
4494  if (double_reduc)
4495    loop = loop->inner;
4496
  /* 2.5 Adjust the final result by the initial value of the reduction
	 variable.  (When such adjustment is not needed, ADJUSTMENT_DEF is
	 NULL.)  For example, if code is PLUS we create:
	 new_temp = loop_exit_def + adjustment_def  */
4501
4502  if (adjustment_def)
4503    {
4504      gcc_assert (!slp_reduc);
4505      if (nested_in_vect_loop)
4506	{
4507          new_phi = new_phis[0];
4508	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4509	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4510	  new_dest = vect_create_destination_var (scalar_dest, vectype);
4511	}
4512      else
4513	{
4514          new_temp = scalar_results[0];
4515	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4516	  expr = build2 (code, scalar_type, new_temp, adjustment_def);
4517	  new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4518	}
4519
4520      epilog_stmt = gimple_build_assign (new_dest, expr);
4521      new_temp = make_ssa_name (new_dest, epilog_stmt);
4522      gimple_assign_set_lhs (epilog_stmt, new_temp);
4523      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4524      if (nested_in_vect_loop)
4525        {
4526          set_vinfo_for_stmt (epilog_stmt,
4527                              new_stmt_vec_info (epilog_stmt, loop_vinfo,
4528                                                 NULL));
4529          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4530                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4531
4532          if (!double_reduc)
4533            scalar_results.quick_push (new_temp);
4534          else
4535            scalar_results[0] = new_temp;
4536        }
4537      else
4538        scalar_results[0] = new_temp;
4539
4540      new_phis[0] = epilog_stmt;
4541    }
4542
4543  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4544          phis with new adjusted scalar results, i.e., replace use <s_out0>
4545          with use <s_out4>.
4546
4547     Transform:
4548        loop_exit:
4549          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4550          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4551          v_out2 = reduce <v_out1>
4552          s_out3 = extract_field <v_out2, 0>
4553          s_out4 = adjust_result <s_out3>
4554          use <s_out0>
4555          use <s_out0>
4556
4557     into:
4558
4559        loop_exit:
4560          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4561          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4562          v_out2 = reduce <v_out1>
4563          s_out3 = extract_field <v_out2, 0>
4564          s_out4 = adjust_result <s_out3>
4565          use <s_out4>
4566          use <s_out4> */
4567
4568
4569  /* In a SLP reduction chain we reduce vector results into one vector if
4570     necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS of
4571     the last stmt in the reduction chain, since we are looking for the loop
4572     exit phi node.  */
4573  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4574    {
4575      scalar_dest = gimple_assign_lhs (
4576			SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4577      group_size = 1;
4578    }
4579
4580  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4581     case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
4582     need to match SCALAR_RESULTS with the corresponding statements.  The first
4583     (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4584     the first vector stmt, etc.
4585     (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
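  /* For illustration only (an assumed example, not taken from a particular
     testcase): with GROUP_SIZE == 4 and two vector stmts in NEW_PHIS we get
     RATIO == 2, so SCALAR_RESULTS[0] and SCALAR_RESULTS[1] are matched with
     NEW_PHIS[0], while SCALAR_RESULTS[2] and SCALAR_RESULTS[3] are matched
     with NEW_PHIS[1].  */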
4586  if (group_size > new_phis.length ())
4587    {
4588      ratio = group_size / new_phis.length ();
4589      gcc_assert (!(group_size % new_phis.length ()));
4590    }
4591  else
4592    ratio = 1;
4593
4594  for (k = 0; k < group_size; k++)
4595    {
4596      if (k % ratio == 0)
4597        {
4598          epilog_stmt = new_phis[k / ratio];
4599          reduction_phi = reduction_phis[k / ratio];
4600	  if (double_reduc)
4601	    inner_phi = inner_phis[k / ratio];
4602        }
4603
4604      if (slp_reduc)
4605        {
4606          gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4607
4608          orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4609          /* SLP statements can't participate in patterns.  */
4610          gcc_assert (!orig_stmt);
4611          scalar_dest = gimple_assign_lhs (current_stmt);
4612        }
4613
4614      phis.create (3);
4615      /* Find the loop-closed-use at the loop exit of the original scalar
4616         result.  (The reduction result is expected to have two immediate uses -
4617         one at the latch block, and one at the loop exit).  */
4618      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4619        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4620	    && !is_gimple_debug (USE_STMT (use_p)))
4621          phis.safe_push (USE_STMT (use_p));
4622
4623      /* While we expect to have found an exit_phi because of loop-closed-ssa
4624         form, we can end up without one if the scalar cycle is dead.  */
4625
4626      FOR_EACH_VEC_ELT (phis, i, exit_phi)
4627        {
4628          if (outer_loop)
4629            {
4630              stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4631              gphi *vect_phi;
4632
4633              /* FORNOW.  We do not currently support the case in which an
4634                 inner-loop reduction is not used in the outer-loop (but only
4635                 outside the outer-loop), unless it is a double reduction.  */
4636              gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4637                           && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4638                          || double_reduc);
4639
4640	      if (double_reduc)
4641		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4642	      else
4643		STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4644              if (!double_reduc
4645                  || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4646                      != vect_double_reduction_def)
4647                continue;
4648
4649              /* Handle double reduction:
4650
4651                 stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4652                 stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4653                 stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4654                 stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4655
4656                 At that point the regular reduction (stmt2 and stmt3) is
4657                 already vectorized, as well as the exit phi node, stmt4.
4658                 Here we vectorize the phi node of double reduction, stmt1, and
4659                 update all relevant statements.  */
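
              /* For illustration only (a sketch with assumed variable names,
                 not taken from the original sources), such a double reduction
                 typically arises from source code along the lines of:

                     int a[N][M], sum = 0;
                     for (i = 0; i < N; i++)
                       for (j = 0; j < M; j++)
                         sum += a[i][j];

                 The inner loop carries the regular reduction (stmt2 and stmt3
                 above), while the outer loop carries the double reduction phi
                 (stmt1) and its exit phi (stmt4).  */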
4660
4661              /* Go through all the uses of s2 to find the double reduction
4662                 phi node, i.e., stmt1 above.  */
4663              orig_name = PHI_RESULT (exit_phi);
4664              FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4665                {
4666                  stmt_vec_info use_stmt_vinfo;
4667                  stmt_vec_info new_phi_vinfo;
4668                  tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4669                  basic_block bb = gimple_bb (use_stmt);
4670                  gimple use;
4671
4672                  /* Check that USE_STMT is really a double reduction phi
4673                     node.  */
4674                  if (gimple_code (use_stmt) != GIMPLE_PHI
4675                      || gimple_phi_num_args (use_stmt) != 2
4676                      || bb->loop_father != outer_loop)
4677                    continue;
4678                  use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4679                  if (!use_stmt_vinfo
4680                      || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4681                          != vect_double_reduction_def)
4682		    continue;
4683
4684                  /* Create vector phi node for double reduction:
4685                     vs1 = phi <vs0, vs2>
4686                     vs1 was created previously in this function by a call to
4687                       vect_get_vec_def_for_operand and is stored in
4688                       vec_initial_def;
4689                     vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4690                     vs0 is created here.  */
4691
4692                  /* Create vector phi node.  */
4693                  vect_phi = create_phi_node (vec_initial_def, bb);
4694                  new_phi_vinfo = new_stmt_vec_info (vect_phi,
4695                                    loop_vec_info_for_loop (outer_loop), NULL);
4696                  set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4697
4698                  /* Create vs0 - initial def of the double reduction phi.  */
4699                  preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4700                                             loop_preheader_edge (outer_loop));
4701                  init_def = get_initial_def_for_reduction (stmt,
4702                                                          preheader_arg, NULL);
4703                  vect_phi_init = vect_init_vector (use_stmt, init_def,
4704                                                    vectype, NULL);
4705
4706                  /* Update phi node arguments with vs0 and vs2.  */
4707                  add_phi_arg (vect_phi, vect_phi_init,
4708                               loop_preheader_edge (outer_loop),
4709                               UNKNOWN_LOCATION);
4710                  add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4711                               loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4712                  if (dump_enabled_p ())
4713                    {
4714                      dump_printf_loc (MSG_NOTE, vect_location,
4715				       "created double reduction phi node: ");
4716                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4717                      dump_printf (MSG_NOTE, "\n");
4718                    }
4719
4720                  vect_phi_res = PHI_RESULT (vect_phi);
4721
4722                  /* Replace the use, i.e., set the correct vs1 in the regular
4723                     reduction phi node.  FORNOW, NCOPIES is always 1, so the
4724                     loop is redundant.  */
4725                  use = reduction_phi;
4726                  for (j = 0; j < ncopies; j++)
4727                    {
4728                      edge pr_edge = loop_preheader_edge (loop);
4729                      SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4730                      use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4731                    }
4732                }
4733            }
4734        }
4735
4736      phis.release ();
4737      if (nested_in_vect_loop)
4738        {
4739          if (double_reduc)
4740            loop = outer_loop;
4741          else
4742            continue;
4743        }
4744
4745      phis.create (3);
4746      /* Find the loop-closed-use at the loop exit of the original scalar
4747         result.  (The reduction result is expected to have two immediate uses,
4748         one at the latch block, and one at the loop exit).  For double
4749         reductions we are looking for exit phis of the outer loop.  */
4750      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4751        {
4752          if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4753	    {
4754	      if (!is_gimple_debug (USE_STMT (use_p)))
4755		phis.safe_push (USE_STMT (use_p));
4756	    }
4757          else
4758            {
4759              if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4760                {
4761                  tree phi_res = PHI_RESULT (USE_STMT (use_p));
4762
4763                  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4764                    {
4765                      if (!flow_bb_inside_loop_p (loop,
4766                                             gimple_bb (USE_STMT (phi_use_p)))
4767			  && !is_gimple_debug (USE_STMT (phi_use_p)))
4768                        phis.safe_push (USE_STMT (phi_use_p));
4769                    }
4770                }
4771            }
4772        }
4773
4774      FOR_EACH_VEC_ELT (phis, i, exit_phi)
4775        {
4776          /* Replace the uses:  */
4777          orig_name = PHI_RESULT (exit_phi);
4778          scalar_result = scalar_results[k];
4779          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4780            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4781              SET_USE (use_p, scalar_result);
4782        }
4783
4784      phis.release ();
4785    }
4786}
4787
4788
4789/* Function vectorizable_reduction.
4790
4791   Check if STMT performs a reduction operation that can be vectorized.
4792   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4793   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4794   Return FALSE if not a vectorizable STMT, TRUE otherwise.
4795
4796   This function also handles reduction idioms (patterns) that have been
4797   recognized in advance during vect_pattern_recog.  In this case, STMT may be
4798   of this form:
4799     X = pattern_expr (arg0, arg1, ..., X)
4800   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4801   sequence that had been detected and replaced by the pattern-stmt (STMT).
4802
4803   In some cases of reduction patterns, the type of the reduction variable X is
4804   different from the type of the other arguments of STMT.
4805   In such cases, the vectype that is used when transforming STMT into a vector
4806   stmt is different from the vectype that is used to determine the
4807   vectorization factor, because it consists of a different number of elements
4808   than the actual number of elements that are being operated upon in parallel.
4809
4810   For example, consider an accumulation of shorts into an int accumulator.
4811   On some targets it's possible to vectorize this pattern operating on 8
4812   shorts at a time (hence, the vectype for purposes of determining the
4813   vectorization factor should be V8HI); on the other hand, the vectype that
4814   is used to create the vector form is actually V4SI (the type of the result).
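
   For illustration only (a sketch, with names assumed rather than taken from
   a specific testcase), the source loop for such an accumulation is simply:

        short b[N]; int sum = 0; int i;

        for (i = 0; i < N; i++)
          sum += b[i];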
4815
4816   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4817   indicates the actual level of parallelism (V8HI in the example), so
4818   that the right vectorization factor is derived.  This vectype
4819   corresponds to the type of arguments to the reduction stmt, and should *NOT*
4820   be used to create the vectorized stmt.  The right vectype for the vectorized
4821   stmt is obtained from the type of the result X:
4822        get_vectype_for_scalar_type (TREE_TYPE (X))
4823
4824   This means that, contrary to "regular" reductions (or "regular" stmts in
4825   general), the following equation:
4826      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4827   does *NOT* necessarily hold for reduction patterns.  */
4828
4829bool
4830vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4831			gimple *vec_stmt, slp_tree slp_node)
4832{
4833  tree vec_dest;
4834  tree scalar_dest;
4835  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4836  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4837  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4838  tree vectype_in = NULL_TREE;
4839  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4840  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4841  enum tree_code code, orig_code, epilog_reduc_code;
4842  machine_mode vec_mode;
4843  int op_type;
4844  optab optab, reduc_optab;
4845  tree new_temp = NULL_TREE;
4846  tree def;
4847  gimple def_stmt;
4848  enum vect_def_type dt;
4849  gphi *new_phi = NULL;
4850  tree scalar_type;
4851  bool is_simple_use;
4852  gimple orig_stmt;
4853  stmt_vec_info orig_stmt_info;
4854  tree expr = NULL_TREE;
4855  int i;
4856  int ncopies;
4857  int epilog_copies;
4858  stmt_vec_info prev_stmt_info, prev_phi_info;
4859  bool single_defuse_cycle = false;
4860  tree reduc_def = NULL_TREE;
4861  gimple new_stmt = NULL;
4862  int j;
4863  tree ops[3];
4864  bool nested_cycle = false, found_nested_cycle_def = false;
4865  gimple reduc_def_stmt = NULL;
4866  /* By default the reduction variable is the last operand of the stmt.  */
4867  int reduc_index = 2;
4868  bool double_reduc = false, dummy;
4869  basic_block def_bb;
4870  struct loop * def_stmt_loop, *outer_loop = NULL;
4871  tree def_arg;
4872  gimple def_arg_stmt;
4873  auto_vec<tree> vec_oprnds0;
4874  auto_vec<tree> vec_oprnds1;
4875  auto_vec<tree> vect_defs;
4876  auto_vec<gimple> phis;
4877  int vec_num;
4878  tree def0, def1, tem, op0, op1 = NULL_TREE;
4879
4880  /* In case of a reduction chain we switch to the first stmt in the chain, but
4881     we don't update STMT_INFO, since only the last stmt is marked as a
4882     reduction and has reduction properties.  */
4883  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4884    stmt = GROUP_FIRST_ELEMENT (stmt_info);
4885
4886  if (nested_in_vect_loop_p (loop, stmt))
4887    {
4888      outer_loop = loop;
4889      loop = loop->inner;
4890      nested_cycle = true;
4891    }
4892
4893  /* 1. Is vectorizable reduction?  */
4894  /* Not supportable if the reduction variable is used in the loop, unless
4895     it's a reduction chain.  */
4896  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4897      && !GROUP_FIRST_ELEMENT (stmt_info))
4898    return false;
4899
4900  /* Reductions that are not used even in an enclosing outer-loop
4901     are expected to be "live" (used out of the loop).  */
4902  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4903      && !STMT_VINFO_LIVE_P (stmt_info))
4904    return false;
4905
4906  /* Make sure it was already recognized as a reduction computation.  */
4907  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4908      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4909    return false;
4910
4911  /* 2. Has this been recognized as a reduction pattern?
4912
4913     Check if STMT represents a pattern that has been recognized
4914     in earlier analysis stages.  For stmts that represent a pattern,
4915     the STMT_VINFO_RELATED_STMT field records the last stmt in
4916     the original sequence that constitutes the pattern.  */
4917
4918  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4919  if (orig_stmt)
4920    {
4921      orig_stmt_info = vinfo_for_stmt (orig_stmt);
4922      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4923      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4924    }
4925
4926  /* 3. Check the operands of the operation.  The first operands are defined
4927        inside the loop body. The last operand is the reduction variable,
4928        which is defined by the loop-header-phi.  */
4929
4930  gcc_assert (is_gimple_assign (stmt));
4931
4932  /* Flatten RHS.  */
4933  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4934    {
4935    case GIMPLE_SINGLE_RHS:
4936      op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4937      if (op_type == ternary_op)
4938	{
4939	  tree rhs = gimple_assign_rhs1 (stmt);
4940	  ops[0] = TREE_OPERAND (rhs, 0);
4941	  ops[1] = TREE_OPERAND (rhs, 1);
4942	  ops[2] = TREE_OPERAND (rhs, 2);
4943	  code = TREE_CODE (rhs);
4944	}
4945      else
4946	return false;
4947      break;
4948
4949    case GIMPLE_BINARY_RHS:
4950      code = gimple_assign_rhs_code (stmt);
4951      op_type = TREE_CODE_LENGTH (code);
4952      gcc_assert (op_type == binary_op);
4953      ops[0] = gimple_assign_rhs1 (stmt);
4954      ops[1] = gimple_assign_rhs2 (stmt);
4955      break;
4956
4957    case GIMPLE_TERNARY_RHS:
4958      code = gimple_assign_rhs_code (stmt);
4959      op_type = TREE_CODE_LENGTH (code);
4960      gcc_assert (op_type == ternary_op);
4961      ops[0] = gimple_assign_rhs1 (stmt);
4962      ops[1] = gimple_assign_rhs2 (stmt);
4963      ops[2] = gimple_assign_rhs3 (stmt);
4964      break;
4965
4966    case GIMPLE_UNARY_RHS:
4967      return false;
4968
4969    default:
4970      gcc_unreachable ();
4971    }
4972
4973  if (code == COND_EXPR && slp_node)
4974    return false;
4975
4976  scalar_dest = gimple_assign_lhs (stmt);
4977  scalar_type = TREE_TYPE (scalar_dest);
4978  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4979      && !SCALAR_FLOAT_TYPE_P (scalar_type))
4980    return false;
4981
4982  /* Do not try to vectorize bit-precision reductions.  */
4983  if ((TYPE_PRECISION (scalar_type)
4984       != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4985    return false;
4986
4987  /* All uses but the last are expected to be defined in the loop.
4988     The last use is the reduction variable.  In case of a nested cycle this
4989     assumption does not hold: we use reduc_index to record the index of the
4990     reduction variable.  */
4991  for (i = 0; i < op_type - 1; i++)
4992    {
4993      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4994      if (i == 0 && code == COND_EXPR)
4995        continue;
4996
4997      is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4998					    &def_stmt, &def, &dt, &tem);
4999      if (!vectype_in)
5000	vectype_in = tem;
5001      gcc_assert (is_simple_use);
5002
5003      if (dt != vect_internal_def
5004	  && dt != vect_external_def
5005	  && dt != vect_constant_def
5006	  && dt != vect_induction_def
5007          && !(dt == vect_nested_cycle && nested_cycle))
5008	return false;
5009
5010      if (dt == vect_nested_cycle)
5011        {
5012          found_nested_cycle_def = true;
5013          reduc_def_stmt = def_stmt;
5014          reduc_index = i;
5015        }
5016    }
5017
5018  is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5019					&def_stmt, &def, &dt, &tem);
5020  if (!vectype_in)
5021    vectype_in = tem;
5022  gcc_assert (is_simple_use);
5023  if (!found_nested_cycle_def)
5024    reduc_def_stmt = def_stmt;
5025
5026  if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5027    return false;
5028
5029  if (!(dt == vect_reduction_def
5030	|| dt == vect_nested_cycle
5031	|| ((dt == vect_internal_def || dt == vect_external_def
5032	     || dt == vect_constant_def || dt == vect_induction_def)
5033	    && nested_cycle && found_nested_cycle_def)))
5034    {
5035      /* For pattern recognized stmts, orig_stmt might be a reduction,
5036	 but some helper statements for the pattern might not, or
5037	 might be COND_EXPRs with reduction uses in the condition.  */
5038      gcc_assert (orig_stmt);
5039      return false;
5040    }
5041
5042  if (orig_stmt)
5043    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
5044                                                       reduc_def_stmt,
5045                                                       !nested_cycle,
5046                                                       &dummy));
5047  else
5048    {
5049      gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5050                                             !nested_cycle, &dummy);
5051      /* We changed STMT to be the first stmt in the reduction chain, hence we
5052         check that in this case the first element in the chain is STMT.  */
5053      gcc_assert (stmt == tmp
5054                  || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5055    }
5056
5057  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5058    return false;
5059
5060  if (slp_node || PURE_SLP_STMT (stmt_info))
5061    ncopies = 1;
5062  else
5063    ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5064               / TYPE_VECTOR_SUBPARTS (vectype_in));
5065
5066  gcc_assert (ncopies >= 1);
5067
5068  vec_mode = TYPE_MODE (vectype_in);
5069
5070  if (code == COND_EXPR)
5071    {
5072      if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5073        {
5074          if (dump_enabled_p ())
5075	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5076			     "unsupported condition in reduction\n");
5077
5078          return false;
5079        }
5080    }
5081  else
5082    {
5083      /* 4. Supportable by target?  */
5084
5085      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5086	  || code == LROTATE_EXPR || code == RROTATE_EXPR)
5087	{
5088	  /* Shifts and rotates are only supported by vectorizable_shift,
5089	     not vectorizable_reduction.  */
5090          if (dump_enabled_p ())
5091	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092			     "unsupported shift or rotation.\n");
5093	  return false;
5094	}
5095
5096      /* 4.1. check support for the operation in the loop  */
5097      optab = optab_for_tree_code (code, vectype_in, optab_default);
5098      if (!optab)
5099        {
5100          if (dump_enabled_p ())
5101	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102			     "no optab.\n");
5103
5104          return false;
5105        }
5106
5107      if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5108        {
5109          if (dump_enabled_p ())
5110            dump_printf (MSG_NOTE, "op not supported by target.\n");
5111
5112          if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5113              || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5114	          < vect_min_worthwhile_factor (code))
5115            return false;
5116
5117          if (dump_enabled_p ())
5118  	    dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5119        }
5120
5121      /* Worthwhile without SIMD support?  */
5122      if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5123          && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5124   	     < vect_min_worthwhile_factor (code))
5125        {
5126          if (dump_enabled_p ())
5127	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5128			     "not worthwhile without SIMD support.\n");
5129
5130          return false;
5131        }
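
      /* For illustration only (assumed numbers, not from a particular
         target): with word-mode generic vectors on a 64-bit target, a
         PLUS_EXPR reduction over 32-bit ints gives a vectorization factor
         of 2, below the factor of 4 returned by vect_min_worthwhile_factor,
         so the reduction is rejected by the check above.  */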
5132    }
5133
5134  /* 4.2. Check support for the epilog operation.
5135
5136          If STMT represents a reduction pattern, then the type of the
5137          reduction variable may be different than the type of the rest
5138          of the arguments.  For example, consider the case of accumulation
5139          of shorts into an int accumulator.  The original code:
5140                        S1: int_a = (int) short_a;
5141          orig_stmt->   S2: int_acc = plus <int_a, int_acc>;
5142
5143          was replaced with:
5144                        STMT: int_acc = widen_sum <short_a, int_acc>
5145
5146          This means that:
5147          1. The tree-code that is used to create the vector operation in the
5148             epilog code (that reduces the partial results) is not the
5149             tree-code of STMT, but is rather the tree-code of the original
5150             stmt from the pattern that STMT is replacing.  I.e., in the example
5151             above we want to use 'widen_sum' in the loop, but 'plus' in the
5152             epilog.
5153          2. The type (mode) we use to check available target support
5154             for the vector operation to be created in the *epilog*, is
5155             determined by the type of the reduction variable (in the example
5156             above we'd check this: optab_handler (plus_optab, vect_int_mode)).
5157             However the type (mode) we use to check available target support
5158             for the vector operation to be created *inside the loop*, is
5159             determined by the type of the other arguments to STMT (in the
5160             example we'd check this: optab_handler (widen_sum_optab,
5161	     vect_short_mode)).
5162
5163          This is contrary to "regular" reductions, in which the types of all
5164          the arguments are the same as the type of the reduction variable.
5165          For "regular" reductions we can therefore use the same vector type
5166          (and also the same tree-code) when generating the epilog code and
5167          when generating the code inside the loop.  */
5168
5169  if (orig_stmt)
5170    {
5171      /* This is a reduction pattern: get the vectype from the type of the
5172         reduction variable, and get the tree-code from orig_stmt.  */
5173      orig_code = gimple_assign_rhs_code (orig_stmt);
5174      gcc_assert (vectype_out);
5175      vec_mode = TYPE_MODE (vectype_out);
5176    }
5177  else
5178    {
5179      /* Regular reduction: the same vectype and tree-code as used for the
5180         vector code inside the loop can also be used for the epilog code.  */
5181      orig_code = code;
5182    }
5183
5184  if (nested_cycle)
5185    {
5186      def_bb = gimple_bb (reduc_def_stmt);
5187      def_stmt_loop = def_bb->loop_father;
5188      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5189                                       loop_preheader_edge (def_stmt_loop));
5190      if (TREE_CODE (def_arg) == SSA_NAME
5191          && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5192          && gimple_code (def_arg_stmt) == GIMPLE_PHI
5193          && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5194          && vinfo_for_stmt (def_arg_stmt)
5195          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5196              == vect_double_reduction_def)
5197        double_reduc = true;
5198    }
5199
5200  epilog_reduc_code = ERROR_MARK;
5201  if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5202    {
5203      reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5204                                         optab_default);
5205      if (!reduc_optab)
5206        {
5207          if (dump_enabled_p ())
5208	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5209			     "no optab for reduction.\n");
5210
5211          epilog_reduc_code = ERROR_MARK;
5212        }
5213      else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5214        {
5215          optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5216          if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5217            {
5218              if (dump_enabled_p ())
5219	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5220				 "reduc op not supported by target.\n");
5221
5222	      epilog_reduc_code = ERROR_MARK;
5223	    }
5224        }
5225    }
5226  else
5227    {
5228      if (!nested_cycle || double_reduc)
5229        {
5230          if (dump_enabled_p ())
5231	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5232			     "no reduc code for scalar code.\n");
5233
5234          return false;
5235        }
5236    }
5237
5238  if (double_reduc && ncopies > 1)
5239    {
5240      if (dump_enabled_p ())
5241	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5242			 "multiple types in double reduction\n");
5243
5244      return false;
5245    }
5246
5247  /* In case of a widening multiplication by a constant, we update the type
5248     of the constant to be the type of the other operand.  We check that the
5249     constant fits the type in the pattern recognition pass.  */
5250  if (code == DOT_PROD_EXPR
5251      && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5252    {
5253      if (TREE_CODE (ops[0]) == INTEGER_CST)
5254        ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5255      else if (TREE_CODE (ops[1]) == INTEGER_CST)
5256        ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5257      else
5258        {
5259          if (dump_enabled_p ())
5260	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5261			     "invalid types in dot-prod\n");
5262
5263          return false;
5264        }
5265    }
5266
5267  if (!vec_stmt) /* transformation not required.  */
5268    {
5269      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5270        return false;
5271      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5272      return true;
5273    }
5274
5275  /** Transform.  **/
5276
5277  if (dump_enabled_p ())
5278    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5279
5280  /* FORNOW: Multiple types are not supported for condition.  */
5281  if (code == COND_EXPR)
5282    gcc_assert (ncopies == 1);
5283
5284  /* Create the destination vector  */
5285  vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5286
5287  /* In case the vectorization factor (VF) is bigger than the number
5288     of elements that we can fit in a vectype (nunits), we have to generate
5289     more than one vector stmt, i.e., we need to "unroll" the
5290     vector stmt by a factor of VF/nunits.  For more details see the
5291     documentation in vectorizable_operation.  */
5292
5293  /* If the reduction is used in an outer loop we need to generate
5294     VF intermediate results, like so (e.g. for ncopies=2):
5295	r0 = phi (init, r0)
5296	r1 = phi (init, r1)
5297	r0 = x0 + r0;
5298        r1 = x1 + r1;
5299    (i.e. we generate VF results in 2 registers).
5300    In this case we have a separate def-use cycle for each copy, and therefore
5301    for each copy we get the vector def for the reduction variable from the
5302    respective phi node created for this copy.
5303
5304    Otherwise (the reduction is unused in the loop nest), we can combine
5305    together intermediate results, like so (e.g. for ncopies=2):
5306	r = phi (init, r)
5307	r = x0 + r;
5308	r = x1 + r;
5309   (i.e. we generate VF/2 results in a single register).
5310   In this case for each copy we get the vector def for the reduction variable
5311   from the vectorized reduction operation generated in the previous iteration.
5312  */
5313
5314  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5315    {
5316      single_defuse_cycle = true;
5317      epilog_copies = 1;
5318    }
5319  else
5320    epilog_copies = ncopies;
5321
5322  prev_stmt_info = NULL;
5323  prev_phi_info = NULL;
5324  if (slp_node)
5325    {
5326      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5327      gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5328                  == TYPE_VECTOR_SUBPARTS (vectype_in));
5329    }
5330  else
5331    {
5332      vec_num = 1;
5333      vec_oprnds0.create (1);
5334      if (op_type == ternary_op)
5335        vec_oprnds1.create (1);
5336    }
5337
5338  phis.create (vec_num);
5339  vect_defs.create (vec_num);
5340  if (!slp_node)
5341    vect_defs.quick_push (NULL_TREE);
5342
5343  for (j = 0; j < ncopies; j++)
5344    {
5345      if (j == 0 || !single_defuse_cycle)
5346	{
5347          for (i = 0; i < vec_num; i++)
5348            {
5349              /* Create the reduction-phi that defines the reduction
5350                 operand.  */
5351              new_phi = create_phi_node (vec_dest, loop->header);
5352              set_vinfo_for_stmt (new_phi,
5353                                  new_stmt_vec_info (new_phi, loop_vinfo,
5354                                                     NULL));
5355               if (j == 0 || slp_node)
5356                 phis.quick_push (new_phi);
5357            }
5358        }
5359
5360      if (code == COND_EXPR)
5361        {
5362          gcc_assert (!slp_node);
5363          vectorizable_condition (stmt, gsi, vec_stmt,
5364                                  PHI_RESULT (phis[0]),
5365                                  reduc_index, NULL);
5366          /* Multiple types are not supported for condition.  */
5367          break;
5368        }
5369
5370      /* Handle uses.  */
5371      if (j == 0)
5372        {
5373          op0 = ops[!reduc_index];
5374          if (op_type == ternary_op)
5375            {
5376              if (reduc_index == 0)
5377                op1 = ops[2];
5378              else
5379                op1 = ops[1];
5380            }
5381
5382          if (slp_node)
5383            vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5384                               slp_node, -1);
5385          else
5386            {
5387              loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5388                                                            stmt, NULL);
5389              vec_oprnds0.quick_push (loop_vec_def0);
5390              if (op_type == ternary_op)
5391               {
5392                 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5393                                                               NULL);
5394                 vec_oprnds1.quick_push (loop_vec_def1);
5395               }
5396            }
5397        }
5398      else
5399        {
5400          if (!slp_node)
5401            {
5402              enum vect_def_type dt;
5403              gimple dummy_stmt;
5404              tree dummy;
5405
5406              vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5407                                  &dummy_stmt, &dummy, &dt);
5408              loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5409                                                              loop_vec_def0);
5410              vec_oprnds0[0] = loop_vec_def0;
5411              if (op_type == ternary_op)
5412                {
5413                  vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5414                                      &dummy, &dt);
5415                  loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5416                                                                loop_vec_def1);
5417                  vec_oprnds1[0] = loop_vec_def1;
5418                }
5419            }
5420
5421          if (single_defuse_cycle)
5422            reduc_def = gimple_assign_lhs (new_stmt);
5423
5424          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5425        }
5426
5427      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5428        {
5429          if (slp_node)
5430            reduc_def = PHI_RESULT (phis[i]);
5431          else
5432            {
5433              if (!single_defuse_cycle || j == 0)
5434                reduc_def = PHI_RESULT (new_phi);
5435            }
5436
5437          def1 = ((op_type == ternary_op)
5438                  ? vec_oprnds1[i] : NULL);
5439          if (op_type == binary_op)
5440            {
5441              if (reduc_index == 0)
5442                expr = build2 (code, vectype_out, reduc_def, def0);
5443              else
5444                expr = build2 (code, vectype_out, def0, reduc_def);
5445            }
5446          else
5447            {
5448              if (reduc_index == 0)
5449                expr = build3 (code, vectype_out, reduc_def, def0, def1);
5450              else
5451                {
5452                  if (reduc_index == 1)
5453                    expr = build3 (code, vectype_out, def0, reduc_def, def1);
5454                  else
5455                    expr = build3 (code, vectype_out, def0, def1, reduc_def);
5456                }
5457            }
5458
5459          new_stmt = gimple_build_assign (vec_dest, expr);
5460          new_temp = make_ssa_name (vec_dest, new_stmt);
5461          gimple_assign_set_lhs (new_stmt, new_temp);
5462          vect_finish_stmt_generation (stmt, new_stmt, gsi);
5463
5464          if (slp_node)
5465            {
5466              SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5467              vect_defs.quick_push (new_temp);
5468            }
5469          else
5470            vect_defs[0] = new_temp;
5471        }
5472
5473      if (slp_node)
5474        continue;
5475
5476      if (j == 0)
5477	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5478      else
5479	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5480
5481      prev_stmt_info = vinfo_for_stmt (new_stmt);
5482      prev_phi_info = vinfo_for_stmt (new_phi);
5483    }
5484
5485  /* Finalize the reduction-phi (set its arguments) and create the
5486     epilog reduction code.  */
5487  if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5488    {
5489      new_temp = gimple_assign_lhs (*vec_stmt);
5490      vect_defs[0] = new_temp;
5491    }
5492
5493  vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5494                                    epilog_reduc_code, phis, reduc_index,
5495                                    double_reduc, slp_node);
5496
5497  return true;
5498}
5499
5500/* Function vect_min_worthwhile_factor.
5501
5502   For a loop where we could vectorize the operation indicated by CODE,
5503   return the minimum vectorization factor that makes it worthwhile
5504   to use generic vectors.  */
5505int
5506vect_min_worthwhile_factor (enum tree_code code)
5507{
5508  switch (code)
5509    {
5510    case PLUS_EXPR:
5511    case MINUS_EXPR:
5512    case NEGATE_EXPR:
5513      return 4;
5514
5515    case BIT_AND_EXPR:
5516    case BIT_IOR_EXPR:
5517    case BIT_XOR_EXPR:
5518    case BIT_NOT_EXPR:
5519      return 2;
5520
5521    default:
5522      return INT_MAX;
5523    }
5524}
5525
5526
5527/* Function vectorizable_induction
5528
5529   Check if PHI performs an induction computation that can be vectorized.
5530   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5531   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5532   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5533
5534bool
5535vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5536			gimple *vec_stmt)
5537{
5538  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5539  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5540  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5541  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5542  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5543  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5544  tree vec_def;
5545
5546  gcc_assert (ncopies >= 1);
5547  /* FORNOW. These restrictions should be relaxed.  */
5548  if (nested_in_vect_loop_p (loop, phi))
5549    {
5550      imm_use_iterator imm_iter;
5551      use_operand_p use_p;
5552      gimple exit_phi;
5553      edge latch_e;
5554      tree loop_arg;
5555
5556      if (ncopies > 1)
5557	{
5558	  if (dump_enabled_p ())
5559	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5560			     "multiple types in nested loop.\n");
5561	  return false;
5562	}
5563
5564      exit_phi = NULL;
5565      latch_e = loop_latch_edge (loop->inner);
5566      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5567      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5568	{
5569	  gimple use_stmt = USE_STMT (use_p);
5570	  if (is_gimple_debug (use_stmt))
5571	    continue;
5572
5573	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5574	    {
5575	      exit_phi = use_stmt;
5576	      break;
5577	    }
5578	}
5579      if (exit_phi)
5580	{
5581	  stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5582	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5583		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5584	    {
5585	      if (dump_enabled_p ())
5586		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5587				 "inner-loop induction only used outside "
5588				 "of the outer vectorized loop.\n");
5589	      return false;
5590	    }
5591	}
5592    }
5593
5594  if (!STMT_VINFO_RELEVANT_P (stmt_info))
5595    return false;
5596
5597  /* FORNOW: SLP not supported.  */
5598  if (STMT_SLP_TYPE (stmt_info))
5599    return false;
5600
5601  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5602
5603  if (gimple_code (phi) != GIMPLE_PHI)
5604    return false;
5605
5606  if (!vec_stmt) /* transformation not required.  */
5607    {
5608      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5609      if (dump_enabled_p ())
5610        dump_printf_loc (MSG_NOTE, vect_location,
5611                         "=== vectorizable_induction ===\n");
5612      vect_model_induction_cost (stmt_info, ncopies);
5613      return true;
5614    }
5615
5616  /** Transform.  **/
5617
5618  if (dump_enabled_p ())
5619    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5620
5621  vec_def = get_initial_def_for_induction (phi);
5622  *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5623  return true;
5624}
5625
5626/* Function vectorizable_live_operation.
5627
5628   STMT computes a value that is used outside the loop.  Check if
5629   it can be supported.  */
5630
5631bool
5632vectorizable_live_operation (gimple stmt,
5633			     gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5634			     gimple *vec_stmt)
5635{
5636  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5637  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5638  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5639  int i;
5640  int op_type;
5641  tree op;
5642  tree def;
5643  gimple def_stmt;
5644  enum vect_def_type dt;
5645  enum tree_code code;
5646  enum gimple_rhs_class rhs_class;
5647
5648  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5649
5650  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5651    return false;
5652
5653  if (!is_gimple_assign (stmt))
5654    {
5655      if (gimple_call_internal_p (stmt)
5656	  && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5657	  && gimple_call_lhs (stmt)
5658	  && loop->simduid
5659	  && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5660	  && loop->simduid
5661	     == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5662	{
5663	  edge e = single_exit (loop);
5664	  basic_block merge_bb = e->dest;
5665	  imm_use_iterator imm_iter;
5666	  use_operand_p use_p;
5667	  tree lhs = gimple_call_lhs (stmt);
5668
5669	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5670	    {
5671	      gimple use_stmt = USE_STMT (use_p);
5672	      if (gimple_code (use_stmt) == GIMPLE_PHI
5673		  && gimple_bb (use_stmt) == merge_bb)
5674		{
5675		  if (vec_stmt)
5676		    {
5677		      tree vfm1
5678			= build_int_cst (unsigned_type_node,
5679					 loop_vinfo->vectorization_factor - 1);
5680		      SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5681		    }
5682		  return true;
5683		}
5684	    }
5685	}
5686
5687      return false;
5688    }
5689
5690  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5691    return false;
5692
5693  /* FORNOW. CHECKME. */
5694  if (nested_in_vect_loop_p (loop, stmt))
5695    return false;
5696
5697  code = gimple_assign_rhs_code (stmt);
5698  op_type = TREE_CODE_LENGTH (code);
5699  rhs_class = get_gimple_rhs_class (code);
5700  gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5701  gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5702
5703  /* FORNOW: support only if all uses are invariant.  This means
5704     that the scalar operations can remain in place, unvectorized.
5705     The original last scalar value that they compute will be used.  */
5706
5707  for (i = 0; i < op_type; i++)
5708    {
5709      if (rhs_class == GIMPLE_SINGLE_RHS)
5710	op = TREE_OPERAND (gimple_op (stmt, 1), i);
5711      else
5712	op = gimple_op (stmt, i + 1);
5713      if (op
5714          && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5715				  &dt))
5716        {
5717          if (dump_enabled_p ())
5718	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719			     "use not simple.\n");
5720          return false;
5721        }
5722
5723      if (dt != vect_external_def && dt != vect_constant_def)
5724        return false;
5725    }
5726
5727  /* No transformation is required for the cases we currently support.  */
5728  return true;
5729}
5730
5731/* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5732
5733static void
5734vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5735{
5736  ssa_op_iter op_iter;
5737  imm_use_iterator imm_iter;
5738  def_operand_p def_p;
5739  gimple ustmt;
5740
5741  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5742    {
5743      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5744	{
5745	  basic_block bb;
5746
5747	  if (!is_gimple_debug (ustmt))
5748	    continue;
5749
5750	  bb = gimple_bb (ustmt);
5751
5752	  if (!flow_bb_inside_loop_p (loop, bb))
5753	    {
5754	      if (gimple_debug_bind_p (ustmt))
5755		{
5756		  if (dump_enabled_p ())
5757		    dump_printf_loc (MSG_NOTE, vect_location,
5758                                     "killing debug use\n");
5759
5760		  gimple_debug_bind_reset_value (ustmt);
5761		  update_stmt (ustmt);
5762		}
5763	      else
5764		gcc_unreachable ();
5765	    }
5766	}
5767    }
5768}
5769
5770
5771/* This function builds ni_name = number of iterations.  Statements
5772   are emitted on the loop preheader edge.  */
5773
5774static tree
5775vect_build_loop_niters (loop_vec_info loop_vinfo)
5776{
5777  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5778  if (TREE_CODE (ni) == INTEGER_CST)
5779    return ni;
5780  else
5781    {
5782      tree ni_name, var;
5783      gimple_seq stmts = NULL;
5784      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5785
5786      var = create_tmp_var (TREE_TYPE (ni), "niters");
5787      ni_name = force_gimple_operand (ni, &stmts, false, var);
5788      if (stmts)
5789	gsi_insert_seq_on_edge_immediate (pe, stmts);
5790
5791      return ni_name;
5792    }
5793}
5794
5795
5796/* This function generates the following statements:
5797
5798   ni_name = number of iterations loop executes
5799   ratio = ni_name / vf
5800   ratio_mult_vf_name = ratio * vf
5801
5802   and places them on the loop preheader edge.  */
5803
5804static void
5805vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5806				 tree ni_name,
5807				 tree *ratio_mult_vf_name_ptr,
5808				 tree *ratio_name_ptr)
5809{
5810  tree ni_minus_gap_name;
5811  tree var;
5812  tree ratio_name;
5813  tree ratio_mult_vf_name;
5814  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5815  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5816  tree log_vf;
5817
5818  log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5819
5820  /* If an epilogue loop is required because of data accesses with gaps, we
5821     subtract one iteration from the total number of iterations here for
5822     a correct calculation of RATIO.  */
5823  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5824    {
5825      ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5826				       ni_name,
5827			               build_one_cst (TREE_TYPE (ni_name)));
5828      if (!is_gimple_val (ni_minus_gap_name))
5829	{
5830	  var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5831          gimple stmts = NULL;
5832          ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5833						    true, var);
5834	  gsi_insert_seq_on_edge_immediate (pe, stmts);
5835        }
5836    }
5837  else
5838    ni_minus_gap_name = ni_name;
5839
5840  /* Create: ratio = ni >> log2(vf) */
5841  /* ???  As we have ni == number of latch executions + 1, ni could
5842     have overflowed to zero.  So avoid computing RATIO directly from ni;
5843     instead, use the fact that we know RATIO will be at least one and
5844     compute it as (ni - vf) >> log2(vf) + 1.  */
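  /* For illustration only (assumed numbers): with vf == 4 and ni == 12 this
     gives ((12 - 4) >> 2) + 1 == 3, the same as 12 >> 2.  If ni has wrapped
     around to zero, a plain ni >> 2 would yield 0, whereas
     ((0 - vf) >> 2) + 1 still yields the correct (large) ratio.  */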
5845  ratio_name
5846    = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5847		   fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5848				fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5849					     ni_minus_gap_name,
5850					     build_int_cst
5851					       (TREE_TYPE (ni_name), vf)),
5852				log_vf),
5853		   build_int_cst (TREE_TYPE (ni_name), 1));
5854  if (!is_gimple_val (ratio_name))
5855    {
5856      var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5857      gimple stmts = NULL;
5858      ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5859      gsi_insert_seq_on_edge_immediate (pe, stmts);
5860    }
5861  *ratio_name_ptr = ratio_name;
5862
5863  /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5864
5865  if (ratio_mult_vf_name_ptr)
5866    {
5867      ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5868					ratio_name, log_vf);
5869      if (!is_gimple_val (ratio_mult_vf_name))
5870	{
5871	  var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5872	  gimple stmts = NULL;
5873	  ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5874						     true, var);
5875	  gsi_insert_seq_on_edge_immediate (pe, stmts);
5876	}
5877      *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5878    }
5879
5880  return;
5881}
5882
5883
5884/* Function vect_transform_loop.
5885
5886   The analysis phase has determined that the loop is vectorizable.
5887   Vectorize the loop - create vectorized stmts to replace the scalar
5888   stmts in the loop, and update the loop exit condition.  */
5889
5890void
5891vect_transform_loop (loop_vec_info loop_vinfo)
5892{
5893  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5894  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5895  int nbbs = loop->num_nodes;
5896  int i;
5897  tree ratio = NULL;
5898  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5899  bool grouped_store;
5900  bool slp_scheduled = false;
5901  gimple stmt, pattern_stmt;
5902  gimple_seq pattern_def_seq = NULL;
5903  gimple_stmt_iterator pattern_def_si = gsi_none ();
5904  bool transform_pattern_stmt = false;
5905  bool check_profitability = false;
5906  int th;
5907  /* Number of iterations before we started tampering with the profile.  */
5908  gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5909
5910  if (dump_enabled_p ())
5911    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5912
5913  /* If the profile is imprecise, we have a chance to fix it up.  */
5914  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5915    expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5916
5917  /* Use the more conservative vectorization threshold.  If the number
5918     of iterations is constant, assume the cost check has been performed
5919     by our caller.  If the threshold makes all loops profitable that
5920     run at least the vectorization factor number of times, checking
5921     is pointless, too.  */
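  /* For illustration only (assumed numbers): with a vectorization factor of 4
     and a cost-model threshold of 2, no runtime check is needed - any loop
     that runs enough iterations to enter the vector code is profitable.  With
     a threshold of 10 and an unknown iteration count, a runtime profitability
     check is requested below so that short-running loops take the scalar
     path instead.  */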
5922  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5923  if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5924      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5925    {
5926      if (dump_enabled_p ())
5927	dump_printf_loc (MSG_NOTE, vect_location,
5928			 "Profitability threshold is %d loop iterations.\n",
5929                         th);
5930      check_profitability = true;
5931    }
5932
5933  /* Version the loop first, if required, so the profitability check
5934     comes first.  */
5935
5936  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5937      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5938    {
5939      vect_loop_versioning (loop_vinfo, th, check_profitability);
5940      check_profitability = false;
5941    }
5942
5943  tree ni_name = vect_build_loop_niters (loop_vinfo);
5944  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5945
5946  /* Peel the loop if there are data refs with unknown alignment.
5947     Peeling can force the alignment of only one such data ref.  */
5948
5949  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5950    {
5951      vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5952				     th, check_profitability);
5953      check_profitability = false;
5954      /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5955	 be re-computed.  */
5956      ni_name = NULL_TREE;
5957    }
5958
5959  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5960     compile-time constant), or it is a constant that is not a multiple of the
5961     vectorization factor, then an epilog loop needs to be created.
5962     We therefore duplicate the loop: the original loop will be vectorized,
5963     and will compute the first (n/VF) iterations.  The second copy of the loop
5964     will remain scalar and will compute the remaining (n%VF) iterations.
5965     (VF is the vectorization factor).  */
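  /* For illustration only (assumed numbers): with n == 103 and VF == 8, the
     vectorized loop executes 12 vector iterations covering the first 96
     scalar iterations, and the scalar epilog loop executes the remaining 7
     iterations.  */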
5966
5967  if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5968      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5969    {
5970      tree ratio_mult_vf;
5971      if (!ni_name)
5972	ni_name = vect_build_loop_niters (loop_vinfo);
5973      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5974				       &ratio);
5975      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5976				      th, check_profitability);
5977    }
5978  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5979    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5980		LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5981  else
5982    {
5983      if (!ni_name)
5984	ni_name = vect_build_loop_niters (loop_vinfo);
5985      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5986    }
5987
5988  /* 1) Make sure the loop header has exactly two entries
5989     2) Make sure we have a preheader basic block.  */
5990
5991  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5992
5993  split_edge (loop_preheader_edge (loop));
5994
5995  /* FORNOW: the vectorizer supports only loops whose body consists
5996     of one basic block (header + empty latch).  When the vectorizer
5997     supports more involved loop forms, the order in which the BBs are
5998     traversed will need to be reconsidered.  */
5999
6000  for (i = 0; i < nbbs; i++)
6001    {
6002      basic_block bb = bbs[i];
6003      stmt_vec_info stmt_info;
6004
6005      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6006	   gsi_next (&si))
6007        {
6008	  gphi *phi = si.phi ();
6009	  if (dump_enabled_p ())
6010	    {
6011	      dump_printf_loc (MSG_NOTE, vect_location,
6012                               "------>vectorizing phi: ");
6013	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6014              dump_printf (MSG_NOTE, "\n");
6015	    }
6016	  stmt_info = vinfo_for_stmt (phi);
6017	  if (!stmt_info)
6018	    continue;
6019
6020	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6021	    vect_loop_kill_debug_uses (loop, phi);
6022
6023	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6024	      && !STMT_VINFO_LIVE_P (stmt_info))
6025	    continue;
6026
6027	  if (STMT_VINFO_VECTYPE (stmt_info)
6028	      && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6029		  != (unsigned HOST_WIDE_INT) vectorization_factor)
6030	      && dump_enabled_p ())
6031	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6032
6033	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6034	    {
6035	      if (dump_enabled_p ())
6036		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6037	      vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6038	    }
6039	}
6040
6041      pattern_stmt = NULL;
6042      for (gimple_stmt_iterator si = gsi_start_bb (bb);
6043	   !gsi_end_p (si) || transform_pattern_stmt;)
6044	{
6045	  bool is_store;
6046
6047          if (transform_pattern_stmt)
6048	    stmt = pattern_stmt;
6049          else
6050	    {
6051	      stmt = gsi_stmt (si);
6052	      /* During vectorization remove existing clobber stmts.  */
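	      /* Clobbers only mark the end of a variable's lifetime and
		 compute no value, so they can simply be dropped here.  */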
6053	      if (gimple_clobber_p (stmt))
6054		{
6055		  unlink_stmt_vdef (stmt);
6056		  gsi_remove (&si, true);
6057		  release_defs (stmt);
6058		  continue;
6059		}
6060	    }
6061
6062	  if (dump_enabled_p ())
6063	    {
6064	      dump_printf_loc (MSG_NOTE, vect_location,
6065			       "------>vectorizing statement: ");
6066	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6067              dump_printf (MSG_NOTE, "\n");
6068	    }
6069
6070	  stmt_info = vinfo_for_stmt (stmt);
6071
6072	  /* Vector stmts created in the outer-loop during vectorization of
6073	     stmts in an inner-loop may not have a stmt_info and do not
6074	     need to be vectorized.  */
6075	  if (!stmt_info)
6076	    {
6077	      gsi_next (&si);
6078	      continue;
6079	    }
6080
6081	  if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6082	    vect_loop_kill_debug_uses (loop, stmt);
6083
6084	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
6085	      && !STMT_VINFO_LIVE_P (stmt_info))
6086            {
6087              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6088                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6089                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6090                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6091                {
6092                  stmt = pattern_stmt;
6093                  stmt_info = vinfo_for_stmt (stmt);
6094                }
6095              else
6096	        {
6097   	          gsi_next (&si);
6098	          continue;
6099                }
6100	    }
6101          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6102                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6103                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6104                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6105            transform_pattern_stmt = true;
6106
6107	  /* If pattern statement has def stmts, vectorize them too.  */
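	  /* The def stmts are vectorized one at a time: STMT is redirected
	     to the next relevant def stmt below, and SI is advanced only
	     after the whole def sequence and the pattern stmt itself have
	     been transformed.  */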
6108	  if (is_pattern_stmt_p (stmt_info))
6109	    {
6110	      if (pattern_def_seq == NULL)
6111		{
6112		  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6113		  pattern_def_si = gsi_start (pattern_def_seq);
6114		}
6115	      else if (!gsi_end_p (pattern_def_si))
6116		gsi_next (&pattern_def_si);
6117	      if (pattern_def_seq != NULL)
6118		{
6119		  gimple pattern_def_stmt = NULL;
6120		  stmt_vec_info pattern_def_stmt_info = NULL;
6121
6122		  while (!gsi_end_p (pattern_def_si))
6123		    {
6124		      pattern_def_stmt = gsi_stmt (pattern_def_si);
6125		      pattern_def_stmt_info
6126			= vinfo_for_stmt (pattern_def_stmt);
6127		      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6128			  || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6129			break;
6130		      gsi_next (&pattern_def_si);
6131		    }
6132
6133		  if (!gsi_end_p (pattern_def_si))
6134		    {
6135		      if (dump_enabled_p ())
6136			{
6137			  dump_printf_loc (MSG_NOTE, vect_location,
6138					   "==> vectorizing pattern def "
6139					   "stmt: ");
6140			  dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6141					    pattern_def_stmt, 0);
6142                          dump_printf (MSG_NOTE, "\n");
6143			}
6144
6145		      stmt = pattern_def_stmt;
6146		      stmt_info = pattern_def_stmt_info;
6147		    }
6148		  else
6149		    {
6150		      pattern_def_si = gsi_none ();
6151		      transform_pattern_stmt = false;
6152		    }
6153		}
6154	      else
6155		transform_pattern_stmt = false;
6156            }
6157
6158	  if (STMT_VINFO_VECTYPE (stmt_info))
6159	    {
6160	      unsigned int nunits
6161		= (unsigned int)
6162		  TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6163	      if (!STMT_SLP_TYPE (stmt_info)
6164		  && nunits != (unsigned int) vectorization_factor
6165		  && dump_enabled_p ())
6166		  /* For SLP, VF is set according to the unrolling factor rather
6167		     than the vector size, so this diagnostic is not valid for SLP.  */
6168		dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6169	    }
6170
6171	  /* SLP. Schedule all the SLP instances when the first SLP stmt is
6172	     reached.  */
6173	  if (STMT_SLP_TYPE (stmt_info))
6174	    {
6175	      if (!slp_scheduled)
6176		{
6177		  slp_scheduled = true;
6178
6179		  if (dump_enabled_p ())
6180		    dump_printf_loc (MSG_NOTE, vect_location,
6181				     "=== scheduling SLP instances ===\n");
6182
6183		  vect_schedule_slp (loop_vinfo, NULL);
6184		}
6185
6186	      /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6187	      if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6188		{
6189		  if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6190		    {
6191		      pattern_def_seq = NULL;
6192		      gsi_next (&si);
6193		    }
6194		  continue;
6195		}
6196	    }
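	  /* Pure SLP stmts were already handled by vect_schedule_slp above;
	     only hybrid SLP stmts and non-SLP stmts are vectorized by the
	     loop-based code below.  */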
6197
6198	  /* -------- vectorize statement ------------ */
6199	  if (dump_enabled_p ())
6200	    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6201
6202	  grouped_store = false;
6203	  is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6204          if (is_store)
6205            {
6206	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6207		{
6208		  /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
6209		     interleaving chain has been completed; free all the stores in
6210		     the chain.  */
6211		  gsi_next (&si);
6212		  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6213		}
6214	      else
6215		{
6216		  /* Free the attached stmt_vec_info and remove the stmt.  */
6217		  gimple store = gsi_stmt (si);
6218		  free_stmt_vec_info (store);
6219		  unlink_stmt_vdef (store);
6220		  gsi_remove (&si, true);
6221		  release_defs (store);
6222		}
6223
6224	      /* Stores can only appear at the end of pattern statements.  */
6225	      gcc_assert (!transform_pattern_stmt);
6226	      pattern_def_seq = NULL;
6227	    }
6228	  else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6229	    {
6230	      pattern_def_seq = NULL;
6231	      gsi_next (&si);
6232	    }
6233	}		        /* stmts in BB */
6234    }				/* BBs in loop */
6235
6236  slpeel_make_loop_iterate_ntimes (loop, ratio);
6237
6238  /* Scale the profile and iteration-count bounds down by the vectorization factor.  */
6239  scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6240		      expected_iterations / vectorization_factor);
6241  loop->nb_iterations_upper_bound
6242    = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
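  /* When peeling for gaps the last scalar iterations are always executed
     in the epilog loop, so the vectorized loop may run for one iteration
     fewer than the bound computed above.  */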
6243  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6244      && loop->nb_iterations_upper_bound != 0)
6245    loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6246  if (loop->any_estimate)
6247    {
6248      loop->nb_iterations_estimate
6249        = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6250       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6251	   && loop->nb_iterations_estimate != 0)
6252	 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6253    }
6254
6255  if (dump_enabled_p ())
6256    {
6257      dump_printf_loc (MSG_NOTE, vect_location,
6258		       "LOOP VECTORIZED\n");
6259      if (loop->inner)
6260	dump_printf_loc (MSG_NOTE, vect_location,
6261			 "OUTER LOOP VECTORIZED\n");
6262      dump_printf (MSG_NOTE, "\n");
6263    }
6264}
6265