1/* Data References Analysis and Manipulation Utilities for Vectorization.
2   Copyright (C) 2003-2015 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com>
4   and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "dumpfile.h"
26#include "tm.h"
27#include "hash-set.h"
28#include "machmode.h"
29#include "vec.h"
30#include "double-int.h"
31#include "input.h"
32#include "alias.h"
33#include "symtab.h"
34#include "wide-int.h"
35#include "inchash.h"
36#include "tree.h"
37#include "fold-const.h"
38#include "stor-layout.h"
39#include "tm_p.h"
40#include "target.h"
41#include "predict.h"
42#include "hard-reg-set.h"
43#include "function.h"
44#include "dominance.h"
45#include "cfg.h"
46#include "basic-block.h"
47#include "gimple-pretty-print.h"
48#include "tree-ssa-alias.h"
49#include "internal-fn.h"
50#include "tree-eh.h"
51#include "gimple-expr.h"
52#include "is-a.h"
53#include "gimple.h"
54#include "gimplify.h"
55#include "gimple-iterator.h"
56#include "gimplify-me.h"
57#include "gimple-ssa.h"
58#include "tree-phinodes.h"
59#include "ssa-iterators.h"
60#include "stringpool.h"
61#include "tree-ssanames.h"
62#include "tree-ssa-loop-ivopts.h"
63#include "tree-ssa-loop-manip.h"
64#include "tree-ssa-loop.h"
65#include "cfgloop.h"
66#include "tree-chrec.h"
67#include "tree-scalar-evolution.h"
68#include "tree-vectorizer.h"
69#include "diagnostic-core.h"
70#include "hash-map.h"
71#include "plugin-api.h"
72#include "ipa-ref.h"
73#include "cgraph.h"
74/* Need to include rtl.h, expr.h, etc. for optabs.  */
75#include "hashtab.h"
76#include "rtl.h"
77#include "flags.h"
78#include "statistics.h"
79#include "real.h"
80#include "fixed-value.h"
81#include "insn-config.h"
82#include "expmed.h"
83#include "dojump.h"
84#include "explow.h"
85#include "calls.h"
86#include "emit-rtl.h"
87#include "varasm.h"
88#include "stmt.h"
89#include "expr.h"
90#include "insn-codes.h"
91#include "optabs.h"
92#include "builtins.h"
93
94/* Return true if load- or store-lanes optab OPTAB is implemented for
95   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
96
97static bool
98vect_lanes_optab_supported_p (const char *name, convert_optab optab,
99			      tree vectype, unsigned HOST_WIDE_INT count)
100{
101  machine_mode mode, array_mode;
102  bool limit_p;
103
104  mode = TYPE_MODE (vectype);
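  /* Look for an integer mode that can hold COUNT vectors of mode MODE laid
     out back-to-back.  If the target supports a dedicated array mode for
     this combination, do not restrict the search to MAX_FIXED_MODE_SIZE.  */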
105  limit_p = !targetm.array_mode_supported_p (mode, count);
106  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
107			      MODE_INT, limit_p);
108
109  if (array_mode == BLKmode)
110    {
111      if (dump_enabled_p ())
112	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
113                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
114                         GET_MODE_NAME (mode), count);
115      return false;
116    }
117
118  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
119    {
120      if (dump_enabled_p ())
121	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
122                         "cannot use %s<%s><%s>\n", name,
123                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
124      return false;
125    }
126
127  if (dump_enabled_p ())
128    dump_printf_loc (MSG_NOTE, vect_location,
129                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
130                     GET_MODE_NAME (mode));
131
132  return true;
133}
134
135
136/* Return the smallest scalar part of STMT.
137   This is used to determine the vectype of the stmt.  We generally set the
138   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different from the type of the arguments (e.g., demotion,
140   promotion), vectype will be reset appropriately (later).  Note that we have
141   to visit the smallest datatype in this function, because that determines the
142   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation, we'd miss it.
144   Such a case, where a variable of this datatype does not appear in the lhs
145   anywhere in the loop, can only occur if it's an invariant: e.g.:
146   'int_x = (int) short_inv', which we'd expect to have been optimized away by
147   invariant motion.  However, we cannot rely on invariant motion to always
148   take invariants out of the loop, and so in the case of promotion we also
149   have to check the rhs.
150   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
151   types.  */
152
153tree
154vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
155                               HOST_WIDE_INT *rhs_size_unit)
156{
157  tree scalar_type = gimple_expr_type (stmt);
158  HOST_WIDE_INT lhs, rhs;
159
160  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
161
162  if (is_gimple_assign (stmt)
163      && (gimple_assign_cast_p (stmt)
164          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
165          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
166          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
167    {
168      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
169
170      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
171      if (rhs < lhs)
172        scalar_type = rhs_type;
173    }
174
175  *lhs_size_unit = lhs;
176  *rhs_size_unit = rhs;
177  return scalar_type;
178}
179
180
181/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
182   tested at run-time.  Return TRUE if DDR was successfully inserted.
183   Return false if versioning is not supported.  */
184
185static bool
186vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
187{
188  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189
190  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
191    return false;
192
193  if (dump_enabled_p ())
194    {
195      dump_printf_loc (MSG_NOTE, vect_location,
196                       "mark for run-time aliasing test between ");
197      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
198      dump_printf (MSG_NOTE,  " and ");
199      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
200      dump_printf (MSG_NOTE, "\n");
201    }
202
203  if (optimize_loop_nest_for_size_p (loop))
204    {
205      if (dump_enabled_p ())
206	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
207                         "versioning not supported when optimizing"
208                         " for size.\n");
209      return false;
210    }
211
212  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
213  if (loop->inner)
214    {
215      if (dump_enabled_p ())
216	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217                         "versioning not yet supported for outer-loops.\n");
218      return false;
219    }
220
221  /* FORNOW: We don't support creating runtime alias tests for non-constant
222     step.  */
223  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
224      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
225    {
226      if (dump_enabled_p ())
227	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228                         "versioning not yet supported for non-constant "
229                         "step\n");
230      return false;
231    }
232
233  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
234  return true;
235}
236
237
238/* Function vect_analyze_data_ref_dependence.
239
   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When the dependence can instead be
   checked at run time by versioning for alias, return FALSE.  Adjust *MAX_VF
   according to the data dependence.  */
244
245static bool
246vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
247                                  loop_vec_info loop_vinfo, int *max_vf)
248{
249  unsigned int i;
250  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251  struct data_reference *dra = DDR_A (ddr);
252  struct data_reference *drb = DDR_B (ddr);
253  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
254  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
255  lambda_vector dist_v;
256  unsigned int loop_depth;
257
258  /* In loop analysis all data references should be vectorizable.  */
259  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
260      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
261    gcc_unreachable ();
262
263  /* Independent data accesses.  */
264  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
265    return false;
266
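  /* A self-dependence, or a dependence between two reads, never prevents
     vectorization.  */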
267  if (dra == drb
268      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
269    return false;
270
271  /* Even if we have an anti-dependence then, as the vectorized loop covers at
272     least two scalar iterations, there is always also a true dependence.
273     As the vectorizer does not re-order loads and stores we can ignore
274     the anti-dependence if TBAA can disambiguate both DRs similar to the
275     case with known negative distance anti-dependences (positive
276     distance anti-dependences would violate TBAA constraints).  */
277  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
278       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
279      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
280				 get_alias_set (DR_REF (drb))))
281    return false;
282
283  /* Unknown data dependence.  */
284  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
285    {
286      /* If user asserted safelen consecutive iterations can be
287	 executed concurrently, assume independence.  */
288      if (loop->safelen >= 2)
289	{
290	  if (loop->safelen < *max_vf)
291	    *max_vf = loop->safelen;
292	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
293	  return false;
294	}
295
296      if (STMT_VINFO_GATHER_P (stmtinfo_a)
297	  || STMT_VINFO_GATHER_P (stmtinfo_b))
298	{
299	  if (dump_enabled_p ())
300	    {
301	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302			       "versioning for alias not supported for: "
303			       "can't determine dependence between ");
304	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305				 DR_REF (dra));
306	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308				 DR_REF (drb));
309	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310	    }
311	  return true;
312	}
313
314      if (dump_enabled_p ())
315	{
316	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317			   "versioning for alias required: "
318			   "can't determine dependence between ");
319	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320			     DR_REF (dra));
321	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323			     DR_REF (drb));
324	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325	}
326
327      /* Add to list of ddrs that need to be tested at run-time.  */
328      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
329    }
330
331  /* Known data dependence.  */
332  if (DDR_NUM_DIST_VECTS (ddr) == 0)
333    {
334      /* If user asserted safelen consecutive iterations can be
335	 executed concurrently, assume independence.  */
336      if (loop->safelen >= 2)
337	{
338	  if (loop->safelen < *max_vf)
339	    *max_vf = loop->safelen;
340	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
341	  return false;
342	}
343
344      if (STMT_VINFO_GATHER_P (stmtinfo_a)
345	  || STMT_VINFO_GATHER_P (stmtinfo_b))
346	{
347	  if (dump_enabled_p ())
348	    {
349	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350			       "versioning for alias not supported for: "
351			       "bad dist vector for ");
352	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353				 DR_REF (dra));
354	      dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355	      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
356				 DR_REF (drb));
357	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358	    }
359	  return true;
360	}
361
362      if (dump_enabled_p ())
363        {
364          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
365                           "versioning for alias required: "
366                           "bad dist vector for ");
367          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
368          dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
369          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
370          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371        }
372      /* Add to list of ddrs that need to be tested at run-time.  */
373      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
374    }
375
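  /* Known dependence with distance vectors: inspect the distance in the
     dimension that corresponds to the loop being vectorized.  */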
376  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
377  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
378    {
379      int dist = dist_v[loop_depth];
380
381      if (dump_enabled_p ())
382	dump_printf_loc (MSG_NOTE, vect_location,
383                         "dependence distance  = %d.\n", dist);
384
385      if (dist == 0)
386	{
387	  if (dump_enabled_p ())
388	    {
389	      dump_printf_loc (MSG_NOTE, vect_location,
390	                       "dependence distance == 0 between ");
391	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
392	      dump_printf (MSG_NOTE, " and ");
393	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
	      dump_printf (MSG_NOTE, "\n");
395	    }
396
397	  /* When we perform grouped accesses and perform implicit CSE
398	     by detecting equal accesses and doing disambiguation with
399	     runtime alias tests like for
400	        .. = a[i];
401		.. = a[i+1];
402		a[i] = ..;
403		a[i+1] = ..;
404		*p = ..;
405		.. = a[i];
406		.. = a[i+1];
407	     where we will end up loading { a[i], a[i+1] } once, make
408	     sure that inserting group loads before the first load and
409	     stores after the last store will do the right thing.
410	     Similar for groups like
411	        a[i] = ...;
412		... = a[i];
413		a[i+1] = ...;
414	     where loads from the group interleave with the store.  */
415	  if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
416	      || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
417	    {
418	      gimple earlier_stmt;
419	      earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
420	      if (DR_IS_WRITE
421		    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
422		{
423		  if (dump_enabled_p ())
424		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425				     "READ_WRITE dependence in interleaving."
426				     "\n");
427		  return true;
428		}
429	    }
430
431	  continue;
432	}
433
434      if (dist > 0 && DDR_REVERSED_P (ddr))
435	{
436	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
437	     reversed (to make distance vector positive), and the actual
438	     distance is negative.  */
439	  if (dump_enabled_p ())
440	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441	                     "dependence distance negative.\n");
442	  /* Record a negative dependence distance to later limit the
443	     amount of stmt copying / unrolling we can perform.
444	     Only need to handle read-after-write dependence.  */
445	  if (DR_IS_READ (drb)
446	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
447		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
448	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
449	  continue;
450	}
451
452      if (abs (dist) >= 2
453	  && abs (dist) < *max_vf)
454	{
455	  /* The dependence distance requires reduction of the maximal
456	     vectorization factor.  */
457	  *max_vf = abs (dist);
458	  if (dump_enabled_p ())
459	    dump_printf_loc (MSG_NOTE, vect_location,
460	                     "adjusting maximal vectorization factor to %i\n",
461	                     *max_vf);
462	}
463
464      if (abs (dist) >= *max_vf)
465	{
	  /* In this case the dependence distance does not create a dependence
	     as far as vectorization is concerned.  */
468	  if (dump_enabled_p ())
469	    dump_printf_loc (MSG_NOTE, vect_location,
470	                     "dependence distance >= VF.\n");
471	  continue;
472	}
473
474      if (dump_enabled_p ())
475	{
476	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477	               "not vectorized, possible dependence "
478	               "between data-refs ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
	  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483	}
484
485      return true;
486    }
487
488  return false;
489}
490
491/* Function vect_analyze_data_ref_dependences.
492
493   Examine all the data references in the loop, and make sure there do not
494   exist any data dependences between them.  Set *MAX_VF according to
495   the maximum vectorization factor the data dependences allow.  */
496
497bool
498vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
499{
500  unsigned int i;
501  struct data_dependence_relation *ddr;
502
503  if (dump_enabled_p ())
504    dump_printf_loc (MSG_NOTE, vect_location,
505                     "=== vect_analyze_data_ref_dependences ===\n");
506
507  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
508  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509				&LOOP_VINFO_DDRS (loop_vinfo),
510				LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511    return false;
512
513  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
514    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
515      return false;
516
517  return true;
518}
519
520
521/* Function vect_slp_analyze_data_ref_dependence.
522
   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB that prohibits SLP vectorization of the
   basic block.  Return FALSE otherwise.  */
527
528static bool
529vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
530{
531  struct data_reference *dra = DDR_A (ddr);
532  struct data_reference *drb = DDR_B (ddr);
533
534  /* We need to check dependences of statements marked as unvectorizable
     as well; they can still prohibit vectorization.  */
536
537  /* Independent data accesses.  */
538  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
539    return false;
540
541  if (dra == drb)
542    return false;
543
544  /* Read-read is OK.  */
545  if (DR_IS_READ (dra) && DR_IS_READ (drb))
546    return false;
547
548  /* If dra and drb are part of the same interleaving chain consider
549     them independent.  */
550  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
551      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
552	  == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
553    return false;
554
555  /* Unknown data dependence.  */
556  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
557    {
558      if  (dump_enabled_p ())
559	{
560	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
561			   "can't determine dependence between ");
562	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
563	  dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
564	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
565	  dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
566	}
567    }
568  else if (dump_enabled_p ())
569    {
570      dump_printf_loc (MSG_NOTE, vect_location,
571		       "determined dependence between ");
572      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
573      dump_printf (MSG_NOTE, " and ");
574      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
575      dump_printf (MSG_NOTE,  "\n");
576    }
577
578  /* We do not vectorize basic blocks with write-write dependencies.  */
579  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
580    return true;
581
  /* If we have a read-write dependence, check that the load is before the
     store.  When we vectorize basic blocks, the vector load can only be
     placed before the corresponding scalar load, and the vector store can
     only be placed after its corresponding scalar store.  So the order of
     the accesses is preserved if the load comes before the store.  */
587  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
588  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
589    {
590      /* That only holds for load-store pairs taking part in vectorization.  */
591      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
592	  && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
593	return false;
594    }
595
596  return true;
597}
598
599
/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them.  */
605
606bool
607vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
608{
609  struct data_dependence_relation *ddr;
610  unsigned int i;
611
612  if (dump_enabled_p ())
613    dump_printf_loc (MSG_NOTE, vect_location,
614                     "=== vect_slp_analyze_data_ref_dependences ===\n");
615
616  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
617				&BB_VINFO_DDRS (bb_vinfo),
618				vNULL, true))
619    return false;
620
621  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
622    if (vect_slp_analyze_data_ref_dependence (ddr))
623      return false;
624
625  return true;
626}
627
628
629/* Function vect_compute_data_ref_alignment
630
631   Compute the misalignment of the data reference DR.
632
633   Output:
634   1. If during the misalignment computation it is found that the data reference
635      cannot be vectorized then false is returned.
636   2. DR_MISALIGNMENT (DR) is defined.
637
638   FOR NOW: No analysis is actually performed. Misalignment is calculated
639   only for trivial cases. TODO.  */
640
641static bool
642vect_compute_data_ref_alignment (struct data_reference *dr)
643{
644  gimple stmt = DR_STMT (dr);
645  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
646  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
647  struct loop *loop = NULL;
648  tree ref = DR_REF (dr);
649  tree vectype;
650  tree base, base_addr;
651  tree misalign;
652  tree aligned_to;
653  unsigned HOST_WIDE_INT alignment;
654
655  if (dump_enabled_p ())
656    dump_printf_loc (MSG_NOTE, vect_location,
657                     "vect_compute_data_ref_alignment:\n");
658
659  if (loop_vinfo)
660    loop = LOOP_VINFO_LOOP (loop_vinfo);
661
662  /* Initialize misalignment to unknown.  */
663  SET_DR_MISALIGNMENT (dr, -1);
664
665  /* Strided loads perform only component accesses, misalignment information
666     is irrelevant for them.  */
667  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
668    return true;
669
670  misalign = DR_INIT (dr);
671  aligned_to = DR_ALIGNED_TO (dr);
672  base_addr = DR_BASE_ADDRESS (dr);
673  vectype = STMT_VINFO_VECTYPE (stmt_info);
674
  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop is a
     multiple of the vector size.  */
681  if (loop && nested_in_vect_loop_p (loop, stmt))
682    {
683      tree step = DR_STEP (dr);
684      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
685
686      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
687        {
688          if (dump_enabled_p ())
689            dump_printf_loc (MSG_NOTE, vect_location,
690                             "inner step divides the vector-size.\n");
691	  misalign = STMT_VINFO_DR_INIT (stmt_info);
692	  aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
693	  base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
694        }
695      else
696	{
697	  if (dump_enabled_p ())
698	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
699	                     "inner step doesn't divide the vector-size.\n");
700	  misalign = NULL_TREE;
701	}
702    }
703
  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref is a multiple
     of the vector size.  */
709  if (!loop)
710    {
711      tree step = DR_STEP (dr);
712      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
713
714      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
715	{
716	  if (dump_enabled_p ())
717	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
718	                     "SLP: step doesn't divide the vector-size.\n");
719	  misalign = NULL_TREE;
720	}
721    }
722
723  /* To look at alignment of the base we have to preserve an inner MEM_REF
724     as that carries alignment information of the actual access.  */
725  base = ref;
726  while (handled_component_p (base))
727    base = TREE_OPERAND (base, 0);
728  if (TREE_CODE (base) == MEM_REF)
729    base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
730		   build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
731  unsigned int base_alignment = get_object_alignment (base);
732
733  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
734    DR_VECT_AUX (dr)->base_element_aligned = true;
735
736  alignment = TYPE_ALIGN_UNIT (vectype);
737
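  /* If the base is not guaranteed to be aligned to at least the vector's
     alignment, or the constant offset from it is unknown, the misalignment
     remains unknown (it was initialized to -1 above).  */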
738  if ((compare_tree_int (aligned_to, alignment) < 0)
739      || !misalign)
740    {
741      if (dump_enabled_p ())
742	{
743	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744	                   "Unknown alignment for access: ");
745	  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
746	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
747	}
748      return true;
749    }
750
751  if (base_alignment < TYPE_ALIGN (vectype))
752    {
753      /* Strip an inner MEM_REF to a bare decl if possible.  */
754      if (TREE_CODE (base) == MEM_REF
755	  && integer_zerop (TREE_OPERAND (base, 1))
756	  && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
757	base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
758
759      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
760	{
761	  if (dump_enabled_p ())
762	    {
763	      dump_printf_loc (MSG_NOTE, vect_location,
764	                       "can't force alignment of ref: ");
765	      dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766	      dump_printf (MSG_NOTE, "\n");
767	    }
768	  return true;
769	}
770
771      /* Force the alignment of the decl.
772	 NOTE: This is the only change to the code we make during
773	 the analysis phase, before deciding to vectorize the loop.  */
774      if (dump_enabled_p ())
775        {
776          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
777          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
778          dump_printf (MSG_NOTE, "\n");
779        }
780
781      DR_VECT_AUX (dr)->base_decl = base;
782      DR_VECT_AUX (dr)->base_misaligned = true;
783      DR_VECT_AUX (dr)->base_element_aligned = true;
784    }
785
  /* If this is a backward running DR then the first access in the larger
     vectype actually is N-1 elements before the address given in the DR.
788     Adjust misalign accordingly.  */
789  if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
790    {
791      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
792      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
793	 otherwise we wouldn't be here.  */
794      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
795      /* PLUS because DR_STEP was negative.  */
796      misalign = size_binop (PLUS_EXPR, misalign, offset);
797    }
798
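  /* The misalignment is the offset in bytes from the preceding vector
     alignment boundary; use a floor modulo so that a negative offset wraps
     into the range [0, alignment).  */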
799  SET_DR_MISALIGNMENT (dr,
800		       wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
801
802  if (dump_enabled_p ())
803    {
804      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
805                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
806      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
807      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
808    }
809
810  return true;
811}
812
813
814/* Function vect_compute_data_refs_alignment
815
816   Compute the misalignment of data references in the loop.
817   Return FALSE if a data reference is found that cannot be vectorized.  */
818
819static bool
820vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
821                                  bb_vec_info bb_vinfo)
822{
823  vec<data_reference_p> datarefs;
824  struct data_reference *dr;
825  unsigned int i;
826
827  if (loop_vinfo)
828    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
829  else
830    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
831
832  FOR_EACH_VEC_ELT (datarefs, i, dr)
833    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
834        && !vect_compute_data_ref_alignment (dr))
835      {
836        if (bb_vinfo)
837          {
838            /* Mark unsupported statement as unvectorizable.  */
839            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
840            continue;
841          }
842        else
843          return false;
844      }
845
846  return true;
847}
848
849
850/* Function vect_update_misalignment_for_peel
851
852   DR - the data reference whose misalignment is to be adjusted.
853   DR_PEEL - the data reference whose misalignment is being made
854             zero in the vector loop by the peel.
855   NPEEL - the number of iterations in the peel loop if the misalignment
856           of DR_PEEL is known at compile time.  */
857
858static void
859vect_update_misalignment_for_peel (struct data_reference *dr,
860                                   struct data_reference *dr_peel, int npeel)
861{
862  unsigned int i;
863  vec<dr_p> same_align_drs;
864  struct data_reference *current_dr;
865  int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
866  int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
867  stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
868  stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
869
  /* For interleaved data accesses the step in the loop must be multiplied by
     the size of the interleaving group.  */
872  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
873    dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
874  if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
875    dr_peel_size *= GROUP_SIZE (peel_stmt_info);
876
877  /* It can be assumed that the data refs with the same alignment as dr_peel
878     are aligned in the vector loop.  */
879  same_align_drs
880    = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
881  FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
882    {
883      if (current_dr != dr)
884        continue;
885      gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
886                  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
887      SET_DR_MISALIGNMENT (dr, 0);
888      return;
889    }
890
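  /* Both alignments are known at compile time: adjust DR's misalignment by
     the NPEEL peeled scalar iterations (each moves the access by DR_SIZE
     bytes in the direction of DR_STEP) and reduce it modulo the vector
     alignment.  */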
891  if (known_alignment_for_access_p (dr)
892      && known_alignment_for_access_p (dr_peel))
893    {
894      bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
895      int misal = DR_MISALIGNMENT (dr);
896      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
897      misal += negative ? -npeel * dr_size : npeel * dr_size;
898      misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
899      SET_DR_MISALIGNMENT (dr, misal);
900      return;
901    }
902
903  if (dump_enabled_p ())
904    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
905  SET_DR_MISALIGNMENT (dr, -1);
906}
907
908
909/* Function vect_verify_datarefs_alignment
910
911   Return TRUE if all data references in the loop can be
912   handled with respect to alignment.  */
913
914bool
915vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
916{
917  vec<data_reference_p> datarefs;
918  struct data_reference *dr;
919  enum dr_alignment_support supportable_dr_alignment;
920  unsigned int i;
921
922  if (loop_vinfo)
923    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
924  else
925    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
926
927  FOR_EACH_VEC_ELT (datarefs, i, dr)
928    {
929      gimple stmt = DR_STMT (dr);
930      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
931
932      if (!STMT_VINFO_RELEVANT_P (stmt_info))
933	continue;
934
935      /* For interleaving, only the alignment of the first access matters.
936         Skip statements marked as not vectorizable.  */
937      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
938           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
939          || !STMT_VINFO_VECTORIZABLE (stmt_info))
940        continue;
941
942      /* Strided loads perform only component accesses, alignment is
943	 irrelevant for them.  */
944      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
945	continue;
946
947      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
948      if (!supportable_dr_alignment)
949        {
950          if (dump_enabled_p ())
951            {
952              if (DR_IS_READ (dr))
953                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
954                                 "not vectorized: unsupported unaligned load.");
955              else
956                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
957                                 "not vectorized: unsupported unaligned "
958                                 "store.");
959
960              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
961                                 DR_REF (dr));
962              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
963            }
964          return false;
965        }
966      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
967        dump_printf_loc (MSG_NOTE, vect_location,
968                         "Vectorizing an unaligned access.\n");
969    }
970  return true;
971}
972
/* Given a memory reference EXP, return whether its alignment is less
974   than its size.  */
975
976static bool
977not_size_aligned (tree exp)
978{
979  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
980    return true;
981
982  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
983	  > get_object_alignment (exp));
984}
985
986/* Function vector_alignment_reachable_p
987
988   Return true if vector alignment for DR is reachable by peeling
989   a few loop iterations.  Return false otherwise.  */
990
991static bool
992vector_alignment_reachable_p (struct data_reference *dr)
993{
994  gimple stmt = DR_STMT (dr);
995  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
996  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
997
998  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
999    {
      /* For interleaved accesses we peel only if the number of iterations
	 in the prolog loop (VF - misalignment) is a multiple of the
	 number of interleaved accesses.  */
1003      int elem_size, mis_in_elements;
1004      int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1005
1006      /* FORNOW: handle only known alignment.  */
1007      if (!known_alignment_for_access_p (dr))
1008	return false;
1009
1010      elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1011      mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1012
1013      if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1014	return false;
1015    }
1016
1017  /* If misalignment is known at the compile time then allow peeling
1018     only if natural alignment is reachable through peeling.  */
1019  if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1020    {
1021      HOST_WIDE_INT elmsize =
1022		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1023      if (dump_enabled_p ())
1024	{
1025	  dump_printf_loc (MSG_NOTE, vect_location,
1026	                   "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1027	  dump_printf (MSG_NOTE,
1028	               ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1029	}
1030      if (DR_MISALIGNMENT (dr) % elmsize)
1031	{
1032	  if (dump_enabled_p ())
1033	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034	                     "data size does not divide the misalignment.\n");
1035	  return false;
1036	}
1037    }
1038
1039  if (!known_alignment_for_access_p (dr))
1040    {
1041      tree type = TREE_TYPE (DR_REF (dr));
1042      bool is_packed = not_size_aligned (DR_REF (dr));
1043      if (dump_enabled_p ())
1044	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1045	                 "Unknown misalignment, is_packed = %d\n",is_packed);
1046      if ((TYPE_USER_ALIGN (type) && !is_packed)
1047	  || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1048	return true;
1049      else
1050	return false;
1051    }
1052
1053  return true;
1054}
1055
1056
1057/* Calculate the cost of the memory access represented by DR.  */
1058
1059static void
1060vect_get_data_access_cost (struct data_reference *dr,
1061                           unsigned int *inside_cost,
1062                           unsigned int *outside_cost,
1063			   stmt_vector_for_cost *body_cost_vec)
1064{
1065  gimple stmt = DR_STMT (dr);
1066  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1067  int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1068  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1069  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1070  int ncopies = vf / nunits;
1071
1072  if (DR_IS_READ (dr))
1073    vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1074			NULL, body_cost_vec, false);
1075  else
1076    vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1077
1078  if (dump_enabled_p ())
1079    dump_printf_loc (MSG_NOTE, vect_location,
1080                     "vect_get_data_access_cost: inside_cost = %d, "
1081                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
1082}
1083
1084
1085/* Insert DR into peeling hash table with NPEEL as key.  */
1086
1087static void
1088vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1089                          int npeel)
1090{
1091  struct _vect_peel_info elem, *slot;
1092  _vect_peel_info **new_slot;
1093  bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1094
1095  elem.npeel = npeel;
1096  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1097  if (slot)
1098    slot->count++;
1099  else
1100    {
1101      slot = XNEW (struct _vect_peel_info);
1102      slot->npeel = npeel;
1103      slot->dr = dr;
1104      slot->count = 1;
1105      new_slot
1106       	= LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1107      *new_slot = slot;
1108    }
1109
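  /* If the access is not supported when misaligned, bias this entry heavily
     so that a peeling that aligns it is preferred when the cost model is
     unlimited.  */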
1110  if (!supportable_dr_alignment
1111      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1112    slot->count += VECT_MAX_COST;
1113}
1114
1115
/* Traverse the peeling hash table to find the peeling option that aligns
   the maximum number of data accesses.  */
1118
1119int
1120vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1121				     _vect_peel_extended_info *max)
1122{
1123  vect_peel_info elem = *slot;
1124
1125  if (elem->count > max->peel_info.count
1126      || (elem->count == max->peel_info.count
1127          && max->peel_info.npeel > elem->npeel))
1128    {
1129      max->peel_info.npeel = elem->npeel;
1130      max->peel_info.count = elem->count;
1131      max->peel_info.dr = elem->dr;
1132    }
1133
1134  return 1;
1135}
1136
1137
/* Traverse the peeling hash table and calculate the cost of each peeling
   option.  Find the one with the lowest cost.  */
1140
1141int
1142vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1143				   _vect_peel_extended_info *min)
1144{
1145  vect_peel_info elem = *slot;
1146  int save_misalignment, dummy;
1147  unsigned int inside_cost = 0, outside_cost = 0, i;
1148  gimple stmt = DR_STMT (elem->dr);
1149  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1150  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1151  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1152  struct data_reference *dr;
1153  stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1154
1155  prologue_cost_vec.create (2);
1156  body_cost_vec.create (2);
1157  epilogue_cost_vec.create (2);
1158
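  /* Compute the vector body cost of every data reference as if ELEM->npeel
     iterations were peeled to align ELEM->dr, restoring the recorded
     misalignments afterwards.  */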
1159  FOR_EACH_VEC_ELT (datarefs, i, dr)
1160    {
1161      stmt = DR_STMT (dr);
1162      stmt_info = vinfo_for_stmt (stmt);
1163      /* For interleaving, only the alignment of the first access
1164         matters.  */
1165      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1166          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1167        continue;
1168
1169      save_misalignment = DR_MISALIGNMENT (dr);
1170      vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1171      vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1172				 &body_cost_vec);
1173      SET_DR_MISALIGNMENT (dr, save_misalignment);
1174    }
1175
1176  auto_vec<stmt_info_for_cost> scalar_cost_vec;
1177  vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
1178  outside_cost += vect_get_known_peeling_cost
1179    (loop_vinfo, elem->npeel, &dummy,
1180     &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec);
1181
1182  /* Prologue and epilogue costs are added to the target model later.
1183     These costs depend only on the scalar iteration cost, the
1184     number of peeling iterations finally chosen, and the number of
1185     misaligned statements.  So discard the information found here.  */
1186  prologue_cost_vec.release ();
1187  epilogue_cost_vec.release ();
1188
1189  if (inside_cost < min->inside_cost
1190      || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1191    {
1192      min->inside_cost = inside_cost;
1193      min->outside_cost = outside_cost;
1194      min->body_cost_vec.release ();
1195      min->body_cost_vec = body_cost_vec;
1196      min->peel_info.dr = elem->dr;
1197      min->peel_info.npeel = elem->npeel;
1198    }
1199  else
1200    body_cost_vec.release ();
1201
1202  return 1;
1203}
1204
1205
1206/* Choose best peeling option by traversing peeling hash table and either
1207   choosing an option with the lowest cost (if cost model is enabled) or the
1208   option that aligns as many accesses as possible.  */
1209
1210static struct data_reference *
1211vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1212                                       unsigned int *npeel,
1213				       stmt_vector_for_cost *body_cost_vec)
1214{
1215   struct _vect_peel_extended_info res;
1216
1217   res.peel_info.dr = NULL;
1218   res.body_cost_vec = stmt_vector_for_cost ();
1219
1220   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1221     {
1222       res.inside_cost = INT_MAX;
1223       res.outside_cost = INT_MAX;
1224       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1225           ->traverse <_vect_peel_extended_info *,
1226                       vect_peeling_hash_get_lowest_cost> (&res);
1227     }
1228   else
1229     {
1230       res.peel_info.count = 0;
1231       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1232           ->traverse <_vect_peel_extended_info *,
1233                       vect_peeling_hash_get_most_frequent> (&res);
1234     }
1235
1236   *npeel = res.peel_info.npeel;
1237   *body_cost_vec = res.body_cost_vec;
1238   return res.peel_info.dr;
1239}
1240
1241
1242/* Function vect_enhance_data_refs_alignment
1243
1244   This pass will use loop versioning and loop peeling in order to enhance
1245   the alignment of data references in the loop.
1246
1247   FOR NOW: we assume that whatever versioning/peeling takes place, only the
1248   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass are not supposed to be
1250   vectorized.  This restriction will be relaxed.
1251
1252   This pass will require a cost model to guide it whether to apply peeling
1253   or versioning or a combination of the two.  For example, the scheme that
1254   intel uses when given a loop with several memory accesses, is as follows:
1255   choose one memory access ('p') which alignment you want to force by doing
1256   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1257   other accesses are not necessarily aligned, or (2) use loop versioning to
1258   generate one loop in which all accesses are aligned, and another loop in
1259   which only 'p' is necessarily aligned.
1260
1261   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1263   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1264
1265   Devising a cost model is the most critical aspect of this work.  It will
1266   guide us on which access to peel for, whether to use loop versioning, how
1267   many versions to create, etc.  The cost model will probably consist of
1268   generic considerations as well as target specific considerations (on
1269   powerpc for example, misaligned stores are more painful than misaligned
1270   loads).
1271
1272   Here are the general steps involved in alignment enhancements:
1273
1274     -- original loop, before alignment analysis:
1275	for (i=0; i<N; i++){
1276	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1277	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1278	}
1279
1280     -- After vect_compute_data_refs_alignment:
1281	for (i=0; i<N; i++){
1282	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1283	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1284	}
1285
1286     -- Possibility 1: we do loop versioning:
1287     if (p is aligned) {
1288	for (i=0; i<N; i++){	# loop 1A
1289	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1290	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1291	}
1292     }
1293     else {
1294	for (i=0; i<N; i++){	# loop 1B
1295	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1296	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1297	}
1298     }
1299
1300     -- Possibility 2: we do loop peeling:
1301     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1302	x = q[i];
1303	p[i] = y;
1304     }
1305     for (i = 3; i < N; i++){	# loop 2A
1306	x = q[i];			# DR_MISALIGNMENT(q) = 0
1307	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1308     }
1309
1310     -- Possibility 3: combination of loop peeling and versioning:
1311     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1312	x = q[i];
1313	p[i] = y;
1314     }
1315     if (p is aligned) {
1316	for (i = 3; i<N; i++){	# loop 3A
1317	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1318	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1319	}
1320     }
1321     else {
1322	for (i = 3; i<N; i++){	# loop 3B
1323	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1324	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1325	}
1326     }
1327
1328     These loops are later passed to loop_transform to be vectorized.  The
1329     vectorizer will use the alignment information to guide the transformation
1330     (whether to generate regular loads/stores, or with special handling for
1331     misalignment).  */
1332
1333bool
1334vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1335{
1336  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1337  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1338  enum dr_alignment_support supportable_dr_alignment;
1339  struct data_reference *dr0 = NULL, *first_store = NULL;
1340  struct data_reference *dr;
1341  unsigned int i, j;
1342  bool do_peeling = false;
1343  bool do_versioning = false;
1344  bool stat;
1345  gimple stmt;
1346  stmt_vec_info stmt_info;
1347  unsigned int npeel = 0;
1348  bool all_misalignments_unknown = true;
1349  unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1350  unsigned possible_npeel_number = 1;
1351  tree vectype;
1352  unsigned int nelements, mis, same_align_drs_max = 0;
1353  stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1354
1355  if (dump_enabled_p ())
1356    dump_printf_loc (MSG_NOTE, vect_location,
1357                     "=== vect_enhance_data_refs_alignment ===\n");
1358
1359  /* While cost model enhancements are expected in the future, the high level
1360     view of the code at this time is as follows:
1361
1362     A) If there is a misaligned access then see if peeling to align
1363        this access can make all data references satisfy
1364        vect_supportable_dr_alignment.  If so, update data structures
1365        as needed and return true.
1366
1367     B) If peeling wasn't possible and there is a data reference with an
1368        unknown misalignment that does not satisfy vect_supportable_dr_alignment
1369        then see if loop versioning checks can be used to make all data
1370        references satisfy vect_supportable_dr_alignment.  If so, update
1371        data structures as needed and return true.
1372
1373     C) If neither peeling nor versioning were successful then return false if
1374        any data reference does not satisfy vect_supportable_dr_alignment.
1375
1376     D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377
1378     Note, Possibility 3 above (which is peeling and versioning together) is not
1379     being done at this time.  */
1380
1381  /* (1) Peeling to force alignment.  */
1382
1383  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1384     Considerations:
1385     + How many accesses will become aligned due to the peeling
1386     - How many accesses will become unaligned due to the peeling,
1387       and the cost of misaligned accesses.
1388     - The cost of peeling (the extra runtime checks, the increase
1389       in code size).  */
1390
1391  FOR_EACH_VEC_ELT (datarefs, i, dr)
1392    {
1393      stmt = DR_STMT (dr);
1394      stmt_info = vinfo_for_stmt (stmt);
1395
1396      if (!STMT_VINFO_RELEVANT_P (stmt_info))
1397	continue;
1398
1399      /* For interleaving, only the alignment of the first access
1400         matters.  */
1401      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1402          && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1403        continue;
1404
1405      /* For invariant accesses there is nothing to enhance.  */
1406      if (integer_zerop (DR_STEP (dr)))
1407	continue;
1408
1409      /* Strided loads perform only component accesses, alignment is
1410	 irrelevant for them.  */
1411      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1412	continue;
1413
1414      supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1415      do_peeling = vector_alignment_reachable_p (dr);
1416      if (do_peeling)
1417        {
1418          if (known_alignment_for_access_p (dr))
1419            {
1420              unsigned int npeel_tmp;
1421	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1422						    size_zero_node) < 0;
1423
1424              /* Save info about DR in the hash table.  */
1425              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1426                LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1427		  = new hash_table<peel_info_hasher> (1);
1428
1429              vectype = STMT_VINFO_VECTYPE (stmt_info);
1430              nelements = TYPE_VECTOR_SUBPARTS (vectype);
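              /* Convert the byte misalignment into elements and compute how
                 many scalar iterations must be peeled for this access to
                 become aligned; for a negative step the access runs
                 backwards, hence the different formula.  */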
1431              mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1432                                                TREE_TYPE (DR_REF (dr))));
1433              npeel_tmp = (negative
1434			   ? (mis - nelements) : (nelements - mis))
1435		  & (nelements - 1);
1436
              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, except for the immediate peeling amount, we also want
                 to try adding a full vector size, as long as we don't exceed
                 the vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
1450              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
                possible_npeel_number = vf / nelements;
1452
1453              /* Handle the aligned case. We may decide to align some other
1454                 access, making DR unaligned.  */
1455              if (DR_MISALIGNMENT (dr) == 0)
1456                {
1457                  npeel_tmp = 0;
1458                  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1459                    possible_npeel_number++;
1460                }
1461
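              /* Record each candidate peeling amount for this DR: the
                 minimal amount plus whole-vector multiples, staying within
                 the vectorization factor.  */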
1462              for (j = 0; j < possible_npeel_number; j++)
1463                {
1464                  gcc_assert (npeel_tmp <= vf);
1465                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1466                  npeel_tmp += nelements;
1467                }
1468
1469              all_misalignments_unknown = false;
1470              /* Data-ref that was chosen for the case that all the
1471                 misalignments are unknown is not relevant anymore, since we
1472                 have a data-ref with known alignment.  */
1473              dr0 = NULL;
1474            }
1475          else
1476            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.  */
1481              if (all_misalignments_unknown)
1482                {
1483		  unsigned same_align_drs
1484		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1485                  if (!dr0
1486		      || same_align_drs_max < same_align_drs)
1487                    {
1488                      same_align_drs_max = same_align_drs;
1489                      dr0 = dr;
1490                    }
1491		  /* For data-refs with the same number of related
1492		     accesses prefer the one where the misalign
1493		     computation will be invariant in the outermost loop.  */
1494		  else if (same_align_drs_max == same_align_drs)
1495		    {
1496		      struct loop *ivloop0, *ivloop;
1497		      ivloop0 = outermost_invariant_loop_for_expr
1498			  (loop, DR_BASE_ADDRESS (dr0));
1499		      ivloop = outermost_invariant_loop_for_expr
1500			  (loop, DR_BASE_ADDRESS (dr));
1501		      if ((ivloop && !ivloop0)
1502			  || (ivloop && ivloop0
1503			      && flow_loop_nested_p (ivloop, ivloop0)))
1504			dr0 = dr;
1505		    }
1506
1507                  if (!first_store && DR_IS_WRITE (dr))
1508                    first_store = dr;
1509                }
1510
1511              /* If there are both known and unknown misaligned accesses in the
1512                 loop, we choose peeling amount according to the known
1513                 accesses.  */
1514              if (!supportable_dr_alignment)
1515                {
1516                  dr0 = dr;
1517                  if (!first_store && DR_IS_WRITE (dr))
1518                    first_store = dr;
1519                }
1520            }
1521        }
1522      else
1523        {
1524          if (!aligned_access_p (dr))
1525            {
1526              if (dump_enabled_p ())
1527                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1528                                 "vector alignment may not be reachable\n");
1529              break;
1530            }
1531        }
1532    }
1533
1534  /* Check if we can possibly peel the loop.  */
1535  if (!vect_can_advance_ivs_p (loop_vinfo)
1536      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1537    do_peeling = false;
1538
1539  /* If we don't know how many times the peeling loop will run,
1540     assume it will run VF-1 times and disable peeling if the remaining
1541     iters are less than the vectorization factor.  */
1542  if (do_peeling
1543      && all_misalignments_unknown
1544      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545      && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1546	  < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1547    do_peeling = false;
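  /* Worked instance of the bound above (illustrative numbers): with vf == 4
     the prologue is assumed to run up to vf - 1 == 3 iterations, so a known
     iteration count below 2 * 4 - 1 == 7 could leave fewer than vf
     iterations for the vectorized loop, and peeling is disabled.  */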
1548
1549  if (do_peeling
1550      && all_misalignments_unknown
1551      && vect_supportable_dr_alignment (dr0, false))
1552    {
1553      /* Check if the target prefers to align stores over loads, i.e., if
1554         misaligned stores are more expensive than misaligned loads (taking
1555         drs with the same alignment into account).  */
1556      if (first_store && DR_IS_READ (dr0))
1557        {
1558          unsigned int load_inside_cost = 0, load_outside_cost = 0;
1559          unsigned int store_inside_cost = 0, store_outside_cost = 0;
1560          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1561          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1562	  stmt_vector_for_cost dummy;
1563	  dummy.create (2);
1564
1565          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1566				     &dummy);
1567          vect_get_data_access_cost (first_store, &store_inside_cost,
1568				     &store_outside_cost, &dummy);
1569
1570	  dummy.release ();
1571
1572          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1573             aligning the load DR0).  */
1574          load_inside_penalty = store_inside_cost;
1575          load_outside_penalty = store_outside_cost;
1576          for (i = 0;
1577	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1578			  DR_STMT (first_store))).iterate (i, &dr);
1579               i++)
1580            if (DR_IS_READ (dr))
1581              {
1582                load_inside_penalty += load_inside_cost;
1583                load_outside_penalty += load_outside_cost;
1584              }
1585            else
1586              {
1587                load_inside_penalty += store_inside_cost;
1588                load_outside_penalty += store_outside_cost;
1589              }
1590
1591          /* Calculate the penalty for leaving DR0 unaligned (by
1592             aligning the FIRST_STORE).  */
1593          store_inside_penalty = load_inside_cost;
1594          store_outside_penalty = load_outside_cost;
1595          for (i = 0;
1596	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1597		      DR_STMT (dr0))).iterate (i, &dr);
1598               i++)
1599            if (DR_IS_READ (dr))
1600              {
1601                store_inside_penalty += load_inside_cost;
1602                store_outside_penalty += load_outside_cost;
1603              }
1604            else
1605              {
1606                store_inside_penalty += store_inside_cost;
1607                store_outside_penalty += store_outside_cost;
1608              }
1609
1610          if (load_inside_penalty > store_inside_penalty
1611              || (load_inside_penalty == store_inside_penalty
1612                  && load_outside_penalty > store_outside_penalty))
1613            dr0 = first_store;
1614        }
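      /* Illustrative example (the costs are assumptions): if a misaligned
         load costs 1 and a misaligned store costs 2 inside the loop, and
         neither DR has same-aligned references, then leaving FIRST_STORE
         unaligned is penalized with 2 while leaving DR0 unaligned is
         penalized with 1, so the store is chosen for alignment above.  */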
1615
1616      /* In case there are only loads with different unknown misalignments, use
1617         peeling only if it may help to align other accesses in the loop.  */
1618      if (!first_store
1619	  && !STMT_VINFO_SAME_ALIGN_REFS (
1620		  vinfo_for_stmt (DR_STMT (dr0))).length ()
1621          && vect_supportable_dr_alignment (dr0, false)
1622              != dr_unaligned_supported)
1623        do_peeling = false;
1624    }
1625
1626  if (do_peeling && !dr0)
1627    {
1628      /* Peeling is possible, but no data access requires alignment in order
1629         to be vectorized.  So we try to choose the best possible peeling.  */
1630
1631      /* We should get here only if there are drs with known misalignment.  */
1632      gcc_assert (!all_misalignments_unknown);
1633
1634      /* Choose the best peeling from the hash table.  */
1635      dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1636						   &body_cost_vec);
1637      if (!dr0 || !npeel)
1638        do_peeling = false;
1639
1640      /* If peeling by npeel will result in a remaining loop not iterating
1641         enough to be vectorized then do not peel.  */
1642      if (do_peeling
1643	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1644	  && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1645	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1646	do_peeling = false;
1647    }
1648
1649  if (do_peeling)
1650    {
1651      stmt = DR_STMT (dr0);
1652      stmt_info = vinfo_for_stmt (stmt);
1653      vectype = STMT_VINFO_VECTYPE (stmt_info);
1654      nelements = TYPE_VECTOR_SUBPARTS (vectype);
1655
1656      if (known_alignment_for_access_p (dr0))
1657        {
1658	  bool negative = tree_int_cst_compare (DR_STEP (dr0),
1659						size_zero_node) < 0;
1660          if (!npeel)
1661            {
1662              /* Since it's known at compile time, compute the number of
1663                 iterations in the peeled loop (the peeling factor) for use in
1664                 updating DR_MISALIGNMENT values.  The peeling factor is the
1665                 vectorization factor minus the misalignment as an element
1666                 count.  */
1667              mis = DR_MISALIGNMENT (dr0);
1668              mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1669              npeel = ((negative ? mis - nelements : nelements - mis)
1670		       & (nelements - 1));
1671            }
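          /* For example (an illustration only): with V4SI vectors
             (nelements == 4), 4-byte elements and DR_MISALIGNMENT (dr0) == 8
             bytes, mis == 2 elements and, for a positive step,
             npeel == (4 - 2) & 3 == 2, i.e. two peeled iterations make the
             access vector aligned.  */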
1672
1673	  /* For interleaved data access every iteration accesses all the
1674	     members of the group, therefore we divide the number of iterations
1675	     by the group size.  */
1676	  stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1677	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1678	    npeel /= GROUP_SIZE (stmt_info);
1679
1680          if (dump_enabled_p ())
1681            dump_printf_loc (MSG_NOTE, vect_location,
1682                             "Try peeling by %d\n", npeel);
1683        }
1684
1685      /* Ensure that all data refs can be vectorized after the peel.  */
1686      FOR_EACH_VEC_ELT (datarefs, i, dr)
1687        {
1688          int save_misalignment;
1689
1690	  if (dr == dr0)
1691	    continue;
1692
1693	  stmt = DR_STMT (dr);
1694	  stmt_info = vinfo_for_stmt (stmt);
1695	  /* For interleaving, only the alignment of the first access
1696            matters.  */
1697	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1698	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1699	    continue;
1700
1701	  /* Strided loads perform only component accesses, alignment is
1702	     irrelevant for them.  */
1703	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1704	    continue;
1705
1706	  save_misalignment = DR_MISALIGNMENT (dr);
1707	  vect_update_misalignment_for_peel (dr, dr0, npeel);
1708	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1709	  SET_DR_MISALIGNMENT (dr, save_misalignment);
1710
1711	  if (!supportable_dr_alignment)
1712	    {
1713	      do_peeling = false;
1714	      break;
1715	    }
1716	}
1717
1718      if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1719        {
1720          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1721          if (!stat)
1722            do_peeling = false;
1723          else
1724	    {
1725	      body_cost_vec.release ();
1726	      return stat;
1727	    }
1728        }
1729
1730      if (do_peeling)
1731        {
1732          unsigned max_allowed_peel
1733            = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1734          if (max_allowed_peel != (unsigned)-1)
1735            {
1736              unsigned max_peel = npeel;
1737              if (max_peel == 0)
1738                {
1739                  gimple dr_stmt = DR_STMT (dr0);
1740                  stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1741                  tree vtype = STMT_VINFO_VECTYPE (vinfo);
1742                  max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1743                }
1744              if (max_peel > max_allowed_peel)
1745                {
1746                  do_peeling = false;
1747                  if (dump_enabled_p ())
1748                    dump_printf_loc (MSG_NOTE, vect_location,
1749                        "Disable peeling, max peels reached: %d\n", max_peel);
1750                }
1751            }
1752        }
1753
1754      if (do_peeling)
1755        {
1756          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1757             If the misalignment of DR_i is identical to that of dr0 then set
1758             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1759             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1760             by the peeling factor times the element size of DR_i (MOD the
1761             vectorization factor times the size).  Otherwise, the
1762             misalignment of DR_i must be set to unknown.  */
1763	  FOR_EACH_VEC_ELT (datarefs, i, dr)
1764	    if (dr != dr0)
1765	      vect_update_misalignment_for_peel (dr, dr0, npeel);
1766
1767          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1768          if (npeel)
1769            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1770          else
1771            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1772	      = DR_MISALIGNMENT (dr0);
1773	  SET_DR_MISALIGNMENT (dr0, 0);
1774	  if (dump_enabled_p ())
1775            {
1776              dump_printf_loc (MSG_NOTE, vect_location,
1777                               "Alignment of access forced using peeling.\n");
1778              dump_printf_loc (MSG_NOTE, vect_location,
1779                               "Peeling for alignment will be applied.\n");
1780            }
1781	  /* The inside-loop cost will be accounted for in vectorizable_load
1782	     and vectorizable_store correctly with adjusted alignments.
1783	     Drop the body_cost_vec on the floor here.  */
1784	  body_cost_vec.release ();
1785
1786	  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1787	  gcc_assert (stat);
1788          return stat;
1789        }
1790    }
1791
1792  body_cost_vec.release ();
1793
1794  /* (2) Versioning to force alignment.  */
1795
1796  /* Try versioning if:
1797     1) the loop is being optimized for speed, and
1798     2) there is at least one unsupported misaligned data ref with an unknown
1799        misalignment, and
1800     3) all misaligned data refs with a known misalignment are supported, and
1801     4) the number of runtime alignment checks is within reason.  */
1802
1803  do_versioning =
1804	optimize_loop_nest_for_speed_p (loop)
1805	&& (!loop->inner); /* FORNOW */
1806
1807  if (do_versioning)
1808    {
1809      FOR_EACH_VEC_ELT (datarefs, i, dr)
1810        {
1811	  stmt = DR_STMT (dr);
1812	  stmt_info = vinfo_for_stmt (stmt);
1813
1814	  /* For interleaving, only the alignment of the first access
1815	     matters.  */
1816	  if (aligned_access_p (dr)
1817	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1818		  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1819	    continue;
1820
1821	  /* Strided loads perform only component accesses, alignment is
1822	     irrelevant for them.  */
1823	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1824	    continue;
1825
1826	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1827
1828          if (!supportable_dr_alignment)
1829            {
1830              gimple stmt;
1831              int mask;
1832              tree vectype;
1833
1834              if (known_alignment_for_access_p (dr)
1835                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1836                     >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1837                {
1838                  do_versioning = false;
1839                  break;
1840                }
1841
1842              stmt = DR_STMT (dr);
1843              vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1844              gcc_assert (vectype);
1845
1846              /* The rightmost bits of an aligned address must be zeros.
1847                 Construct the mask needed for this test.  For example,
1848                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1849                 mask must be 15 = 0xf. */
1850              mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1851
1852              /* FORNOW: use the same mask to test all potentially unaligned
1853                 references in the loop.  The vectorizer currently supports
1854                 a single vector size, see the reference to
1855                 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1856                 vectorization factor is computed.  */
1857              gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1858                          || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1859              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1860              LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1861		      DR_STMT (dr));
1862            }
1863        }
1864
1865      /* Versioning requires at least one misaligned data reference.  */
1866      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1867        do_versioning = false;
1868      else if (!do_versioning)
1869        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1870    }
1871
1872  if (do_versioning)
1873    {
1874      vec<gimple> may_misalign_stmts
1875        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1876      gimple stmt;
1877
1878      /* It can now be assumed that the data references in the statements
1879         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1880         of the loop being vectorized.  */
1881      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1882        {
1883          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1884          dr = STMT_VINFO_DATA_REF (stmt_info);
1885	  SET_DR_MISALIGNMENT (dr, 0);
1886	  if (dump_enabled_p ())
1887            dump_printf_loc (MSG_NOTE, vect_location,
1888                             "Alignment of access forced using versioning.\n");
1889        }
1890
1891      if (dump_enabled_p ())
1892        dump_printf_loc (MSG_NOTE, vect_location,
1893                         "Versioning for alignment will be applied.\n");
1894
1895      /* Peeling and versioning can't be done together at this time.  */
1896      gcc_assert (! (do_peeling && do_versioning));
1897
1898      stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1899      gcc_assert (stat);
1900      return stat;
1901    }
1902
1903  /* This point is reached if neither peeling nor versioning is being done.  */
1904  gcc_assert (! (do_peeling || do_versioning));
1905
1906  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1907  return stat;
1908}
1909
1910
1911/* Function vect_find_same_alignment_drs.
1912
1913   Update group and alignment relations according to the chosen
1914   vectorization factor.  */
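/* For instance (an illustrative case): with a vectorization factor of 4 and
   4-byte elements, accesses a[i] and a[i+4] have dependence distance 4, and
   4 % 4 == 0 with equal access sizes, so each data-ref is recorded in the
   other's STMT_VINFO_SAME_ALIGN_REFS below.  */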
1915
1916static void
1917vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1918			      loop_vec_info loop_vinfo)
1919{
1920  unsigned int i;
1921  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1923  struct data_reference *dra = DDR_A (ddr);
1924  struct data_reference *drb = DDR_B (ddr);
1925  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1926  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1927  int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1928  int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1929  lambda_vector dist_v;
1930  unsigned int loop_depth;
1931
1932  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1933    return;
1934
1935  if (dra == drb)
1936    return;
1937
1938  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1939    return;
1940
1941  /* Loop-based vectorization and known data dependence.  */
1942  if (DDR_NUM_DIST_VECTS (ddr) == 0)
1943    return;
1944
1945  /* Data-dependence analysis reports a distance vector of zero
1946     for data-references that overlap only in the first iteration
1947     but have different sign step (see PR45764).
1948     So as a sanity check require equal DR_STEP.  */
1949  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1950    return;
1951
1952  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1953  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1954    {
1955      int dist = dist_v[loop_depth];
1956
1957      if (dump_enabled_p ())
1958	dump_printf_loc (MSG_NOTE, vect_location,
1959	                 "dependence distance  = %d.\n", dist);
1960
1961      /* Same loop iteration.  */
1962      if (dist == 0
1963	  || (dist % vectorization_factor == 0 && dra_size == drb_size))
1964	{
1965	  /* Two references with distance zero have the same alignment.  */
1966	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1967	  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1968	  if (dump_enabled_p ())
1969	    {
1970	      dump_printf_loc (MSG_NOTE, vect_location,
1971	                       "accesses have the same alignment.\n");
1972	      dump_printf (MSG_NOTE,
1973	                   "dependence distance modulo vf == 0 between ");
1974	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1975	      dump_printf (MSG_NOTE,  " and ");
1976	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1977	      dump_printf (MSG_NOTE, "\n");
1978	    }
1979	}
1980    }
1981}
1982
1983
1984/* Function vect_analyze_data_refs_alignment
1985
1986   Analyze the alignment of the data-references in the loop.
1987   Return FALSE if a data reference is found that cannot be vectorized.  */
1988
1989bool
1990vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1991                                  bb_vec_info bb_vinfo)
1992{
1993  if (dump_enabled_p ())
1994    dump_printf_loc (MSG_NOTE, vect_location,
1995                     "=== vect_analyze_data_refs_alignment ===\n");
1996
1997  /* Mark groups of data references with same alignment using
1998     data dependence information.  */
1999  if (loop_vinfo)
2000    {
2001      vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2002      struct data_dependence_relation *ddr;
2003      unsigned int i;
2004
2005      FOR_EACH_VEC_ELT (ddrs, i, ddr)
2006	vect_find_same_alignment_drs (ddr, loop_vinfo);
2007    }
2008
2009  if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2010    {
2011      if (dump_enabled_p ())
2012	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013	                 "not vectorized: can't calculate alignment "
2014	                 "for data ref.\n");
2015      return false;
2016    }
2017
2018  return true;
2019}
2020
2021
2022/* Analyze groups of accesses: check that DR belongs to a group of
2023   accesses of legal size, step, etc.  Detect gaps, single element
2024   interleaving, and other special cases. Set grouped access info.
2025   Collect groups of strided stores for further use in SLP analysis.  */
2026
2027static bool
2028vect_analyze_group_access (struct data_reference *dr)
2029{
2030  tree step = DR_STEP (dr);
2031  tree scalar_type = TREE_TYPE (DR_REF (dr));
2032  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2033  gimple stmt = DR_STMT (dr);
2034  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2035  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2036  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2037  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2038  HOST_WIDE_INT groupsize, last_accessed_element = 1;
2039  bool slp_impossible = false;
2040  struct loop *loop = NULL;
2041
2042  if (loop_vinfo)
2043    loop = LOOP_VINFO_LOOP (loop_vinfo);
2044
2045  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2046     size of the interleaving group (including gaps).  */
2047  groupsize = absu_hwi (dr_step) / type_size;
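  /* For example (illustrative values): a DR_STEP of 32 bytes over 4-byte
     elements gives groupsize == 8, i.e. the interleaving group spans eight
     elements per scalar iteration, gaps included.  */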
2048
2049  /* A non-consecutive access is possible only as part of interleaving.  */
2050  if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2051    {
2052      /* Check if this DR is part of an interleaving group, and is a single
2053	 element of the group that is accessed in the loop.  */
2054
2055      /* Gaps are supported only for loads. STEP must be a multiple of the type
2056	 size.  The size of the group must be a power of 2.  */
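      /* As an illustration (not a case from the sources): a load of a[4*i]
         with 4-byte ints has dr_step == 16, type_size == 4 and
         groupsize == 4, a power of two, so it is accepted below as a single
         element interleaving group with gaps; the same pattern used as a
         store falls through to the failure path instead.  */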
2057      if (DR_IS_READ (dr)
2058	  && (dr_step % type_size) == 0
2059	  && groupsize > 0
2060	  && exact_log2 (groupsize) != -1)
2061	{
2062	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2063	  GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2064	  if (dump_enabled_p ())
2065	    {
2066	      dump_printf_loc (MSG_NOTE, vect_location,
2067	                       "Detected single element interleaving ");
2068	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2069	      dump_printf (MSG_NOTE, " step ");
2070	      dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2071	      dump_printf (MSG_NOTE, "\n");
2072	    }
2073
2074	  if (loop_vinfo)
2075	    {
2076	      if (dump_enabled_p ())
2077		dump_printf_loc (MSG_NOTE, vect_location,
2078		                 "Data access with gaps requires scalar "
2079		                 "epilogue loop\n");
2080              if (loop->inner)
2081                {
2082                  if (dump_enabled_p ())
2083                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084                                     "Peeling for outer loop is not"
2085                                     " supported\n");
2086                  return false;
2087                }
2088
2089              LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2090	    }
2091
2092	  return true;
2093	}
2094
2095      if (dump_enabled_p ())
2096        {
2097 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098	                   "not consecutive access ");
2099	  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2100	  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2101        }
2102
2103      if (bb_vinfo)
2104        {
2105          /* Mark the statement as unvectorizable.  */
2106          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2107          return true;
2108        }
2109
2110      return false;
2111    }
2112
2113  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2114    {
2115      /* First stmt in the interleaving chain. Check the chain.  */
2116      gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2117      struct data_reference *data_ref = dr;
2118      unsigned int count = 1;
2119      tree prev_init = DR_INIT (data_ref);
2120      gimple prev = stmt;
2121      HOST_WIDE_INT diff, gaps = 0;
2122      unsigned HOST_WIDE_INT count_in_bytes;
2123
2124      while (next)
2125        {
2126          /* Skip same data-refs.  In case two or more stmts share a
2127             data-ref (supported only for loads), we vectorize only the first
2128             stmt, and the rest get their vectorized loads from the first
2129             one.  */
2130          if (!tree_int_cst_compare (DR_INIT (data_ref),
2131                                     DR_INIT (STMT_VINFO_DATA_REF (
2132						   vinfo_for_stmt (next)))))
2133            {
2134              if (DR_IS_WRITE (data_ref))
2135                {
2136                  if (dump_enabled_p ())
2137                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138                                     "Two store stmts share the same dr.\n");
2139                  return false;
2140                }
2141
2142              /* For load use the same data-ref load.  */
2143              GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2144
2145              prev = next;
2146              next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2147              continue;
2148            }
2149
2150          prev = next;
2151          data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2152
2153	  /* All group members have the same STEP by construction.  */
2154	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2155
2156          /* Check that the distance between two accesses is equal to the type
2157             size. Otherwise, we have gaps.  */
2158          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2159                  - TREE_INT_CST_LOW (prev_init)) / type_size;
2160	  if (diff != 1)
2161	    {
2162	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2163	      slp_impossible = true;
2164	      if (DR_IS_WRITE (data_ref))
2165		{
2166                  if (dump_enabled_p ())
2167                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168                                     "interleaved store with gaps\n");
2169		  return false;
2170		}
2171
2172              gaps += diff - 1;
2173	    }
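	  /* Worked example (the values are illustrative): for a group whose
	     members have DR_INITs 0, 4 and 12 with type_size == 4, the first
	     diff is 1 (no gap) and the second is 2, so one gap is recorded,
	     SLP becomes impossible for this load group, and the third member
	     gets GROUP_GAP == 2.  */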
2174
2175	  last_accessed_element += diff;
2176
2177          /* Store the gap from the previous member of the group. If there is no
2178             gap in the access, GROUP_GAP is always 1.  */
2179          GROUP_GAP (vinfo_for_stmt (next)) = diff;
2180
2181          prev_init = DR_INIT (data_ref);
2182          next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2183          /* Count the number of data-refs in the chain.  */
2184          count++;
2185        }
2186
2187      /* COUNT is the number of accesses found, we multiply it by the size of
2188         the type to get COUNT_IN_BYTES.  */
2189      count_in_bytes = type_size * count;
2190
2191      /* Check that the size of the interleaving (including gaps) is not
2192         greater than STEP.  */
2193      if (dr_step != 0
2194	  && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2195        {
2196          if (dump_enabled_p ())
2197            {
2198              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199                               "interleaving size is greater than step for ");
2200              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2201                                 DR_REF (dr));
2202              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2203            }
2204          return false;
2205        }
2206
2207      /* Check that the size of the interleaving is equal to STEP for stores,
2208         i.e., that there are no gaps.  */
2209      if (dr_step != 0
2210	  && absu_hwi (dr_step) != count_in_bytes)
2211        {
2212          if (DR_IS_READ (dr))
2213            {
2214              slp_impossible = true;
2215              /* There is a gap after the last load in the group.  This gap is the
2216                 difference between the groupsize and the number of elements.
2217		 When there is no gap, this difference should be 0.  */
2218              GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2219            }
2220          else
2221            {
2222              if (dump_enabled_p ())
2223                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224                                 "interleaved store with gaps\n");
2225              return false;
2226            }
2227        }
2228
2229      /* Check that STEP is a multiple of type size.  */
2230      if (dr_step != 0
2231	  && (dr_step % type_size) != 0)
2232        {
2233          if (dump_enabled_p ())
2234            {
2235              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236                               "step is not a multiple of type size: step ");
2237              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2238              dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2239              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2240                                 TYPE_SIZE_UNIT (scalar_type));
2241              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2242            }
2243          return false;
2244        }
2245
2246      if (groupsize == 0)
2247        groupsize = count;
2248
2249      GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2250      if (dump_enabled_p ())
2251        dump_printf_loc (MSG_NOTE, vect_location,
2252                         "Detected interleaving of size %d\n", (int)groupsize);
2253
2254      /* SLP: create an SLP data structure for every interleaving group of
2255	 stores for further analysis in vect_analyze_slp.  */
2256      if (DR_IS_WRITE (dr) && !slp_impossible)
2257        {
2258          if (loop_vinfo)
2259            LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2260          if (bb_vinfo)
2261            BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2262        }
2263
2264      /* There is a gap at the end of the group.  */
2265      if (groupsize - last_accessed_element > 0 && loop_vinfo)
2266	{
2267	  if (dump_enabled_p ())
2268	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269	                     "Data access with gaps requires scalar "
2270	                     "epilogue loop\n");
2271          if (loop->inner)
2272            {
2273              if (dump_enabled_p ())
2274                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275                                 "Peeling for outer loop is not supported\n");
2276              return false;
2277            }
2278
2279          LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2280	}
2281    }
2282
2283  return true;
2284}
2285
2286
2287/* Analyze the access pattern of the data-reference DR.
2288   In case of non-consecutive accesses call vect_analyze_group_access() to
2289   analyze groups of accesses.  */
2290
2291static bool
2292vect_analyze_data_ref_access (struct data_reference *dr)
2293{
2294  tree step = DR_STEP (dr);
2295  tree scalar_type = TREE_TYPE (DR_REF (dr));
2296  gimple stmt = DR_STMT (dr);
2297  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2298  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2299  struct loop *loop = NULL;
2300
2301  if (loop_vinfo)
2302    loop = LOOP_VINFO_LOOP (loop_vinfo);
2303
2304  if (loop_vinfo && !step)
2305    {
2306      if (dump_enabled_p ())
2307	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2308	                 "bad data-ref access in loop\n");
2309      return false;
2310    }
2311
2312  /* Allow invariant loads in non-nested loops.  */
2313  if (loop_vinfo && integer_zerop (step))
2314    {
2315      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2316      if (nested_in_vect_loop_p (loop, stmt))
2317	{
2318	  if (dump_enabled_p ())
2319	    dump_printf_loc (MSG_NOTE, vect_location,
2320			     "zero step in inner loop of nest\n");
2321	  return false;
2322	}
2323      return DR_IS_READ (dr);
2324    }
2325
2326  if (loop && nested_in_vect_loop_p (loop, stmt))
2327    {
2328      /* Interleaved accesses are not yet supported within outer-loop
2329        vectorization for references in the inner-loop.  */
2330      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331
2332      /* For the rest of the analysis we use the outer-loop step.  */
2333      step = STMT_VINFO_DR_STEP (stmt_info);
2334      if (integer_zerop (step))
2335	{
2336	  if (dump_enabled_p ())
2337	    dump_printf_loc (MSG_NOTE, vect_location,
2338	                     "zero step in outer loop.\n");
2339	  if (DR_IS_READ (dr))
2340  	    return true;
2341	  else
2342	    return false;
2343	}
2344    }
2345
2346  /* Consecutive?  */
2347  if (TREE_CODE (step) == INTEGER_CST)
2348    {
2349      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2350      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2351	  || (dr_step < 0
2352	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2353	{
2354	  /* Mark that it is not interleaving.  */
2355	  GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2356	  return true;
2357	}
2358    }
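  /* E.g. (illustrative): for 4-byte ints a DR_STEP of 4, or of -4 for a
     backward walk, matches the element size above and the access is treated
     as consecutive rather than grouped.  */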
2359
2360  if (loop && nested_in_vect_loop_p (loop, stmt))
2361    {
2362      if (dump_enabled_p ())
2363	dump_printf_loc (MSG_NOTE, vect_location,
2364	                 "grouped access in outer loop.\n");
2365      return false;
2366    }
2367
2368  /* Assume this is a DR handled by the non-constant strided load case.  */
2369  if (TREE_CODE (step) != INTEGER_CST)
2370    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2371
2372  /* Not consecutive access - check if it's a part of interleaving group.  */
2373  return vect_analyze_group_access (dr);
2374}
2375
2376
2377
2378/*  A helper function used in the comparator function to sort data
2379    references.  T1 and T2 are two trees (e.g. addresses or steps of data
2380    references) to be compared.  The function returns -1, 0, or 1.  */
2381
2382static int
2383compare_tree (tree t1, tree t2)
2384{
2385  int i, cmp;
2386  enum tree_code code;
2387  char tclass;
2388
2389  if (t1 == t2)
2390    return 0;
2391  if (t1 == NULL)
2392    return -1;
2393  if (t2 == NULL)
2394    return 1;
2395
2396
2397  if (TREE_CODE (t1) != TREE_CODE (t2))
2398    return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2399
2400  code = TREE_CODE (t1);
2401  switch (code)
2402    {
2403    /* For const values, we can just use hash values for comparisons.  */
2404    case INTEGER_CST:
2405    case REAL_CST:
2406    case FIXED_CST:
2407    case STRING_CST:
2408    case COMPLEX_CST:
2409    case VECTOR_CST:
2410      {
2411	hashval_t h1 = iterative_hash_expr (t1, 0);
2412	hashval_t h2 = iterative_hash_expr (t2, 0);
2413	if (h1 != h2)
2414	  return h1 < h2 ? -1 : 1;
2415	break;
2416      }
2417
2418    case SSA_NAME:
2419      cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2420      if (cmp != 0)
2421	return cmp;
2422
2423      if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2424	return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2425      break;
2426
2427    default:
2428      tclass = TREE_CODE_CLASS (code);
2429
2430      /* For var-decls, we can compare their UIDs.  */
2431      if (tclass == tcc_declaration)
2432	{
2433	  if (DECL_UID (t1) != DECL_UID (t2))
2434	    return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2435	  break;
2436	}
2437
2438      /* For expressions with operands, compare their operands recursively.  */
2439      for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2440	{
2441	  cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2442	  if (cmp != 0)
2443	    return cmp;
2444	}
2445    }
2446
2447  return 0;
2448}
2449
2450
2451/* Compare two data-references DRA and DRB so that qsort places
2452   data-references that may form a group next to each other.  */
2453
2454static int
2455dr_group_sort_cmp (const void *dra_, const void *drb_)
2456{
2457  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2458  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2459  int cmp;
2460
2461  /* Stabilize sort.  */
2462  if (dra == drb)
2463    return 0;
2464
2465  /* Ordering of DRs according to base.  */
2466  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2467    {
2468      cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2469      if (cmp != 0)
2470        return cmp;
2471    }
2472
2473  /* And according to DR_OFFSET.  */
2474  if (!dr_equal_offsets_p (dra, drb))
2475    {
2476      cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2477      if (cmp != 0)
2478        return cmp;
2479    }
2480
2481  /* Put reads before writes.  */
2482  if (DR_IS_READ (dra) != DR_IS_READ (drb))
2483    return DR_IS_READ (dra) ? -1 : 1;
2484
2485  /* Then sort by access size.  */
2486  if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2487			TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2488    {
2489      cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2490                          TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2491      if (cmp != 0)
2492        return cmp;
2493    }
2494
2495  /* And then by step.  */
2496  if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2497    {
2498      cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2499      if (cmp != 0)
2500        return cmp;
2501    }
2502
2503  /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2504  cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2505  if (cmp == 0)
2506    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2507  return cmp;
2508}
2509
2510/* Function vect_analyze_data_ref_accesses.
2511
2512   Analyze the access pattern of all the data references in the loop.
2513
2514   FORNOW: the only access pattern that is considered vectorizable is a
2515	   simple step 1 (consecutive) access.
2516
2517   FORNOW: handle only arrays and pointer accesses.  */
2518
2519bool
2520vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2521{
2522  unsigned int i;
2523  vec<data_reference_p> datarefs;
2524  struct data_reference *dr;
2525
2526  if (dump_enabled_p ())
2527    dump_printf_loc (MSG_NOTE, vect_location,
2528                     "=== vect_analyze_data_ref_accesses ===\n");
2529
2530  if (loop_vinfo)
2531    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2532  else
2533    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2534
2535  if (datarefs.is_empty ())
2536    return true;
2537
2538  /* Sort the array of datarefs to make building the interleaving chains
2539     linear.  Don't modify the original vector's order; it is needed for
2540     determining what dependencies are reversed.  */
2541  vec<data_reference_p> datarefs_copy = datarefs.copy ();
2542  datarefs_copy.qsort (dr_group_sort_cmp);
2543
2544  /* Build the interleaving chains.  */
2545  for (i = 0; i < datarefs_copy.length () - 1;)
2546    {
2547      data_reference_p dra = datarefs_copy[i];
2548      stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2549      stmt_vec_info lastinfo = NULL;
2550      for (i = i + 1; i < datarefs_copy.length (); ++i)
2551	{
2552	  data_reference_p drb = datarefs_copy[i];
2553	  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2554
2555	  /* ???  Imperfect sorting (non-compatible types, non-modulo
2556	     accesses, same accesses) can lead to a group being artificially
2557	     split here as we don't just skip over those.  If it really
2558	     matters we can push those to a worklist and re-iterate
2559	     over them.  Then we can just skip ahead to the next DR here.  */
2560
2561	  /* Check that the data-refs have same first location (except init)
2562	     and they are both either store or load (not load and store,
2563	     not masked loads or stores).  */
2564	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2565	      || !operand_equal_p (DR_BASE_ADDRESS (dra),
2566				   DR_BASE_ADDRESS (drb), 0)
2567	      || !dr_equal_offsets_p (dra, drb)
2568	      || !gimple_assign_single_p (DR_STMT (dra))
2569	      || !gimple_assign_single_p (DR_STMT (drb)))
2570	    break;
2571
2572	  /* Check that the data-refs have the same constant size and step.  */
2573	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2574	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2575	  if (!tree_fits_uhwi_p (sza)
2576	      || !tree_fits_uhwi_p (szb)
2577	      || !tree_int_cst_equal (sza, szb)
2578	      || !tree_fits_shwi_p (DR_STEP (dra))
2579	      || !tree_fits_shwi_p (DR_STEP (drb))
2580	      || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2581	    break;
2582
2583	  /* Do not place the same access in the interleaving chain twice.  */
2584	  if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2585	    break;
2586
2587	  /* Check the types are compatible.
2588	     ???  We don't distinguish this during sorting.  */
2589	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2590				   TREE_TYPE (DR_REF (drb))))
2591	    break;
2592
2593	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2594	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2595	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2596	  gcc_assert (init_a < init_b);
2597
2598	  /* If init_b == init_a + the size of the type * k, we have an
2599	     interleaving, and DRA is accessed before DRB.  */
2600	  HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2601	  if ((init_b - init_a) % type_size_a != 0)
2602	    break;
2603
2604	  /* The step (if not zero) must be greater than the difference between
2605	     the data-refs' inits; this splits groups into suitable sizes.  */
2606	  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2607	  if (step != 0 && step <= (init_b - init_a))
2608	    break;
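	  /* For instance (an illustration): with 4-byte elements, DR_INITs 0
	     and 16 and a DR_STEP of 16, init_b - init_a == 16 is not smaller
	     than the step, so the second access belongs to the next scalar
	     iteration and the chain is terminated here.  */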
2609
2610	  if (dump_enabled_p ())
2611	    {
2612	      dump_printf_loc (MSG_NOTE, vect_location,
2613			       "Detected interleaving ");
2614	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2615	      dump_printf (MSG_NOTE,  " and ");
2616	      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2617	      dump_printf (MSG_NOTE, "\n");
2618	    }
2619
2620	  /* Link the found element into the group list.  */
2621	  if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2622	    {
2623	      GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2624	      lastinfo = stmtinfo_a;
2625	    }
2626	  GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2627	  GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2628	  lastinfo = stmtinfo_b;
2629	}
2630    }
2631
2632  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2633    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2634        && !vect_analyze_data_ref_access (dr))
2635      {
2636	if (dump_enabled_p ())
2637	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2638	                   "not vectorized: complicated access pattern.\n");
2639
2640        if (bb_vinfo)
2641          {
2642            /* Mark the statement as not vectorizable.  */
2643            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2644            continue;
2645          }
2646        else
2647	  {
2648	    datarefs_copy.release ();
2649	    return false;
2650	  }
2651      }
2652
2653  datarefs_copy.release ();
2654  return true;
2655}
2656
2657
2658/* Operator == between two dr_with_seg_len objects.
2659
2660   This equality operator is used to make sure two data refs
2661   are the same one so that we will consider to combine the
2662   aliasing checks of those two pairs of data dependent data
2663   refs.  */
2664
2665static bool
2666operator == (const dr_with_seg_len& d1,
2667	     const dr_with_seg_len& d2)
2668{
2669  return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2670			  DR_BASE_ADDRESS (d2.dr), 0)
2671	   && compare_tree (d1.offset, d2.offset) == 0
2672	   && compare_tree (d1.seg_len, d2.seg_len) == 0;
2673}
2674
2675/* Function comp_dr_with_seg_len_pair.
2676
2677   Comparison function for sorting objects of dr_with_seg_len_pair_t
2678   so that we can combine aliasing checks in one scan.  */
2679
2680static int
2681comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2682{
2683  const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2684  const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2685
2686  const dr_with_seg_len &p11 = p1->first,
2687			&p12 = p1->second,
2688			&p21 = p2->first,
2689			&p22 = p2->second;
2690
2691  /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2692     if a and c have the same base address and step, and b and d have the same
2693     address and step.  Therefore, if either a&c or b&d differ in address or
2694     step, we don't care about the order of those two pairs after sorting.  */
2695  int comp_res;
2696
2697  if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2698				DR_BASE_ADDRESS (p21.dr))) != 0)
2699    return comp_res;
2700  if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2701				DR_BASE_ADDRESS (p22.dr))) != 0)
2702    return comp_res;
2703  if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2704    return comp_res;
2705  if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2706    return comp_res;
2707  if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2708    return comp_res;
2709  if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2710    return comp_res;
2711
2712  return 0;
2713}
2714
2715/* Function vect_vfa_segment_size.
2716
2717   Create an expression that computes the size of the segment
2718   that will be accessed for a data reference.  The function takes into
2719   account that realignment loads may access one more vector.
2720
2721   Input:
2722     DR: The data reference.
2723     LENGTH_FACTOR: segment length to consider.
2724
2725   Return an expression whose value is the size of segment which will be
2726   accessed by DR.  */
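/* As an illustration (the numbers are assumptions): a DR with a 4-byte step
   and LENGTH_FACTOR equal to a vectorization factor of 4 gets a segment
   length of 16 bytes; one extra vector size (e.g. 16 bytes) is added when
   the DR uses the optimized explicit-realignment scheme.  */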
2727
2728static tree
2729vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2730{
2731  tree segment_length;
2732
2733  if (integer_zerop (DR_STEP (dr)))
2734    segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2735  else
2736    segment_length = size_binop (MULT_EXPR,
2737				 fold_convert (sizetype, DR_STEP (dr)),
2738				 fold_convert (sizetype, length_factor));
2739
2740  if (vect_supportable_dr_alignment (dr, false)
2741	== dr_explicit_realign_optimized)
2742    {
2743      tree vector_size = TYPE_SIZE_UNIT
2744			  (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2745
2746      segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2747    }
2748  return segment_length;
2749}
2750
2751/* Function vect_prune_runtime_alias_test_list.
2752
2753   Prune a list of ddrs to be tested at run-time by versioning for alias.
2754   Merge several alias checks into one if possible.
2755   Return FALSE if the resulting list of ddrs is longer than allowed by
2756   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2757
2758bool
2759vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2760{
2761  vec<ddr_p> may_alias_ddrs =
2762    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2763  vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2764    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2765  int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2766  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2767
2768  ddr_p ddr;
2769  unsigned int i;
2770  tree length_factor;
2771
2772  if (dump_enabled_p ())
2773    dump_printf_loc (MSG_NOTE, vect_location,
2774                     "=== vect_prune_runtime_alias_test_list ===\n");
2775
2776  if (may_alias_ddrs.is_empty ())
2777    return true;
2778
2779  /* Basically, for each pair of dependent data refs store_ptr_0
2780     and load_ptr_0, we create an expression:
2781
2782     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2783     || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2784
2785     for aliasing checks.  However, in some cases we can decrease
2786     the number of checks by combining two checks into one.  For
2787     example, suppose we have another pair of data refs store_ptr_0
2788     and load_ptr_1, and if the following condition is satisfied:
2789
2790     load_ptr_0 < load_ptr_1  &&
2791     load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2792
2793     (this condition means that, in each iteration of the vectorized loop,
2794     the accessed memory of store_ptr_0 cannot be between the memory
2795     of load_ptr_0 and load_ptr_1.)
2796
2797     we can then use only the following expression to finish the
2798     aliasing checks between store_ptr_0 & load_ptr_0 and
2799     store_ptr_0 & load_ptr_1:
2800
2801     ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2802     || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2803
2804     Note that we only consider the case in which load_ptr_0 and load_ptr_1
2805     have the same base address.  */
2806
2807  comp_alias_ddrs.create (may_alias_ddrs.length ());
2808
2809  /* First, we collect all data ref pairs for aliasing checks.  */
2810  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2811    {
2812      struct data_reference *dr_a, *dr_b;
2813      gimple dr_group_first_a, dr_group_first_b;
2814      tree segment_length_a, segment_length_b;
2815      gimple stmt_a, stmt_b;
2816
2817      dr_a = DDR_A (ddr);
2818      stmt_a = DR_STMT (DDR_A (ddr));
2819      dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2820      if (dr_group_first_a)
2821	{
2822	  stmt_a = dr_group_first_a;
2823	  dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2824	}
2825
2826      dr_b = DDR_B (ddr);
2827      stmt_b = DR_STMT (DDR_B (ddr));
2828      dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2829      if (dr_group_first_b)
2830	{
2831	  stmt_b = dr_group_first_b;
2832	  dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2833	}
2834
2835      if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2836	length_factor = scalar_loop_iters;
2837      else
2838	length_factor = size_int (vect_factor);
2839      segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2840      segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2841
2842      dr_with_seg_len_pair_t dr_with_seg_len_pair
2843	  (dr_with_seg_len (dr_a, segment_length_a),
2844	   dr_with_seg_len (dr_b, segment_length_b));
2845
2846      if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2847	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2848
2849      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2850    }
2851
2852  /* Second, we sort the collected data ref pairs so that we can scan
2853     them once to combine all possible aliasing checks.  */
2854  comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2855
2856  /* Third, we scan the sorted dr pairs and check if we can combine
2857     alias checks of two neighbouring dr pairs.  */
2858  for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2859    {
2860      /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2861      dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2862		      *dr_b1 = &comp_alias_ddrs[i-1].second,
2863		      *dr_a2 = &comp_alias_ddrs[i].first,
2864		      *dr_b2 = &comp_alias_ddrs[i].second;
2865
2866      /* Remove duplicate data ref pairs.  */
2867      if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2868	{
2869	  if (dump_enabled_p ())
2870	    {
2871	      dump_printf_loc (MSG_NOTE, vect_location,
2872			       "found equal ranges ");
2873	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874				 DR_REF (dr_a1->dr));
2875	      dump_printf (MSG_NOTE,  ", ");
2876	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2877				 DR_REF (dr_b1->dr));
2878	      dump_printf (MSG_NOTE,  " and ");
2879	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880				 DR_REF (dr_a2->dr));
2881	      dump_printf (MSG_NOTE,  ", ");
2882	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
2883				 DR_REF (dr_b2->dr));
2884	      dump_printf (MSG_NOTE, "\n");
2885	    }
2886
2887	  comp_alias_ddrs.ordered_remove (i--);
2888	  continue;
2889	}
2890
2891      if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2892	{
2893	  /* We consider the case that DR_B1 and DR_B2 are the same memref,
2894	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
2895	  if (*dr_a1 == *dr_a2)
2896	    {
2897	      std::swap (dr_a1, dr_b1);
2898	      std::swap (dr_a2, dr_b2);
2899	    }
2900
2901	  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2902				DR_BASE_ADDRESS (dr_a2->dr),
2903				0)
2904	      || !tree_fits_shwi_p (dr_a1->offset)
2905	      || !tree_fits_shwi_p (dr_a2->offset))
2906	    continue;
2907
2908	  /* Make sure dr_a1 starts left of dr_a2.  */
2909	  if (tree_int_cst_lt (dr_a2->offset, dr_a1->offset))
2910	    std::swap (*dr_a1, *dr_a2);
2911
2912	  unsigned HOST_WIDE_INT diff
2913	    = tree_to_shwi (dr_a2->offset) - tree_to_shwi (dr_a1->offset);
2914
2915
2916	  bool do_remove = false;
2917
2918	  /* If the left segment does not extend beyond the start of the
2919	     right segment the new segment length is that of the right
2920	     plus the segment distance.  */
2921	  if (tree_fits_uhwi_p (dr_a1->seg_len)
2922	      && compare_tree_int (dr_a1->seg_len, diff) <= 0)
2923	    {
2924	      dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
2925					   size_int (diff));
2926	      do_remove = true;
2927	    }
2928	  /* Generally the new segment length is the maximum of the
2929	     left segment size and the right segment size plus the distance.
2930	     ???  We can also build tree MAX_EXPR here but it's not clear this
2931	     is profitable.  */
2932	  else if (tree_fits_uhwi_p (dr_a1->seg_len)
2933		   && tree_fits_uhwi_p (dr_a2->seg_len))
2934	    {
2935	      unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
2936	      unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
2937	      dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
2938	      do_remove = true;
2939	    }
2940	  /* Now we check if the following condition is satisfied:
2941
2942	     DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2943
2944	     where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2945	     SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2946	     have to make a best estimation.  We can get the minimum value
2947	     of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2948	     then either of the following two conditions can guarantee the
2949	     one above:
2950
2951	     1: DIFF <= MIN_SEG_LEN_B
2952	     2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B  */
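	  /* For example (the values are illustrative): if DIFF is 8 and the
	     minimum segment length of DR_B1 is known to be 16, condition 1
	     holds (8 <= 16), so DR_A1's segment is widened to DR_A2's segment
	     length plus 8 and the second check can be dropped.  */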
2953	  else
2954	    {
2955	      unsigned HOST_WIDE_INT min_seg_len_b
2956		= (tree_fits_uhwi_p (dr_b1->seg_len)
2957		   ? tree_to_uhwi (dr_b1->seg_len)
2958		   : vect_factor);
2959
2960	      if (diff <= min_seg_len_b
2961		  || (tree_fits_uhwi_p (dr_a1->seg_len)
2962		      && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
2963		{
2964		  dr_a1->seg_len = size_binop (PLUS_EXPR,
2965					       dr_a2->seg_len, size_int (diff));
2966		  do_remove = true;
2967		}
2968	    }
2969
2970	  if (do_remove)
2971	    {
2972	      if (dump_enabled_p ())
2973		{
2974		  dump_printf_loc (MSG_NOTE, vect_location,
2975				   "merging ranges for ");
2976		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
2977		  dump_printf (MSG_NOTE,  ", ");
2978		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
2979		  dump_printf (MSG_NOTE,  " and ");
2980		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
2981		  dump_printf (MSG_NOTE,  ", ");
2982		  dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
2983		  dump_printf (MSG_NOTE, "\n");
2984		}
2985	      comp_alias_ddrs.ordered_remove (i--);
2986	    }
2987	}
2988    }
2989
2990  dump_printf_loc (MSG_NOTE, vect_location,
2991		   "improved number of alias checks from %d to %d\n",
2992		   may_alias_ddrs.length (), comp_alias_ddrs.length ());
2993  if ((int) comp_alias_ddrs.length () >
2994      PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2995    return false;
2996
2997  return true;
2998}
2999
3000/* Check whether a non-affine read in stmt is suitable for gather load
3001   and if so, return a builtin decl for that operation.  */
3002
3003tree
3004vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
3005		   tree *offp, int *scalep)
3006{
3007  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3008  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3009  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3010  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3011  tree offtype = NULL_TREE;
3012  tree decl, base, off;
3013  machine_mode pmode;
3014  int punsignedp, pvolatilep;
3015
3016  base = DR_REF (dr);
3017  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3018     see if we can use the def stmt of the address.  */
3019  if (is_gimple_call (stmt)
3020      && gimple_call_internal_p (stmt)
3021      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3022	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3023      && TREE_CODE (base) == MEM_REF
3024      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3025      && integer_zerop (TREE_OPERAND (base, 1))
3026      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3027    {
3028      gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3029      if (is_gimple_assign (def_stmt)
3030	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3031	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3032    }
3033
3034  /* The gather builtins need address of the form
3035     loop_invariant + vector * {1, 2, 4, 8}
3036     or
3037     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3038     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3039     of loop invariants/SSA_NAMEs defined in the loop, with casts,
3040     multiplications and additions in it.  To get a vector, we need
3041     a single SSA_NAME that will be defined in the loop and will
3042     contain everything that is not loop invariant and that can be
3043     vectorized.  The following code attempts to find such a preexisting
3044     SSA_NAME OFF and put the loop invariants into a tree BASE
3045     that can be gimplified before the loop.  */
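  /* For example (purely illustrative), for a loop such as

       for (i = 0; i < n; i++)
	 sum += p[idx[i]];

     with P and IDX loop invariant, the code below ends up with
     BASE = p, OFF = the SSA_NAME holding the (possibly sign-extended)
     value loaded from idx[i], and SCALE = sizeof (*p).  */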
3046  base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3047			      &pmode, &punsignedp, &pvolatilep, false);
3048  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3049
3050  if (TREE_CODE (base) == MEM_REF)
3051    {
3052      if (!integer_zerop (TREE_OPERAND (base, 1)))
3053	{
3054	  if (off == NULL_TREE)
3055	    {
3056	      offset_int moff = mem_ref_offset (base);
3057	      off = wide_int_to_tree (sizetype, moff);
3058	    }
3059	  else
3060	    off = size_binop (PLUS_EXPR, off,
3061			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3062	}
3063      base = TREE_OPERAND (base, 0);
3064    }
3065  else
3066    base = build_fold_addr_expr (base);
3067
3068  if (off == NULL_TREE)
3069    off = size_zero_node;
3070
3071  /* If BASE is not loop invariant, then if OFF is 0 we start with just the
3072     constant offset in the loop-invariant BASE and continue with BASE as
3073     OFF; otherwise give up.
3074     We could handle that case by gimplifying the addition of BASE + OFF
3075     into some SSA_NAME and using that as OFF, but for now punt.  */
3076  if (!expr_invariant_in_loop_p (loop, base))
3077    {
3078      if (!integer_zerop (off))
3079	return NULL_TREE;
3080      off = base;
3081      base = size_int (pbitpos / BITS_PER_UNIT);
3082    }
3083  /* Otherwise put base + constant offset into the loop invariant BASE
3084     and continue with OFF.  */
3085  else
3086    {
3087      base = fold_convert (sizetype, base);
3088      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3089    }
3090
3091  /* OFF at this point may be either a SSA_NAME or some tree expression
3092     from get_inner_reference.  Try to peel off loop invariants from it
3093     into BASE as long as possible.  */
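  /* E.g. (illustrative): if OFF is defined as (n + i) * 4 with N loop
     invariant and I the loop induction variable, the MULT_EXPR case
     below records SCALE = 4 and continues with OFF = n + i; the
     PLUS_EXPR case then folds N * 4 into BASE and leaves OFF = i,
     the loop-variant part.  */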
3094  STRIP_NOPS (off);
3095  while (offtype == NULL_TREE)
3096    {
3097      enum tree_code code;
3098      tree op0, op1, add = NULL_TREE;
3099
3100      if (TREE_CODE (off) == SSA_NAME)
3101	{
3102	  gimple def_stmt = SSA_NAME_DEF_STMT (off);
3103
3104	  if (expr_invariant_in_loop_p (loop, off))
3105	    return NULL_TREE;
3106
3107	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3108	    break;
3109
3110	  op0 = gimple_assign_rhs1 (def_stmt);
3111	  code = gimple_assign_rhs_code (def_stmt);
3112	  op1 = gimple_assign_rhs2 (def_stmt);
3113	}
3114      else
3115	{
3116	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3117	    return NULL_TREE;
3118	  code = TREE_CODE (off);
3119	  extract_ops_from_tree (off, &code, &op0, &op1);
3120	}
3121      switch (code)
3122	{
3123	case POINTER_PLUS_EXPR:
3124	case PLUS_EXPR:
3125	  if (expr_invariant_in_loop_p (loop, op0))
3126	    {
3127	      add = op0;
3128	      off = op1;
3129	    do_add:
3130	      add = fold_convert (sizetype, add);
3131	      if (scale != 1)
3132		add = size_binop (MULT_EXPR, add, size_int (scale));
3133	      base = size_binop (PLUS_EXPR, base, add);
3134	      continue;
3135	    }
3136	  if (expr_invariant_in_loop_p (loop, op1))
3137	    {
3138	      add = op1;
3139	      off = op0;
3140	      goto do_add;
3141	    }
3142	  break;
3143	case MINUS_EXPR:
3144	  if (expr_invariant_in_loop_p (loop, op1))
3145	    {
3146	      add = fold_convert (sizetype, op1);
3147	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3148	      off = op0;
3149	      goto do_add;
3150	    }
3151	  break;
3152	case MULT_EXPR:
3153	  if (scale == 1 && tree_fits_shwi_p (op1))
3154	    {
3155	      scale = tree_to_shwi (op1);
3156	      off = op0;
3157	      continue;
3158	    }
3159	  break;
3160	case SSA_NAME:
3161	  off = op0;
3162	  continue;
3163	CASE_CONVERT:
3164	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3165	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3166	    break;
3167	  if (TYPE_PRECISION (TREE_TYPE (op0))
3168	      == TYPE_PRECISION (TREE_TYPE (off)))
3169	    {
3170	      off = op0;
3171	      continue;
3172	    }
3173	  if (TYPE_PRECISION (TREE_TYPE (op0))
3174	      < TYPE_PRECISION (TREE_TYPE (off)))
3175	    {
3176	      off = op0;
3177	      offtype = TREE_TYPE (off);
3178	      STRIP_NOPS (off);
3179	      continue;
3180	    }
3181	  break;
3182	default:
3183	  break;
3184	}
3185      break;
3186    }
3187
3188  /* If at the end OFF still isn't an SSA_NAME or isn't
3189     defined in the loop, punt.  */
3190  if (TREE_CODE (off) != SSA_NAME
3191      || expr_invariant_in_loop_p (loop, off))
3192    return NULL_TREE;
3193
3194  if (offtype == NULL_TREE)
3195    offtype = TREE_TYPE (off);
3196
3197  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3198					   offtype, scale);
3199  if (decl == NULL_TREE)
3200    return NULL_TREE;
3201
3202  if (basep)
3203    *basep = base;
3204  if (offp)
3205    *offp = off;
3206  if (scalep)
3207    *scalep = scale;
3208  return decl;
3209}
3210
3211/* Function vect_analyze_data_refs.
3212
3213   Find all the data references in the loop or basic block.
3214
3215   The general structure of the analysis of data refs in the vectorizer is as
3216   follows:
3217   1- vect_analyze_data_refs(loop/bb): call
3218      compute_data_dependences_for_loop/bb to find and analyze all data-refs
3219      in the loop/bb and their dependences.
3220   2- vect_analyze_dependences(): apply dependence testing using ddrs.
3221   3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3222   4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3223
3224*/
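/* As a simple illustration, for a loop like

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   three data references are collected: the store to a[i] and the loads
   from b[i] and c[i]; each of them is later recorded in the
   stmt_vec_info of its statement via STMT_VINFO_DATA_REF.  */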
3225
3226bool
3227vect_analyze_data_refs (loop_vec_info loop_vinfo,
3228			bb_vec_info bb_vinfo,
3229			int *min_vf, unsigned *n_stmts)
3230{
3231  struct loop *loop = NULL;
3232  basic_block bb = NULL;
3233  unsigned int i;
3234  vec<data_reference_p> datarefs;
3235  struct data_reference *dr;
3236  tree scalar_type;
3237
3238  if (dump_enabled_p ())
3239    dump_printf_loc (MSG_NOTE, vect_location,
3240                     "=== vect_analyze_data_refs ===\n");
3241
3242  if (loop_vinfo)
3243    {
3244      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3245
3246      loop = LOOP_VINFO_LOOP (loop_vinfo);
3247      datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3248      if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3249	{
3250	  if (dump_enabled_p ())
3251	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3252	                     "not vectorized: loop contains function calls"
3253	                     " or data references that cannot be analyzed\n");
3254	  return false;
3255	}
3256
3257      for (i = 0; i < loop->num_nodes; i++)
3258	{
3259	  gimple_stmt_iterator gsi;
3260
3261	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3262	    {
3263	      gimple stmt = gsi_stmt (gsi);
3264	      if (is_gimple_debug (stmt))
3265		continue;
3266	      ++*n_stmts;
3267	      if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3268		{
3269		  if (is_gimple_call (stmt) && loop->safelen)
3270		    {
3271		      tree fndecl = gimple_call_fndecl (stmt), op;
3272		      if (fndecl != NULL_TREE)
3273			{
3274			  struct cgraph_node *node = cgraph_node::get (fndecl);
3275			  if (node != NULL && node->simd_clones != NULL)
3276			    {
3277			      unsigned int j, n = gimple_call_num_args (stmt);
3278			      for (j = 0; j < n; j++)
3279				{
3280				  op = gimple_call_arg (stmt, j);
3281				  if (DECL_P (op)
3282				      || (REFERENCE_CLASS_P (op)
3283					  && get_base_address (op)))
3284				    break;
3285				}
3286			      op = gimple_call_lhs (stmt);
3287			      /* Ignore #pragma omp declare simd functions
3288				 if they don't have data references in the
3289				 call stmt itself.  */
3290			      if (j == n
3291				  && !(op
3292				       && (DECL_P (op)
3293					   || (REFERENCE_CLASS_P (op)
3294					       && get_base_address (op)))))
3295				continue;
3296			    }
3297			}
3298		    }
3299		  LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3300		  if (dump_enabled_p ())
3301		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302				     "not vectorized: loop contains function "
3303				     "calls or data references that cannot "
3304				     "be analyzed\n");
3305		  return false;
3306		}
3307	    }
3308	}
3309
3310      LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3311    }
3312  else
3313    {
3314      gimple_stmt_iterator gsi;
3315
3316      bb = BB_VINFO_BB (bb_vinfo);
3317      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3318	{
3319	  gimple stmt = gsi_stmt (gsi);
3320	  if (is_gimple_debug (stmt))
3321	    continue;
3322	  ++*n_stmts;
3323	  if (!find_data_references_in_stmt (NULL, stmt,
3324					     &BB_VINFO_DATAREFS (bb_vinfo)))
3325	    {
3326	      /* Mark the rest of the basic-block as unvectorizable.  */
3327	      for (; !gsi_end_p (gsi); gsi_next (&gsi))
3328		{
3329		  stmt = gsi_stmt (gsi);
3330		  STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3331		}
3332	      break;
3333	    }
3334	}
3335
3336      datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3337    }
3338
3339  /* Go through the data-refs, check that the analysis succeeded.  Update
3340     pointer from stmt_vec_info struct to DR and vectype.  */
3341
3342  FOR_EACH_VEC_ELT (datarefs, i, dr)
3343    {
3344      gimple stmt;
3345      stmt_vec_info stmt_info;
3346      tree base, offset, init;
3347      bool gather = false;
3348      bool simd_lane_access = false;
3349      int vf;
3350
3351again:
3352      if (!dr || !DR_REF (dr))
3353        {
3354          if (dump_enabled_p ())
3355	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3356	                     "not vectorized: unhandled data-ref\n");
3357          return false;
3358        }
3359
3360      stmt = DR_STMT (dr);
3361      stmt_info = vinfo_for_stmt (stmt);
3362
3363      /* Discard clobbers from the dataref vector.  We will remove
3364         clobber stmts during vectorization.  */
3365      if (gimple_clobber_p (stmt))
3366	{
3367	  free_data_ref (dr);
3368	  if (i == datarefs.length () - 1)
3369	    {
3370	      datarefs.pop ();
3371	      break;
3372	    }
3373	  datarefs.ordered_remove (i);
3374	  dr = datarefs[i];
3375	  goto again;
3376	}
3377
3378      /* Check that analysis of the data-ref succeeded.  */
3379      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3380	  || !DR_STEP (dr))
3381        {
3382	  bool maybe_gather
3383	    = DR_IS_READ (dr)
3384	      && !TREE_THIS_VOLATILE (DR_REF (dr))
3385	      && targetm.vectorize.builtin_gather != NULL;
3386	  bool maybe_simd_lane_access
3387	    = loop_vinfo && loop->simduid;
3388
3389	  /* If the target supports vector gather loads, or if this might be
3390	     a SIMD lane access, see whether one of them can be used.  */
3391	  if (loop_vinfo
3392	      && (maybe_gather || maybe_simd_lane_access)
3393	      && !nested_in_vect_loop_p (loop, stmt))
3394	    {
3395	      struct data_reference *newdr
3396		= create_data_ref (NULL, loop_containing_stmt (stmt),
3397				   DR_REF (dr), stmt, true);
3398	      gcc_assert (newdr != NULL && DR_REF (newdr));
3399	      if (DR_BASE_ADDRESS (newdr)
3400		  && DR_OFFSET (newdr)
3401		  && DR_INIT (newdr)
3402		  && DR_STEP (newdr)
3403		  && integer_zerop (DR_STEP (newdr)))
3404		{
3405		  if (maybe_simd_lane_access)
3406		    {
3407		      tree off = DR_OFFSET (newdr);
3408		      STRIP_NOPS (off);
3409		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3410			  && TREE_CODE (off) == MULT_EXPR
3411			  && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3412			{
3413			  tree step = TREE_OPERAND (off, 1);
3414			  off = TREE_OPERAND (off, 0);
3415			  STRIP_NOPS (off);
3416			  if (CONVERT_EXPR_P (off)
3417			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3418									  0)))
3419				 < TYPE_PRECISION (TREE_TYPE (off)))
3420			    off = TREE_OPERAND (off, 0);
3421			  if (TREE_CODE (off) == SSA_NAME)
3422			    {
3423			      gimple def = SSA_NAME_DEF_STMT (off);
3424			      tree reft = TREE_TYPE (DR_REF (newdr));
3425			      if (is_gimple_call (def)
3426				  && gimple_call_internal_p (def)
3427				  && (gimple_call_internal_fn (def)
3428				      == IFN_GOMP_SIMD_LANE))
3429				{
3430				  tree arg = gimple_call_arg (def, 0);
3431				  gcc_assert (TREE_CODE (arg) == SSA_NAME);
3432				  arg = SSA_NAME_VAR (arg);
3433				  if (arg == loop->simduid
3434				      /* For now.  */
3435				      && tree_int_cst_equal
3436					   (TYPE_SIZE_UNIT (reft),
3437					    step))
3438				    {
3439				      DR_OFFSET (newdr) = ssize_int (0);
3440				      DR_STEP (newdr) = step;
3441				      DR_ALIGNED_TO (newdr)
3442					= size_int (BIGGEST_ALIGNMENT);
3443				      dr = newdr;
3444				      simd_lane_access = true;
3445				    }
3446				}
3447			    }
3448			}
3449		    }
3450		  if (!simd_lane_access && maybe_gather)
3451		    {
3452		      dr = newdr;
3453		      gather = true;
3454		    }
3455		}
3456	      if (!gather && !simd_lane_access)
3457		free_data_ref (newdr);
3458	    }
3459
3460	  if (!gather && !simd_lane_access)
3461	    {
3462	      if (dump_enabled_p ())
3463		{
3464		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465                                   "not vectorized: data ref analysis "
3466                                   "failed ");
3467		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3468                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3469		}
3470
3471	      if (bb_vinfo)
3472		break;
3473
3474	      return false;
3475	    }
3476        }
3477
3478      if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3479        {
3480          if (dump_enabled_p ())
3481            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3482                             "not vectorized: base addr of dr is a "
3483                             "constant\n");
3484
3485          if (bb_vinfo)
3486	    break;
3487
3488	  if (gather || simd_lane_access)
3489	    free_data_ref (dr);
3490	  return false;
3491        }
3492
3493      if (TREE_THIS_VOLATILE (DR_REF (dr)))
3494        {
3495          if (dump_enabled_p ())
3496            {
3497              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3498                               "not vectorized: volatile type ");
3499              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3501            }
3502
3503          if (bb_vinfo)
3504	    break;
3505
3506          return false;
3507        }
3508
3509      if (stmt_can_throw_internal (stmt))
3510        {
3511          if (dump_enabled_p ())
3512            {
3513              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3514                               "not vectorized: statement can throw an "
3515                               "exception ");
3516              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3517              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3518            }
3519
3520          if (bb_vinfo)
3521	    break;
3522
3523	  if (gather || simd_lane_access)
3524	    free_data_ref (dr);
3525          return false;
3526        }
3527
3528      if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3529	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3530	{
3531          if (dump_enabled_p ())
3532            {
3533              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3534                               "not vectorized: statement is bitfield "
3535                               "access ");
3536              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3537              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3538            }
3539
3540          if (bb_vinfo)
3541	    break;
3542
3543	  if (gather || simd_lane_access)
3544	    free_data_ref (dr);
3545          return false;
3546	}
3547
3548      base = unshare_expr (DR_BASE_ADDRESS (dr));
3549      offset = unshare_expr (DR_OFFSET (dr));
3550      init = unshare_expr (DR_INIT (dr));
3551
3552      if (is_gimple_call (stmt)
3553	  && (!gimple_call_internal_p (stmt)
3554	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3555		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3556	{
3557	  if (dump_enabled_p ())
3558	    {
3559	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3560	                       "not vectorized: dr in a call ");
3561	      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3562	      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3563	    }
3564
3565	  if (bb_vinfo)
3566	    break;
3567
3568	  if (gather || simd_lane_access)
3569	    free_data_ref (dr);
3570	  return false;
3571	}
3572
3573      /* Update DR field in stmt_vec_info struct.  */
3574
3575      /* If the dataref is in an inner-loop of the loop that is considered for
3576	 vectorization, we also want to analyze the access relative to
3577	 the outer-loop (DR contains information only relative to the
3578	 inner-most enclosing loop).  We do that by building a reference to the
3579	 first location accessed by the inner-loop, and analyze it relative to
3580	 the outer-loop.  */
3581      if (loop && nested_in_vect_loop_p (loop, stmt))
3582	{
3583	  tree outer_step, outer_base, outer_init;
3584	  HOST_WIDE_INT pbitsize, pbitpos;
3585	  tree poffset;
3586	  machine_mode pmode;
3587	  int punsignedp, pvolatilep;
3588	  affine_iv base_iv, offset_iv;
3589	  tree dinit;
3590
3591	  /* Build a reference to the first location accessed by the
3592	     inner-loop: *(BASE+INIT).  (The first location is actually
3593	     BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3594          tree inner_base = build_fold_indirect_ref
3595                                (fold_build_pointer_plus (base, init));
3596
3597	  if (dump_enabled_p ())
3598	    {
3599	      dump_printf_loc (MSG_NOTE, vect_location,
3600                               "analyze in outer-loop: ");
3601	      dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3602	      dump_printf (MSG_NOTE, "\n");
3603	    }
3604
3605	  outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3606		          &poffset, &pmode, &punsignedp, &pvolatilep, false);
3607	  gcc_assert (outer_base != NULL_TREE);
3608
3609	  if (pbitpos % BITS_PER_UNIT != 0)
3610	    {
3611	      if (dump_enabled_p ())
3612		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3613                                 "failed: bit offset alignment.\n");
3614	      return false;
3615	    }
3616
3617	  outer_base = build_fold_addr_expr (outer_base);
3618	  if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3619                          &base_iv, false))
3620	    {
3621	      if (dump_enabled_p ())
3622		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3623                                 "failed: evolution of base is not affine.\n");
3624	      return false;
3625	    }
3626
3627	  if (offset)
3628	    {
3629	      if (poffset)
3630		poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3631                                       poffset);
3632	      else
3633		poffset = offset;
3634	    }
3635
3636	  if (!poffset)
3637	    {
3638	      offset_iv.base = ssize_int (0);
3639	      offset_iv.step = ssize_int (0);
3640	    }
3641	  else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3642                               &offset_iv, false))
3643	    {
3644	      if (dump_enabled_p ())
3645	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646                                 "evolution of offset is not affine.\n");
3647	      return false;
3648	    }
3649
3650	  outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3651	  split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3652	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3653	  split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3654	  outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3655
3656	  outer_step = size_binop (PLUS_EXPR,
3657				fold_convert (ssizetype, base_iv.step),
3658				fold_convert (ssizetype, offset_iv.step));
3659
3660	  STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3661	  /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3662	  STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3663	  STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3664	  STMT_VINFO_DR_OFFSET (stmt_info) =
3665				fold_convert (ssizetype, offset_iv.base);
3666	  STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3667				size_int (highest_pow2_factor (offset_iv.base));
3668
3669          if (dump_enabled_p ())
3670	    {
3671	      dump_printf_loc (MSG_NOTE, vect_location,
3672                               "\touter base_address: ");
3673	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3674                                 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3675	      dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3676	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3677                                 STMT_VINFO_DR_OFFSET (stmt_info));
3678	      dump_printf (MSG_NOTE,
3679                           "\n\touter constant offset from base address: ");
3680	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3681                                 STMT_VINFO_DR_INIT (stmt_info));
3682	      dump_printf (MSG_NOTE, "\n\touter step: ");
3683	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3684                                 STMT_VINFO_DR_STEP (stmt_info));
3685	      dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3686	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3687                                 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3688	      dump_printf (MSG_NOTE, "\n");
3689	    }
3690	}
3691
3692      if (STMT_VINFO_DATA_REF (stmt_info))
3693        {
3694          if (dump_enabled_p ())
3695            {
3696              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697                               "not vectorized: more than one data ref "
3698                               "in stmt: ");
3699              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3700              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3701            }
3702
3703          if (bb_vinfo)
3704	    break;
3705
3706	  if (gather || simd_lane_access)
3707	    free_data_ref (dr);
3708          return false;
3709        }
3710
3711      STMT_VINFO_DATA_REF (stmt_info) = dr;
3712      if (simd_lane_access)
3713	{
3714	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3715	  free_data_ref (datarefs[i]);
3716	  datarefs[i] = dr;
3717	}
3718
3719      /* Set vectype for STMT.  */
3720      scalar_type = TREE_TYPE (DR_REF (dr));
3721      STMT_VINFO_VECTYPE (stmt_info)
3722	= get_vectype_for_scalar_type (scalar_type);
3723      if (!STMT_VINFO_VECTYPE (stmt_info))
3724        {
3725          if (dump_enabled_p ())
3726            {
3727              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3728                               "not vectorized: no vectype for stmt: ");
3729              dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3730              dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3731              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3732                                 scalar_type);
3733              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3734            }
3735
3736          if (bb_vinfo)
3737	    break;
3738
3739	  if (gather || simd_lane_access)
3740	    {
3741	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3742	      if (gather)
3743		free_data_ref (dr);
3744	    }
3745	  return false;
3746        }
3747      else
3748	{
3749	  if (dump_enabled_p ())
3750	    {
3751	      dump_printf_loc (MSG_NOTE, vect_location,
3752			       "got vectype for stmt: ");
3753	      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3754	      dump_generic_expr (MSG_NOTE, TDF_SLIM,
3755				 STMT_VINFO_VECTYPE (stmt_info));
3756	      dump_printf (MSG_NOTE, "\n");
3757	    }
3758	}
3759
3760      /* Adjust the minimal vectorization factor according to the
3761	 vector type.  */
3762      vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3763      if (vf > *min_vf)
3764	*min_vf = vf;
3765
3766      if (gather)
3767	{
3768	  tree off;
3769
3770	  gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3771	  if (gather
3772	      && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3773	    gather = false;
3774	  if (!gather)
3775	    {
3776	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
3777	      free_data_ref (dr);
3778	      if (dump_enabled_p ())
3779		{
3780		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3781                                   "not vectorized: not suitable for gather "
3782                                   "load ");
3783		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3784                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3785		}
3786	      return false;
3787	    }
3788
3789	  datarefs[i] = dr;
3790	  STMT_VINFO_GATHER_P (stmt_info) = true;
3791	}
3792      else if (loop_vinfo
3793	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3794	{
3795	  if (nested_in_vect_loop_p (loop, stmt)
3796	      || !DR_IS_READ (dr))
3797	    {
3798	      if (dump_enabled_p ())
3799		{
3800		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3801                                   "not vectorized: not suitable for strided "
3802                                   "load ");
3803		  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3804                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3805		}
3806	      return false;
3807	    }
3808	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3809	}
3810    }
3811
3812  /* If we stopped analysis at the first dataref we could not analyze
3813     when trying to vectorize a basic-block, mark the rest of the datarefs
3814     as not vectorizable and truncate the vector of datarefs.  That
3815     avoids wasting time analyzing their dependences.  */
3816  if (i != datarefs.length ())
3817    {
3818      gcc_assert (bb_vinfo != NULL);
3819      for (unsigned j = i; j < datarefs.length (); ++j)
3820	{
3821	  data_reference_p dr = datarefs[j];
3822          STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3823	  free_data_ref (dr);
3824	}
3825      datarefs.truncate (i);
3826    }
3827
3828  return true;
3829}
3830
3831
3832/* Function vect_get_new_vect_var.
3833
3834   Returns a new temporary variable.  The current naming scheme uses the
3835   prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND),
3836   followed by "_" and NAME when NAME is
3837   provided.  */
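/* For instance (illustrative only):

     tree p = vect_get_new_vect_var (ptr_type, vect_pointer_var, "in");

   returns a temporary register of type PTR_TYPE whose name starts
   with "vectp_in".  */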
3838
3839tree
3840vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3841{
3842  const char *prefix;
3843  tree new_vect_var;
3844
3845  switch (var_kind)
3846  {
3847  case vect_simple_var:
3848    prefix = "vect";
3849    break;
3850  case vect_scalar_var:
3851    prefix = "stmp";
3852    break;
3853  case vect_pointer_var:
3854    prefix = "vectp";
3855    break;
3856  default:
3857    gcc_unreachable ();
3858  }
3859
3860  if (name)
3861    {
3862      char* tmp = concat (prefix, "_", name, NULL);
3863      new_vect_var = create_tmp_reg (type, tmp);
3864      free (tmp);
3865    }
3866  else
3867    new_vect_var = create_tmp_reg (type, prefix);
3868
3869  return new_vect_var;
3870}
3871
3872/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
3873
3874static void
3875vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3876				  stmt_vec_info stmt_info)
3877{
3878  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3879  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3880  int misalign = DR_MISALIGNMENT (dr);
3881  if (misalign == -1)
3882    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3883  else
3884    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3885}
3886
3887/* Function vect_create_addr_base_for_vector_ref.
3888
3889   Create an expression that computes the address of the first memory location
3890   that will be accessed for a data reference.
3891
3892   Input:
3893   STMT: The statement containing the data reference.
3894   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3895   OFFSET: Optional.  If supplied, it is added to the initial address.
3896   LOOP:    Specify relative to which loop-nest should the address be computed.
3897            For example, when the dataref is in an inner-loop nested in an
3898	    outer-loop that is now being vectorized, LOOP can be either the
3899	    outer-loop, or the inner-loop.  The first memory location accessed
3900	    by the following dataref ('in' points to short):
3901
3902		for (i=0; i<N; i++)
3903		   for (j=0; j<M; j++)
3904		     s += in[i+j]
3905
3906	    is as follows:
3907	    if LOOP=i_loop:	&in		(relative to i_loop)
3908	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
3909   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3910	    initial address.  Unlike OFFSET, which is the number of elements to
3911	    be added, BYTE_OFFSET is measured in bytes.
3912
3913   Output:
3914   1. Return an SSA_NAME whose value is the address of the memory location of
3915      the first vector of the data reference.
3916   2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3917      these statement(s) which define the returned SSA_NAME.
3918
3919   FORNOW: We are only handling array accesses with step 1.  */
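/* As an illustrative sketch for the j_loop example above: with
   OFFSET = 4 the returned SSA_NAME holds the gimplified value of
   (vectype *) (&in + i*2B + 4*2B), i.e. OFFSET is first scaled by the
   element size (2 bytes for short) before being added.  */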
3920
3921tree
3922vect_create_addr_base_for_vector_ref (gimple stmt,
3923				      gimple_seq *new_stmt_list,
3924				      tree offset,
3925				      struct loop *loop,
3926				      tree byte_offset)
3927{
3928  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3929  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3930  tree data_ref_base;
3931  const char *base_name;
3932  tree addr_base;
3933  tree dest;
3934  gimple_seq seq = NULL;
3935  tree base_offset;
3936  tree init;
3937  tree vect_ptr_type;
3938  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3939  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3940
3941  if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3942    {
3943      struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3944
3945      gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3946
3947      data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3948      base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3949      init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3950    }
3951  else
3952    {
3953      data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3954      base_offset = unshare_expr (DR_OFFSET (dr));
3955      init = unshare_expr (DR_INIT (dr));
3956    }
3957
3958  if (loop_vinfo)
3959    base_name = get_name (data_ref_base);
3960  else
3961    {
3962      base_offset = ssize_int (0);
3963      init = ssize_int (0);
3964      base_name = get_name (DR_REF (dr));
3965    }
3966
3967  /* Create base_offset */
3968  base_offset = size_binop (PLUS_EXPR,
3969			    fold_convert (sizetype, base_offset),
3970			    fold_convert (sizetype, init));
3971
3972  if (offset)
3973    {
3974      offset = fold_build2 (MULT_EXPR, sizetype,
3975			    fold_convert (sizetype, offset), step);
3976      base_offset = fold_build2 (PLUS_EXPR, sizetype,
3977				 base_offset, offset);
3978    }
3979  if (byte_offset)
3980    {
3981      byte_offset = fold_convert (sizetype, byte_offset);
3982      base_offset = fold_build2 (PLUS_EXPR, sizetype,
3983				 base_offset, byte_offset);
3984    }
3985
3986  /* base + base_offset */
3987  if (loop_vinfo)
3988    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3989  else
3990    {
3991      addr_base = build1 (ADDR_EXPR,
3992			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
3993			  unshare_expr (DR_REF (dr)));
3994    }
3995
3996  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3997  addr_base = fold_convert (vect_ptr_type, addr_base);
3998  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3999  addr_base = force_gimple_operand (addr_base, &seq, false, dest);
4000  gimple_seq_add_seq (new_stmt_list, seq);
4001
4002  if (DR_PTR_INFO (dr)
4003      && TREE_CODE (addr_base) == SSA_NAME)
4004    {
4005      vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4006      if (offset || byte_offset)
4007	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4008    }
4009
4010  if (dump_enabled_p ())
4011    {
4012      dump_printf_loc (MSG_NOTE, vect_location, "created ");
4013      dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4014      dump_printf (MSG_NOTE, "\n");
4015    }
4016
4017  return addr_base;
4018}
4019
4020
4021/* Function vect_create_data_ref_ptr.
4022
4023   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4024   location accessed in the loop by STMT, along with the def-use update
4025   chain to appropriately advance the pointer through the loop iterations.
4026   Also set aliasing information for the pointer.  This pointer is used by
4027   the callers to this function to create a memory reference expression for
4028   vector load/store access.
4029
4030   Input:
4031   1. STMT: a stmt that references memory. Expected to be of the form
4032         GIMPLE_ASSIGN <name, data-ref> or
4033	 GIMPLE_ASSIGN <data-ref, name>.
4034   2. AGGR_TYPE: the type of the reference, which should be either a vector
4035        or an array.
4036   3. AT_LOOP: the loop where the vector memref is to be created.
4037   4. OFFSET (optional): an offset to be added to the initial address accessed
4038        by the data-ref in STMT.
4039   5. BSI: location where the new stmts are to be placed if there is no loop
4040   6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4041        pointing to the initial address.
4042   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4043	to the initial address accessed by the data-ref in STMT.  This is
4044	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4045	in bytes.
4046
4047   Output:
4048   1. Declare a new ptr to vector_type, and have it point to the base of the
4049      data reference (the initial address accessed by the data reference).
4050      For example, for vector of type V8HI, the following code is generated:
4051
4052      v8hi *ap;
4053      ap = (v8hi *)initial_address;
4054
4055      if OFFSET is not supplied:
4056         initial_address = &a[init];
4057      if OFFSET is supplied:
4058         initial_address = &a[init + OFFSET];
4059      if BYTE_OFFSET is supplied:
4060	 initial_address = &a[init] + BYTE_OFFSET;
4061
4062      Return the initial_address in INITIAL_ADDRESS.
4063
4064   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4065      update the pointer in each iteration of the loop.
4066
4067      Return the increment stmt that updates the pointer in PTR_INCR.
4068
4069   3. Set INV_P to true if the access pattern of the data reference in the
4070      vectorized loop is invariant.  Set it to false otherwise.
4071
4072   4. Return the pointer.  */
4073
4074tree
4075vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4076			  tree offset, tree *initial_address,
4077			  gimple_stmt_iterator *gsi, gimple *ptr_incr,
4078			  bool only_init, bool *inv_p, tree byte_offset)
4079{
4080  const char *base_name;
4081  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4082  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4083  struct loop *loop = NULL;
4084  bool nested_in_vect_loop = false;
4085  struct loop *containing_loop = NULL;
4086  tree aggr_ptr_type;
4087  tree aggr_ptr;
4088  tree new_temp;
4089  gimple vec_stmt;
4090  gimple_seq new_stmt_list = NULL;
4091  edge pe = NULL;
4092  basic_block new_bb;
4093  tree aggr_ptr_init;
4094  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4095  tree aptr;
4096  gimple_stmt_iterator incr_gsi;
4097  bool insert_after;
4098  tree indx_before_incr, indx_after_incr;
4099  gimple incr;
4100  tree step;
4101  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4102
4103  gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4104	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4105
4106  if (loop_vinfo)
4107    {
4108      loop = LOOP_VINFO_LOOP (loop_vinfo);
4109      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4110      containing_loop = (gimple_bb (stmt))->loop_father;
4111      pe = loop_preheader_edge (loop);
4112    }
4113  else
4114    {
4115      gcc_assert (bb_vinfo);
4116      only_init = true;
4117      *ptr_incr = NULL;
4118    }
4119
4120  /* Check the step (evolution) of the load in LOOP, and record
4121     whether it's invariant.  */
4122  if (nested_in_vect_loop)
4123    step = STMT_VINFO_DR_STEP (stmt_info);
4124  else
4125    step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4126
4127  if (integer_zerop (step))
4128    *inv_p = true;
4129  else
4130    *inv_p = false;
4131
4132  /* Create an expression for the first address accessed by this load
4133     in LOOP.  */
4134  base_name = get_name (DR_BASE_ADDRESS (dr));
4135
4136  if (dump_enabled_p ())
4137    {
4138      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4139      dump_printf_loc (MSG_NOTE, vect_location,
4140                       "create %s-pointer variable to type: ",
4141		       get_tree_code_name (TREE_CODE (aggr_type)));
4142      dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4143      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4144        dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4145      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4146        dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4147      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4148        dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4149      else
4150        dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4151      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4152      dump_printf (MSG_NOTE, "\n");
4153    }
4154
4155  /* (1) Create the new aggregate-pointer variable.
4156     Vector and array types inherit the alias set of their component
4157     type by default so we need to use a ref-all pointer if the data
4158     reference does not conflict with the created aggregated data
4159     reference because it is not addressable.  */
4160  bool need_ref_all = false;
4161  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4162			      get_alias_set (DR_REF (dr))))
4163    need_ref_all = true;
4164  /* Likewise for any of the data references in the stmt group.  */
4165  else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4166    {
4167      gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4168      do
4169	{
4170	  stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4171	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4172	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4173				      get_alias_set (DR_REF (sdr))))
4174	    {
4175	      need_ref_all = true;
4176	      break;
4177	    }
4178	  orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4179	}
4180      while (orig_stmt);
4181    }
4182  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4183					       need_ref_all);
4184  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4185
4186
4187  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4188     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4189     def-use update cycles for the pointer: one relative to the outer-loop
4190     (LOOP), which is what steps (2) and (3) below do.  The other is relative
4191     to the inner-loop (which is the inner-most loop containing the dataref),
4192     and this is done by step (4) below.
4193
4194     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4195     inner-most loop, and so steps (2),(3) work the same, and step (4) is
4196     redundant.  Steps (2),(3) create the following:
4197
4198	vp0 = &base_addr;
4199	LOOP:	vp1 = phi(vp0,vp2)
4200		...
4201		...
4202		vp2 = vp1 + step
4203		goto LOOP
4204
4205     If there is an inner-loop nested in loop, then step (4) will also be
4206     applied, and an additional update in the inner-loop will be created:
4207
4208	vp0 = &base_addr;
4209	LOOP:   vp1 = phi(vp0,vp2)
4210		...
4211        inner:     vp3 = phi(vp1,vp4)
4212	           vp4 = vp3 + inner_step
4213	           if () goto inner
4214		...
4215		vp2 = vp1 + step
4216		if () goto LOOP   */
4217
4218  /* (2) Calculate the initial address of the aggregate-pointer, and set
4219     the aggregate-pointer to point to it before the loop.  */
4220
4221  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4222
4223  new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4224						   offset, loop, byte_offset);
4225  if (new_stmt_list)
4226    {
4227      if (pe)
4228        {
4229          new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4230          gcc_assert (!new_bb);
4231        }
4232      else
4233        gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4234    }
4235
4236  *initial_address = new_temp;
4237
4238  /* Create: p = (aggr_type *) initial_base  */
4239  if (TREE_CODE (new_temp) != SSA_NAME
4240      || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4241    {
4242      vec_stmt = gimple_build_assign (aggr_ptr,
4243				      fold_convert (aggr_ptr_type, new_temp));
4244      aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4245      /* Copy the points-to information if it exists. */
4246      if (DR_PTR_INFO (dr))
4247	vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4248      gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4249      if (pe)
4250	{
4251	  new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4252	  gcc_assert (!new_bb);
4253	}
4254      else
4255	gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4256    }
4257  else
4258    aggr_ptr_init = new_temp;
4259
4260  /* (3) Handle the updating of the aggregate-pointer inside the loop.
4261     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4262     inner-loop nested in LOOP (during outer-loop vectorization).  */
4263
4264  /* No update in loop is required.  */
4265  if (only_init && (!loop_vinfo || at_loop == loop))
4266    aptr = aggr_ptr_init;
4267  else
4268    {
4269      /* The step of the aggregate pointer is the type size.  */
4270      tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4271      /* One exception to the above is when the scalar step of the load in
4272	 LOOP is zero. In this case the step here is also zero.  */
4273      if (*inv_p)
4274	iv_step = size_zero_node;
4275      else if (tree_int_cst_sgn (step) == -1)
4276	iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4277
4278      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4279
4280      create_iv (aggr_ptr_init,
4281		 fold_convert (aggr_ptr_type, iv_step),
4282		 aggr_ptr, loop, &incr_gsi, insert_after,
4283		 &indx_before_incr, &indx_after_incr);
4284      incr = gsi_stmt (incr_gsi);
4285      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4286
4287      /* Copy the points-to information if it exists. */
4288      if (DR_PTR_INFO (dr))
4289	{
4290	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4291	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4292	}
4293      if (ptr_incr)
4294	*ptr_incr = incr;
4295
4296      aptr = indx_before_incr;
4297    }
4298
4299  if (!nested_in_vect_loop || only_init)
4300    return aptr;
4301
4302
4303  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4304     nested in LOOP, if it exists.  */
4305
4306  gcc_assert (nested_in_vect_loop);
4307  if (!only_init)
4308    {
4309      standard_iv_increment_position (containing_loop, &incr_gsi,
4310				      &insert_after);
4311      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4312		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4313		 &indx_after_incr);
4314      incr = gsi_stmt (incr_gsi);
4315      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4316
4317      /* Copy the points-to information if it exists. */
4318      if (DR_PTR_INFO (dr))
4319	{
4320	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4321	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4322	}
4323      if (ptr_incr)
4324	*ptr_incr = incr;
4325
4326      return indx_before_incr;
4327    }
4328  else
4329    gcc_unreachable ();
4330}
4331
4332
4333/* Function bump_vector_ptr
4334
4335   Increment a pointer (to a vector type) by vector-size.  If requested,
4336   i.e. if PTR_INCR is given, then also connect the new increment stmt
4337   to the existing def-use update-chain of the pointer, by modifying
4338   the PTR_INCR as illustrated below:
4339
4340   The pointer def-use update-chain before this function:
4341                        DATAREF_PTR = phi (p_0, p_2)
4342                        ....
4343        PTR_INCR:       p_2 = DATAREF_PTR + step
4344
4345   The pointer def-use update-chain after this function:
4346                        DATAREF_PTR = phi (p_0, p_2)
4347                        ....
4348                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4349                        ....
4350        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4351
4352   Input:
4353   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4354                 in the loop.
4355   PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4356	      the loop.  The increment amount across iterations is expected
4357	      to be vector_size.
4358   BSI - location where the new update stmt is to be placed.
4359   STMT - the original scalar memory-access stmt that is being vectorized.
4360   BUMP - optional. The offset by which to bump the pointer. If not given,
4361	  the offset is assumed to be vector_size.
4362
4363   Output: Return NEW_DATAREF_PTR as illustrated above.
4364
4365*/
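/* For example (illustrative): after a load through DATAREF_PTR of a
   V16QI vector, calling

     bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, NULL_TREE);

   emits NEW_DATAREF_PTR = DATAREF_PTR + 16, since the default BUMP is
   the vector size in bytes.  */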
4366
4367tree
4368bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4369		 gimple stmt, tree bump)
4370{
4371  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4372  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4373  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4374  tree update = TYPE_SIZE_UNIT (vectype);
4375  gassign *incr_stmt;
4376  ssa_op_iter iter;
4377  use_operand_p use_p;
4378  tree new_dataref_ptr;
4379
4380  if (bump)
4381    update = bump;
4382
4383  new_dataref_ptr = copy_ssa_name (dataref_ptr);
4384  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4385				   dataref_ptr, update);
4386  vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4387
4388  /* Copy the points-to information if it exists. */
4389  if (DR_PTR_INFO (dr))
4390    {
4391      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4392      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4393    }
4394
4395  if (!ptr_incr)
4396    return new_dataref_ptr;
4397
4398  /* Update the vector-pointer's cross-iteration increment.  */
4399  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4400    {
4401      tree use = USE_FROM_PTR (use_p);
4402
4403      if (use == dataref_ptr)
4404        SET_USE (use_p, new_dataref_ptr);
4405      else
4406        gcc_assert (tree_int_cst_compare (use, update) == 0);
4407    }
4408
4409  return new_dataref_ptr;
4410}
4411
4412
4413/* Function vect_create_destination_var.
4414
4415   Create a new temporary of type VECTYPE.  */
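/* E.g. (illustrative): for a scalar destination x_7 and a V4SI
   VECTYPE, the returned temporary is named along the lines of
   "vect_x_7"; with a NULL VECTYPE the "stmp" prefix and the scalar
   type of SCALAR_DEST are used instead.  */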
4416
4417tree
4418vect_create_destination_var (tree scalar_dest, tree vectype)
4419{
4420  tree vec_dest;
4421  const char *name;
4422  char *new_name;
4423  tree type;
4424  enum vect_var_kind kind;
4425
4426  kind = vectype ? vect_simple_var : vect_scalar_var;
4427  type = vectype ? vectype : TREE_TYPE (scalar_dest);
4428
4429  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4430
4431  name = get_name (scalar_dest);
4432  if (name)
4433    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4434  else
4435    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4436  vec_dest = vect_get_new_vect_var (type, kind, new_name);
4437  free (new_name);
4438
4439  return vec_dest;
4440}
4441
4442/* Function vect_grouped_store_supported.
4443
4444   Returns TRUE if interleave high and interleave low permutations
4445   are supported, and FALSE otherwise.  */
4446
4447bool
4448vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4449{
4450  machine_mode mode = TYPE_MODE (vectype);
4451
4452  /* vect_permute_store_chain requires the group size to be equal to 3 or
4453     be a power of two.  */
4454  if (count != 3 && exact_log2 (count) == -1)
4455    {
4456      if (dump_enabled_p ())
4457	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4458			 "the size of the group of accesses"
4459			 " is not a power of 2 nor equal to 3\n");
4460      return false;
4461    }
4462
4463  /* Check that the permutation is supported.  */
4464  if (VECTOR_MODE_P (mode))
4465    {
4466      unsigned int i, nelt = GET_MODE_NUNITS (mode);
4467      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4468
4469      if (count == 3)
4470	{
4471	  unsigned int j0 = 0, j1 = 0, j2 = 0;
4472	  unsigned int i, j;
4473
4474	  for (j = 0; j < 3; j++)
4475	    {
4476	      int nelt0 = ((3 - j) * nelt) % 3;
4477	      int nelt1 = ((3 - j) * nelt + 1) % 3;
4478	      int nelt2 = ((3 - j) * nelt + 2) % 3;
4479	      for (i = 0; i < nelt; i++)
4480		{
4481		  if (3 * i + nelt0 < nelt)
4482		    sel[3 * i + nelt0] = j0++;
4483		  if (3 * i + nelt1 < nelt)
4484		    sel[3 * i + nelt1] = nelt + j1++;
4485		  if (3 * i + nelt2 < nelt)
4486		    sel[3 * i + nelt2] = 0;
4487		}
4488	      if (!can_vec_perm_p (mode, false, sel))
4489		{
4490		  if (dump_enabled_p ())
4491		    dump_printf (MSG_MISSED_OPTIMIZATION,
4492				 "permutation op not supported by target.\n");
4493		  return false;
4494		}
4495
4496	      for (i = 0; i < nelt; i++)
4497		{
4498		  if (3 * i + nelt0 < nelt)
4499		    sel[3 * i + nelt0] = 3 * i + nelt0;
4500		  if (3 * i + nelt1 < nelt)
4501		    sel[3 * i + nelt1] = 3 * i + nelt1;
4502		  if (3 * i + nelt2 < nelt)
4503		    sel[3 * i + nelt2] = nelt + j2++;
4504		}
4505	      if (!can_vec_perm_p (mode, false, sel))
4506		{
4507		  if (dump_enabled_p ())
4508		    dump_printf (MSG_MISSED_OPTIMIZATION,
4509				 "permutation op not supported by target.\n");
4510		  return false;
4511		}
4512	    }
4513	  return true;
4514	}
4515      else
4516	{
4517	  /* If length is not equal to 3 then only a power of 2 is supported.  */
4518	  gcc_assert (exact_log2 (count) != -1);
4519
4520	  for (i = 0; i < nelt / 2; i++)
4521	    {
4522	      sel[i * 2] = i;
4523	      sel[i * 2 + 1] = i + nelt;
4524	    }
4525	    if (can_vec_perm_p (mode, false, sel))
4526	      {
4527		for (i = 0; i < nelt; i++)
4528		  sel[i] += nelt / 2;
4529		if (can_vec_perm_p (mode, false, sel))
4530		  return true;
4531	      }
4532	}
4533    }
4534
4535  if (dump_enabled_p ())
4536    dump_printf (MSG_MISSED_OPTIMIZATION,
4537		 "permutation op not supported by target.\n");
4538  return false;
4539}
4540
4541
4542/* Return TRUE if vec_store_lanes is available for COUNT vectors of
4543   type VECTYPE.  */
4544
4545bool
4546vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4547{
4548  return vect_lanes_optab_supported_p ("vec_store_lanes",
4549				       vec_store_lanes_optab,
4550				       vectype, count);
4551}
4552
4553
4554/* Function vect_permute_store_chain.
4555
4556   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4557   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4558   the data correctly for the stores.  Return the final references for stores
4559   in RESULT_CHAIN.
4560
4561   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4562   The input is 4 vectors each containing 8 elements.  We assign a number to
4563   each element, the input sequence is:
4564
4565   1st vec:   0  1  2  3  4  5  6  7
4566   2nd vec:   8  9 10 11 12 13 14 15
4567   3rd vec:  16 17 18 19 20 21 22 23
4568   4th vec:  24 25 26 27 28 29 30 31
4569
4570   The output sequence should be:
4571
4572   1st vec:  0  8 16 24  1  9 17 25
4573   2nd vec:  2 10 18 26  3 11 19 27
4574   3rd vec:  4 12 20 28  5 13 21 29
4575   4th vec:  6 14 22 30  7 15 23 31
4576
4577   i.e., we interleave the contents of the four vectors in their order.
4578
4579   We use interleave_high/low instructions to create such output.  The input of
4580   each interleave_high/low operation is two vectors:
4581   1st vec    2nd vec
4582   0 1 2 3    4 5 6 7
4583   the even elements of the result vector are obtained left-to-right from the
4584   high/low elements of the first vector.  The odd elements of the result are
4585   obtained left-to-right from the high/low elements of the second vector.
4586   The output of interleave_high will be:   0 4 1 5
4587   and of interleave_low:                   2 6 3 7
4588
4589
4590   The permutation is done in log LENGTH stages.  In each stage interleave_high
4591   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4592   where the first argument is taken from the first half of DR_CHAIN and the
4593   second argument from its second half.
4594   In our example,
4595
4596   I1: interleave_high (1st vec, 3rd vec)
4597   I2: interleave_low (1st vec, 3rd vec)
4598   I3: interleave_high (2nd vec, 4th vec)
4599   I4: interleave_low (2nd vec, 4th vec)
4600
4601   The output for the first stage is:
4602
4603   I1:  0 16  1 17  2 18  3 19
4604   I2:  4 20  5 21  6 22  7 23
4605   I3:  8 24  9 25 10 26 11 27
4606   I4: 12 28 13 29 14 30 15 31
4607
4608   The output of the second stage, i.e. the final result is:
4609
4610   I1:  0  8 16 24  1  9 17 25
4611   I2:  2 10 18 26  3 11 19 27
4612   I3:  4 12 20 28  5 13 21 29
4613   I4:  6 14 22 30  7 15 23 31.  */
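/* For instance (illustrative), for V8HI (nelt = 8) the two first-stage
   masks built below are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the usual interleave-high/interleave-low selectors.  */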
4614
4615void
4616vect_permute_store_chain (vec<tree> dr_chain,
4617			  unsigned int length,
4618			  gimple stmt,
4619			  gimple_stmt_iterator *gsi,
4620			  vec<tree> *result_chain)
4621{
4622  tree vect1, vect2, high, low;
4623  gimple perm_stmt;
4624  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4625  tree perm_mask_low, perm_mask_high;
4626  tree data_ref;
4627  tree perm3_mask_low, perm3_mask_high;
4628  unsigned int i, n, log_length = exact_log2 (length);
4629  unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4630  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4631
4632  result_chain->quick_grow (length);
4633  memcpy (result_chain->address (), dr_chain.address (),
4634	  length * sizeof (tree));
4635
4636  if (length == 3)
4637    {
4638      unsigned int j0 = 0, j1 = 0, j2 = 0;
4639
4640      for (j = 0; j < 3; j++)
4641        {
4642	  int nelt0 = ((3 - j) * nelt) % 3;
4643	  int nelt1 = ((3 - j) * nelt + 1) % 3;
4644	  int nelt2 = ((3 - j) * nelt + 2) % 3;
4645
4646	  for (i = 0; i < nelt; i++)
4647	    {
4648	      if (3 * i + nelt0 < nelt)
4649		sel[3 * i + nelt0] = j0++;
4650	      if (3 * i + nelt1 < nelt)
4651		sel[3 * i + nelt1] = nelt + j1++;
4652	      if (3 * i + nelt2 < nelt)
4653		sel[3 * i + nelt2] = 0;
4654	    }
4655	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4656
4657	  for (i = 0; i < nelt; i++)
4658	    {
4659	      if (3 * i + nelt0 < nelt)
4660		sel[3 * i + nelt0] = 3 * i + nelt0;
4661	      if (3 * i + nelt1 < nelt)
4662		sel[3 * i + nelt1] = 3 * i + nelt1;
4663	      if (3 * i + nelt2 < nelt)
4664		sel[3 * i + nelt2] = nelt + j2++;
4665	    }
4666	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
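          /* For example, with nelt == 8 and j == 0, NELT0/1/2 are 0/1/2,
             PERM3_MASK_LOW is {0, 8, 0, 1, 9, 0, 2, 10} and PERM3_MASK_HIGH
             is {0, 1, 8, 3, 4, 9, 6, 7}, so the two permutations below turn
             input vectors A, B and C into {a0, b0, c0, a1, b1, c1, a2, b2}.  */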
4667
4668	  vect1 = dr_chain[0];
4669	  vect2 = dr_chain[1];
4670
4671	  /* Create interleaving stmt:
4672	     low = VEC_PERM_EXPR <vect1, vect2,
4673				  {j, nelt, *, j + 1, nelt + j + 1, *,
4674				   j + 2, nelt + j + 2, *, ...}>  */
4675	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4676	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4677					   vect2, perm3_mask_low);
4678	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4679
4680	  vect1 = data_ref;
4681	  vect2 = dr_chain[2];
4682	  /* Create interleaving stmt:
4683	     low = VEC_PERM_EXPR <vect1, vect2,
4684				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
4685				   6, 7, nelt + j + 2, ...}>  */
4686	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4687	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4688					   vect2, perm3_mask_high);
4689	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4690	  (*result_chain)[j] = data_ref;
4691	}
4692    }
4693  else
4694    {
      /* If length is not equal to 3 then only a power of 2 is supported.  */
4696      gcc_assert (exact_log2 (length) != -1);
4697
4698      for (i = 0, n = nelt / 2; i < n; i++)
4699	{
4700	  sel[i * 2] = i;
4701	  sel[i * 2 + 1] = i + nelt;
4702	}
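      /* For example, with nelt == 8 the selector built above is the
         interleave-high mask {0, 8, 1, 9, 2, 10, 3, 11}; adding nelt/2 to
         each entry below yields the interleave-low mask
         {4, 12, 5, 13, 6, 14, 7, 15}.  */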
      perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0; i < nelt; i++)
        sel[i] += nelt / 2;
      perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);

      for (i = 0, n = log_length; i < n; i++)
        {
          for (j = 0; j < length/2; j++)
            {
              vect1 = dr_chain[j];
              vect2 = dr_chain[j+length/2];

              /* Create interleaving stmt:
                 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
                                                      ...}>  */
              high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
              perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
                                               vect2, perm_mask_high);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[2*j] = high;

              /* Create interleaving stmt:
                 low = VEC_PERM_EXPR <vect1, vect2,
                                      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
                                       ...}>  */
              low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
              perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
                                               vect2, perm_mask_low);
              vect_finish_stmt_generation (stmt, perm_stmt, gsi);
              (*result_chain)[2*j+1] = low;
            }
          memcpy (dr_chain.address (), result_chain->address (),
                  length * sizeof (tree));
        }
4738    }
4739}
4740
4741/* Function vect_setup_realignment
4742
4743   This function is called when vectorizing an unaligned load using
4744   the dr_explicit_realign[_optimized] scheme.
4745   This function generates the following code at the loop prolog:
4746
4747      p = initial_addr;
4748   x  msq_init = *(floor(p));   # prolog load
4749      realignment_token = call target_builtin;
4750    loop:
4751   x  msq = phi (msq_init, ---)
4752
4753   The stmts marked with x are generated only for the case of
4754   dr_explicit_realign_optimized.
4755
4756   The code above sets up a new (vector) pointer, pointing to the first
4757   location accessed by STMT, and a "floor-aligned" load using that pointer.
4758   It also generates code to compute the "realignment-token" (if the relevant
4759   target hook was defined), and creates a phi-node at the loop-header bb
4760   whose arguments are the result of the prolog-load (created by this
4761   function) and the result of a load that takes place in the loop (to be
4762   created by the caller to this function).
4763
4764   For the case of dr_explicit_realign_optimized:
4765   The caller to this function uses the phi-result (msq) to create the
4766   realignment code inside the loop, and sets up the missing phi argument,
4767   as follows:
4768    loop:
4769      msq = phi (msq_init, lsq)
4770      lsq = *(floor(p'));        # load in loop
4771      result = realign_load (msq, lsq, realignment_token);
4772
4773   For the case of dr_explicit_realign:
4774    loop:
4775      msq = *(floor(p)); 	# load in loop
4776      p' = p + (VS-1);
4777      lsq = *(floor(p'));	# load in loop
4778      result = realign_load (msq, lsq, realignment_token);
4779
4780   Input:
4781   STMT - (scalar) load stmt to be vectorized. This load accesses
4782          a memory location that may be unaligned.
   GSI - place where new code is to be inserted.
4784   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4785			      is used.
4786
4787   Output:
4788   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4789                       target hook, if defined.
4790   Return value - the result of the loop-header phi node.  */
4791
4792tree
4793vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4794                        tree *realignment_token,
4795			enum dr_alignment_support alignment_support_scheme,
4796			tree init_addr,
4797			struct loop **at_loop)
4798{
4799  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4800  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4801  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4802  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4803  struct loop *loop = NULL;
4804  edge pe = NULL;
4805  tree scalar_dest = gimple_assign_lhs (stmt);
4806  tree vec_dest;
4807  gimple inc;
4808  tree ptr;
4809  tree data_ref;
4810  basic_block new_bb;
4811  tree msq_init = NULL_TREE;
4812  tree new_temp;
4813  gphi *phi_stmt;
4814  tree msq = NULL_TREE;
4815  gimple_seq stmts = NULL;
4816  bool inv_p;
4817  bool compute_in_loop = false;
4818  bool nested_in_vect_loop = false;
4819  struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4820  struct loop *loop_for_initial_load = NULL;
4821
4822  if (loop_vinfo)
4823    {
4824      loop = LOOP_VINFO_LOOP (loop_vinfo);
4825      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4826    }
4827
4828  gcc_assert (alignment_support_scheme == dr_explicit_realign
4829	      || alignment_support_scheme == dr_explicit_realign_optimized);
4830
4831  /* We need to generate three things:
4832     1. the misalignment computation
4833     2. the extra vector load (for the optimized realignment scheme).
4834     3. the phi node for the two vectors from which the realignment is
4835      done (for the optimized realignment scheme).  */
4836
4837  /* 1. Determine where to generate the misalignment computation.
4838
4839     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4840     calculation will be generated by this function, outside the loop (in the
4841     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4842     caller, inside the loop.
4843
4844     Background: If the misalignment remains fixed throughout the iterations of
4845     the loop, then both realignment schemes are applicable, and also the
4846     misalignment computation can be done outside LOOP.  This is because we are
4847     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4848     are a multiple of VS (the Vector Size), and therefore the misalignment in
4849     different vectorized LOOP iterations is always the same.
4850     The problem arises only if the memory access is in an inner-loop nested
4851     inside LOOP, which is now being vectorized using outer-loop vectorization.
4852     This is the only case when the misalignment of the memory access may not
4853     remain fixed throughout the iterations of the inner-loop (as explained in
4854     detail in vect_supportable_dr_alignment).  In this case, not only is the
4855     optimized realignment scheme not applicable, but also the misalignment
4856     computation (and generation of the realignment token that is passed to
4857     REALIGN_LOAD) have to be done inside the loop.
4858
4859     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4860     or not, which in turn determines if the misalignment is computed inside
4861     the inner-loop, or outside LOOP.  */
4862
4863  if (init_addr != NULL_TREE || !loop_vinfo)
4864    {
4865      compute_in_loop = true;
4866      gcc_assert (alignment_support_scheme == dr_explicit_realign);
4867    }
4868
4869
4870  /* 2. Determine where to generate the extra vector load.
4871
4872     For the optimized realignment scheme, instead of generating two vector
4873     loads in each iteration, we generate a single extra vector load in the
4874     preheader of the loop, and in each iteration reuse the result of the
4875     vector load from the previous iteration.  In case the memory access is in
4876     an inner-loop nested inside LOOP, which is now being vectorized using
4877     outer-loop vectorization, we need to determine whether this initial vector
4878     load should be generated at the preheader of the inner-loop, or can be
4879     generated at the preheader of LOOP.  If the memory access has no evolution
4880     in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4881     to be generated inside LOOP (in the preheader of the inner-loop).  */
4882
4883  if (nested_in_vect_loop)
4884    {
4885      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4886      bool invariant_in_outerloop =
4887            (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4888      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4889    }
4890  else
4891    loop_for_initial_load = loop;
4892  if (at_loop)
4893    *at_loop = loop_for_initial_load;
4894
4895  if (loop_for_initial_load)
4896    pe = loop_preheader_edge (loop_for_initial_load);
4897
4898  /* 3. For the case of the optimized realignment, create the first vector
4899      load at the loop preheader.  */
4900
4901  if (alignment_support_scheme == dr_explicit_realign_optimized)
4902    {
4903      /* Create msq_init = *(floor(p1)) in the loop preheader  */
4904      gassign *new_stmt;
4905
4906      gcc_assert (!compute_in_loop);
4907      vec_dest = vect_create_destination_var (scalar_dest, vectype);
4908      ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4909				      NULL_TREE, &init_addr, NULL, &inc,
4910				      true, &inv_p);
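      /* Mask off the low address bits so that the prolog load below reads
         from the vector-aligned address floor (p).  */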
4911      new_temp = copy_ssa_name (ptr);
4912      new_stmt = gimple_build_assign
4913		   (new_temp, BIT_AND_EXPR, ptr,
4914		    build_int_cst (TREE_TYPE (ptr),
4915				   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4916      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4917      gcc_assert (!new_bb);
4918      data_ref
4919	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4920		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4921      new_stmt = gimple_build_assign (vec_dest, data_ref);
4922      new_temp = make_ssa_name (vec_dest, new_stmt);
4923      gimple_assign_set_lhs (new_stmt, new_temp);
4924      if (pe)
4925        {
4926          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4927          gcc_assert (!new_bb);
4928        }
4929      else
4930         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4931
4932      msq_init = gimple_assign_lhs (new_stmt);
4933    }
4934
4935  /* 4. Create realignment token using a target builtin, if available.
4936      It is done either inside the containing loop, or before LOOP (as
4937      determined above).  */
4938
4939  if (targetm.vectorize.builtin_mask_for_load)
4940    {
4941      gcall *new_stmt;
4942      tree builtin_decl;
4943
      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4945      if (!init_addr)
4946	{
4947	  /* Generate the INIT_ADDR computation outside LOOP.  */
          init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
                                                             NULL_TREE, loop);
          if (loop)
            {
              pe = loop_preheader_edge (loop);
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
              gcc_assert (!new_bb);
            }
          else
            gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4958	}
4959
4960      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4961      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4962      vec_dest =
4963	vect_create_destination_var (scalar_dest,
4964				     gimple_call_return_type (new_stmt));
4965      new_temp = make_ssa_name (vec_dest, new_stmt);
4966      gimple_call_set_lhs (new_stmt, new_temp);
4967
4968      if (compute_in_loop)
4969	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4970      else
4971	{
4972	  /* Generate the misalignment computation outside LOOP.  */
4973	  pe = loop_preheader_edge (loop);
4974	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4975	  gcc_assert (!new_bb);
4976	}
4977
4978      *realignment_token = gimple_call_lhs (new_stmt);
4979
4980      /* The result of the CALL_EXPR to this builtin is determined from
4981         the value of the parameter and no global variables are touched
4982         which makes the builtin a "const" function.  Requiring the
4983         builtin to have the "const" attribute makes it unnecessary
4984         to call mark_call_clobbered.  */
4985      gcc_assert (TREE_READONLY (builtin_decl));
4986    }
4987
4988  if (alignment_support_scheme == dr_explicit_realign)
4989    return msq;
4990
4991  gcc_assert (!compute_in_loop);
4992  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4993
4994
4995  /* 5. Create msq = phi <msq_init, lsq> in loop  */
4996
4997  pe = loop_preheader_edge (containing_loop);
4998  vec_dest = vect_create_destination_var (scalar_dest, vectype);
4999  msq = make_ssa_name (vec_dest);
5000  phi_stmt = create_phi_node (msq, containing_loop->header);
5001  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5002
5003  return msq;
5004}
5005
5006
5007/* Function vect_grouped_load_supported.
5008
   Returns TRUE if the load permutations required for a group of COUNT vectors
   of type VECTYPE (COUNT being 3 or a power of two) are supported.  */
5011
5012bool
5013vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5014{
5015  machine_mode mode = TYPE_MODE (vectype);
5016
5017  /* vect_permute_load_chain requires the group size to be equal to 3 or
5018     be a power of two.  */
5019  if (count != 3 && exact_log2 (count) == -1)
5020    {
5021      if (dump_enabled_p ())
5022	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5023			 "the size of the group of accesses"
5024			 " is not a power of 2 or not equal to 3\n");
5025      return false;
5026    }
5027
5028  /* Check that the permutation is supported.  */
5029  if (VECTOR_MODE_P (mode))
5030    {
5031      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5032      unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5033
5034      if (count == 3)
5035	{
5036	  unsigned int k;
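          /* The selectors checked below are the same perm3_mask_low and
             perm3_mask_high selectors that vect_permute_load_chain builds
             for a group of three loads.  */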
5037	  for (k = 0; k < 3; k++)
5038	    {
5039	      for (i = 0; i < nelt; i++)
5040		if (3 * i + k < 2 * nelt)
5041		  sel[i] = 3 * i + k;
5042		else
5043		  sel[i] = 0;
5044	      if (!can_vec_perm_p (mode, false, sel))
5045		{
5046		  if (dump_enabled_p ())
5047		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5048				     "shuffle of 3 loads is not supported by"
5049				     " target\n");
                  return false;
5051		}
5052	      for (i = 0, j = 0; i < nelt; i++)
5053		if (3 * i + k < 2 * nelt)
5054		  sel[i] = i;
5055		else
5056		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5057	      if (!can_vec_perm_p (mode, false, sel))
5058		{
5059		  if (dump_enabled_p ())
5060		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061				     "shuffle of 3 loads is not supported by"
5062				     " target\n");
5063		  return false;
5064		}
5065	    }
5066	  return true;
5067	}
5068      else
5069	{
          /* If COUNT is not equal to 3 then only a power of 2 is supported.  */
5071	  gcc_assert (exact_log2 (count) != -1);
5072	  for (i = 0; i < nelt; i++)
5073	    sel[i] = i * 2;
5074	  if (can_vec_perm_p (mode, false, sel))
5075	    {
5076	      for (i = 0; i < nelt; i++)
5077		sel[i] = i * 2 + 1;
5078	      if (can_vec_perm_p (mode, false, sel))
5079		return true;
5080	    }
5081        }
5082    }
5083
5084  if (dump_enabled_p ())
5085    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5086		     "extract even/odd not supported by target\n");
5087  return false;
5088}
5089
5090/* Return TRUE if vec_load_lanes is available for COUNT vectors of
5091   type VECTYPE.  */
5092
5093bool
5094vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5095{
5096  return vect_lanes_optab_supported_p ("vec_load_lanes",
5097				       vec_load_lanes_optab,
5098				       vectype, count);
5099}
5100
5101/* Function vect_permute_load_chain.
5102
5103   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5104   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5105   the input data correctly.  Return the final references for loads in
5106   RESULT_CHAIN.
5107
5108   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5109   The input is 4 vectors each containing 8 elements. We assign a number to each
5110   element, the input sequence is:
5111
5112   1st vec:   0  1  2  3  4  5  6  7
5113   2nd vec:   8  9 10 11 12 13 14 15
5114   3rd vec:  16 17 18 19 20 21 22 23
5115   4th vec:  24 25 26 27 28 29 30 31
5116
5117   The output sequence should be:
5118
5119   1st vec:  0 4  8 12 16 20 24 28
5120   2nd vec:  1 5  9 13 17 21 25 29
5121   3rd vec:  2 6 10 14 18 22 26 30
5122   4th vec:  3 7 11 15 19 23 27 31
5123
5124   i.e., the first output vector should contain the first elements of each
5125   interleaving group, etc.
5126
5127   We use extract_even/odd instructions to create such output.  The input of
5128   each extract_even/odd operation is two vectors
5129   1st vec    2nd vec
5130   0 1 2 3    4 5 6 7
5131
5132   and the output is the vector of extracted even/odd elements.  The output of
5133   extract_even will be:   0 2 4 6
5134   and of extract_odd:     1 3 5 7
5135
5136
5137   The permutation is done in log LENGTH stages.  In each stage extract_even
5138   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5139   their order.  In our example,
5140
5141   E1: extract_even (1st vec, 2nd vec)
5142   E2: extract_odd (1st vec, 2nd vec)
5143   E3: extract_even (3rd vec, 4th vec)
5144   E4: extract_odd (3rd vec, 4th vec)
5145
5146   The output for the first stage will be:
5147
5148   E1:  0  2  4  6  8 10 12 14
5149   E2:  1  3  5  7  9 11 13 15
5150   E3: 16 18 20 22 24 26 28 30
5151   E4: 17 19 21 23 25 27 29 31
5152
5153   In order to proceed and create the correct sequence for the next stage (or
5154   for the correct output, if the second stage is the last one, as in our
5155   example), we first put the output of extract_even operation and then the
5156   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5157   The input for the second stage is:
5158
5159   1st vec (E1):  0  2  4  6  8 10 12 14
5160   2nd vec (E3): 16 18 20 22 24 26 28 30
5161   3rd vec (E2):  1  3  5  7  9 11 13 15
5162   4th vec (E4): 17 19 21 23 25 27 29 31
5163
5164   The output of the second stage:
5165
5166   E1: 0 4  8 12 16 20 24 28
5167   E2: 2 6 10 14 18 22 26 30
5168   E3: 1 5  9 13 17 21 25 29
5169   E4: 3 7 11 15 19 23 27 31
5170
5171   And RESULT_CHAIN after reordering:
5172
5173   1st vec (E1):  0 4  8 12 16 20 24 28
5174   2nd vec (E3):  1 5  9 13 17 21 25 29
5175   3rd vec (E2):  2 6 10 14 18 22 26 30
5176   4th vec (E4):  3 7 11 15 19 23 27 31.  */
5177
5178static void
5179vect_permute_load_chain (vec<tree> dr_chain,
5180			 unsigned int length,
5181			 gimple stmt,
5182			 gimple_stmt_iterator *gsi,
5183			 vec<tree> *result_chain)
5184{
5185  tree data_ref, first_vect, second_vect;
5186  tree perm_mask_even, perm_mask_odd;
5187  tree perm3_mask_low, perm3_mask_high;
5188  gimple perm_stmt;
5189  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5190  unsigned int i, j, log_length = exact_log2 (length);
5191  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5192  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5193
5194  result_chain->quick_grow (length);
5195  memcpy (result_chain->address (), dr_chain.address (),
5196	  length * sizeof (tree));
5197
5198  if (length == 3)
5199    {
5200      unsigned int k;
5201
5202      for (k = 0; k < 3; k++)
5203	{
5204	  for (i = 0; i < nelt; i++)
5205	    if (3 * i + k < 2 * nelt)
5206	      sel[i] = 3 * i + k;
5207	    else
5208	      sel[i] = 0;
5209	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5210
5211	  for (i = 0, j = 0; i < nelt; i++)
5212	    if (3 * i + k < 2 * nelt)
5213	      sel[i] = i;
5214	    else
5215	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5216
5217	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
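          /* For example, with nelt == 8 and k == 0, PERM3_MASK_LOW is
             {0, 3, 6, 9, 12, 15, 0, 0} and PERM3_MASK_HIGH is
             {0, 1, 2, 3, 4, 5, 10, 13}: the first permutation gathers
             elements 0, 3 and 6 of the first vector and 1, 4 and 7 of the
             second; the second permutation keeps those and appends elements
             2 and 5 of the third vector.  */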
5218
5219	  first_vect = dr_chain[0];
5220	  second_vect = dr_chain[1];
5221
          /* Create interleaving stmt (low part of):
             low = VEC_PERM_EXPR <first_vect, second_vect,
                                  {k, 3 + k, 6 + k, ...}>  */
5225	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5226	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5227					   second_vect, perm3_mask_low);
5228	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5229
          /* Create interleaving stmt (high part of):
             high = VEC_PERM_EXPR <low_result, dr_chain[2], perm3_mask_high>,
             keeping the elements gathered above and filling the remaining
             lanes from the third vector.  */
5233	  first_vect = data_ref;
5234	  second_vect = dr_chain[2];
5235	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5236	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5237					   second_vect, perm3_mask_high);
5238	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5239	  (*result_chain)[k] = data_ref;
5240	}
5241    }
5242  else
5243    {
      /* If length is not equal to 3 then only a power of 2 is supported.  */
5245      gcc_assert (exact_log2 (length) != -1);
5246
5247      for (i = 0; i < nelt; ++i)
5248	sel[i] = i * 2;
5249      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5250
5251      for (i = 0; i < nelt; ++i)
5252	sel[i] = i * 2 + 1;
5253      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
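      /* For example, with nelt == 8, PERM_MASK_EVEN selects lanes
         {0, 2, 4, 6, 8, 10, 12, 14} and PERM_MASK_ODD selects lanes
         {1, 3, 5, 7, 9, 11, 13, 15} of the concatenation of the two input
         vectors.  */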
5254
5255      for (i = 0; i < log_length; i++)
5256	{
5257	  for (j = 0; j < length; j += 2)
5258	    {
5259	      first_vect = dr_chain[j];
5260	      second_vect = dr_chain[j+1];
5261
5262	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5263	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5264	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5265					       first_vect, second_vect,
5266					       perm_mask_even);
5267	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5268	      (*result_chain)[j/2] = data_ref;
5269
5270	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5271	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5272	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5273					       first_vect, second_vect,
5274					       perm_mask_odd);
5275	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5276	      (*result_chain)[j/2+length/2] = data_ref;
5277	    }
5278	  memcpy (dr_chain.address (), result_chain->address (),
5279		  length * sizeof (tree));
5280	}
5281    }
5282}
5283
5284/* Function vect_shift_permute_load_chain.
5285
5286   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5287   sequence of stmts to reorder the input data accordingly.
5288   Return the final references for loads in RESULT_CHAIN.
   Return true if it succeeded, false otherwise.
5290
5291   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5292   The input is 3 vectors each containing 8 elements.  We assign a
5293   number to each element, the input sequence is:
5294
5295   1st vec:   0  1  2  3  4  5  6  7
5296   2nd vec:   8  9 10 11 12 13 14 15
5297   3rd vec:  16 17 18 19 20 21 22 23
5298
5299   The output sequence should be:
5300
5301   1st vec:  0 3 6  9 12 15 18 21
5302   2nd vec:  1 4 7 10 13 16 19 22
5303   3rd vec:  2 5 8 11 14 17 20 23
5304
5305   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5306
5307   First we shuffle all 3 vectors to get correct elements order:
5308
5309   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5310   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5311   3rd vec:  (16 19 22) (17 20 23) (18 21)
5312
5313   Next we unite and shift vector 3 times:
5314
5315   1st step:
5316     shift right by 6 the concatenation of:
5317     "1st vec" and  "2nd vec"
5318       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5319     "2nd vec" and  "3rd vec"
5320       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5321     "3rd vec" and  "1st vec"
5322       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5323			     | New vectors                   |
5324
5325     So that now new vectors are:
5326
5327     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5328     2nd vec:  (10 13) (16 19 22) (17 20 23)
5329     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5330
5331   2nd step:
5332     shift right by 5 the concatenation of:
5333     "1st vec" and  "3rd vec"
5334       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5335     "2nd vec" and  "1st vec"
5336       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5337     "3rd vec" and  "2nd vec"
5338       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5339			  | New vectors                   |
5340
5341     So that now new vectors are:
5342
5343     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5344     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5345     3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5346
5347   3rd step:
5348     shift right by 5 the concatenation of:
5349     "1st vec" and  "1st vec"
5350       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5351     shift right by 3 the concatenation of:
5352     "2nd vec" and  "2nd vec"
5353               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5354			  | New vectors                   |
5355
5356     So that now all vectors are READY:
5357     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5358     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5359     3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5360
   This algorithm is faster than the one in vect_permute_load_chain if:
     1.  "shift of a concatenation" is faster than general permutation.
	 This is usually the case.
     2.  The TARGET machine can't execute vector instructions in parallel.
	 This matters because each step of this algorithm depends on the
	 previous one; the algorithm in vect_permute_load_chain is much more
	 parallel.
5367
   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.  */
5370
5371static bool
5372vect_shift_permute_load_chain (vec<tree> dr_chain,
5373			       unsigned int length,
5374			       gimple stmt,
5375			       gimple_stmt_iterator *gsi,
5376			       vec<tree> *result_chain)
5377{
5378  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5379  tree perm2_mask1, perm2_mask2, perm3_mask;
5380  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5381  gimple perm_stmt;
5382
5383  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5384  unsigned int i;
5385  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5386  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5387  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5388  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5389
5390  result_chain->quick_grow (length);
5391  memcpy (result_chain->address (), dr_chain.address (),
5392	  length * sizeof (tree));
5393
5394  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5395    {
5396      unsigned int j, log_length = exact_log2 (length);
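      /* Generating permutation constant that gathers the even-indexed
         elements into the first half of a vector and the odd-indexed
         elements into the second half.  For vector length 8 it is
         {0 2 4 6 1 3 5 7}.  */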
5397      for (i = 0; i < nelt / 2; ++i)
5398	sel[i] = i * 2;
5399      for (i = 0; i < nelt / 2; ++i)
5400	sel[nelt / 2 + i] = i * 2 + 1;
5401      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5402	{
5403	  if (dump_enabled_p ())
5404	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not "
			     "supported by target\n");
5407	  return false;
5408	}
5409      perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5410
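      /* The complementary shuffle: odd-indexed elements first, then
         even-indexed ones.  For vector length 8 it is {1 3 5 7 0 2 4 6}.  */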
5411      for (i = 0; i < nelt / 2; ++i)
5412	sel[i] = i * 2 + 1;
5413      for (i = 0; i < nelt / 2; ++i)
5414	sel[nelt / 2 + i] = i * 2;
5415      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5416	{
5417	  if (dump_enabled_p ())
5418	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 2 fields structure is not "
			     "supported by target\n");
5421	  return false;
5422	}
5423      perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5424
5425      /* Generating permutation constant to shift all elements.
5426	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5427      for (i = 0; i < nelt; i++)
5428	sel[i] = nelt / 2 + i;
5429      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5430	{
5431	  if (dump_enabled_p ())
5432	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5433			     "shift permutation is not supported by target\n");
5434	  return false;
5435	}
5436      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5437
5438      /* Generating permutation constant to select vector from 2.
5439	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5440      for (i = 0; i < nelt / 2; i++)
5441	sel[i] = i;
5442      for (i = nelt / 2; i < nelt; i++)
5443	sel[i] = nelt + i;
5444      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5445	{
5446	  if (dump_enabled_p ())
5447	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5448			     "select is not supported by target\n");
5449	  return false;
5450	}
5451      select_mask = vect_gen_perm_mask_checked (vectype, sel);
5452
5453      for (i = 0; i < log_length; i++)
5454	{
5455	  for (j = 0; j < length; j += 2)
5456	    {
5457	      first_vect = dr_chain[j];
5458	      second_vect = dr_chain[j + 1];
5459
5460	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5461	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5462					       first_vect, first_vect,
5463					       perm2_mask1);
5464	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5465	      vect[0] = data_ref;
5466
5467	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5468	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5469					       second_vect, second_vect,
5470					       perm2_mask2);
5471	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5472	      vect[1] = data_ref;
5473
5474	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5475	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5476					       vect[0], vect[1], shift1_mask);
5477	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5478	      (*result_chain)[j/2 + length/2] = data_ref;
5479
5480	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5481	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5482					       vect[0], vect[1], select_mask);
5483	      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5484	      (*result_chain)[j/2] = data_ref;
5485	    }
5486	  memcpy (dr_chain.address (), result_chain->address (),
5487		  length * sizeof (tree));
5488	}
5489      return true;
5490    }
5491  if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5492    {
5493      unsigned int k = 0, l = 0;
5494
      /* Generating permutation constant to get all elements in right order.
5496	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5497      for (i = 0; i < nelt; i++)
5498	{
5499	  if (3 * k + (l % 3) >= nelt)
5500	    {
5501	      k = 0;
5502	      l += (3 - (nelt % 3));
5503	    }
5504	  sel[i] = 3 * k + (l % 3);
5505	  k++;
5506	}
5507      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508	{
5509	  if (dump_enabled_p ())
5510	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "shuffle of 3 fields structure is not "
			     "supported by target\n");
5513	  return false;
5514	}
5515      perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5516
5517      /* Generating permutation constant to shift all elements.
5518	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5519      for (i = 0; i < nelt; i++)
5520	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5521      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5522	{
5523	  if (dump_enabled_p ())
5524	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5525			     "shift permutation is not supported by target\n");
5526	  return false;
5527	}
5528      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5529
5530      /* Generating permutation constant to shift all elements.
5531	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5532      for (i = 0; i < nelt; i++)
5533	sel[i] = 2 * (nelt / 3) + 1 + i;
5534      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5535	{
5536	  if (dump_enabled_p ())
5537	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5538			     "shift permutation is not supported by target\n");
5539	  return false;
5540	}
5541      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5542
5543      /* Generating permutation constant to shift all elements.
5544	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5545      for (i = 0; i < nelt; i++)
5546	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5547      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5548	{
5549	  if (dump_enabled_p ())
5550	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551			     "shift permutation is not supported by target\n");
5552	  return false;
5553	}
5554      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5555
5556      /* Generating permutation constant to shift all elements.
5557	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5558      for (i = 0; i < nelt; i++)
5559	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5560      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5561	{
5562	  if (dump_enabled_p ())
5563	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5564			     "shift permutation is not supported by target\n");
5565	  return false;
5566	}
5567      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5568
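      /* Shuffle each of the three input vectors with PERM3_MASK, giving the
         ( 0  3  6) ( 1  4  7) ( 2  5) element order shown in the function
         comment.  */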
5569      for (k = 0; k < 3; k++)
5570	{
5571	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5572	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5573					   dr_chain[k], dr_chain[k],
5574					   perm3_mask);
5575	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5576	  vect[k] = data_ref;
5577	}
5578
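      /* 1st step from the function comment: shift the concatenation of each
         shuffled vector with the next one using SHIFT1_MASK.  */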
5579      for (k = 0; k < 3; k++)
5580	{
5581	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5582	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5583					   vect[k % 3], vect[(k + 1) % 3],
5584					   shift1_mask);
5585	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5586	  vect_shift[k] = data_ref;
5587	}
5588
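      /* 2nd step from the function comment: shift the concatenations of the
         vectors produced by the previous step using SHIFT2_MASK.  */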
5589      for (k = 0; k < 3; k++)
5590	{
5591	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5592	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5593					   vect_shift[(4 - k) % 3],
5594					   vect_shift[(3 - k) % 3],
5595					   shift2_mask);
5596	  vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5597	  vect[k] = data_ref;
5598	}
5599
5600      (*result_chain)[3 - (nelt % 3)] = vect[2];
5601
5602      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5603      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5604				       vect[0], shift3_mask);
5605      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5606      (*result_chain)[nelt % 3] = data_ref;
5607
5608      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5609      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5610				       vect[1], shift4_mask);
5611      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5612      (*result_chain)[0] = data_ref;
5613      return true;
5614    }
5615  return false;
5616}
5617
5618/* Function vect_transform_grouped_load.
5619
   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
   to perform their permutation and record the resulting vectorized statements
   in the corresponding scalar statements.  */
5624
5625void
5626vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5627			     gimple_stmt_iterator *gsi)
5628{
5629  machine_mode mode;
5630  vec<tree> result_chain = vNULL;
5631
  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
     RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
     permuted vectors, which are ready for vector computation.  */
5635  result_chain.create (size);
5636
  /* If the reassociation width for the vector mode is greater than 1, the
     target can execute two or more vector instructions in parallel, so use
     vect_permute_load_chain (this is also done whenever SIZE is a power of
     two).  Otherwise try vect_shift_permute_load_chain and fall back to
     vect_permute_load_chain if it fails.  */
5640  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5641  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5642      || exact_log2 (size) != -1
5643      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5644					 gsi, &result_chain))
5645    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5646  vect_record_grouped_load_vectors (stmt, result_chain);
5647  result_chain.release ();
5648}
5649
5650/* RESULT_CHAIN contains the output of a group of grouped loads that were
5651   generated as part of the vectorization of STMT.  Assign the statement
5652   for each vector to the associated scalar statement.  */
5653
5654void
5655vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5656{
5657  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5658  gimple next_stmt, new_stmt;
5659  unsigned int i, gap_count;
5660  tree tmp_data_ref;
5661
5662  /* Put a permuted data-ref in the VECTORIZED_STMT field.
     Since we scan the chain starting from its first node, their order
     corresponds to the order of data-refs in RESULT_CHAIN.  */
5665  next_stmt = first_stmt;
5666  gap_count = 1;
5667  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5668    {
5669      if (!next_stmt)
5670	break;
5671
      /* Skip the gaps.  Loads created for the gaps will be removed by the
       dead code elimination pass later.  No need to check for the first stmt in
5674       the group, since it always exists.
5675       GROUP_GAP is the number of steps in elements from the previous
5676       access (if there is no gap GROUP_GAP is 1).  We skip loads that
5677       correspond to the gaps.  */
5678      if (next_stmt != first_stmt
5679          && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5680      {
5681        gap_count++;
5682        continue;
5683      }
5684
5685      while (next_stmt)
5686        {
5687	  new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5688	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5689	     copies, and we put the new vector statement in the first available
5690	     RELATED_STMT.  */
5691	  if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5692	    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5693	  else
5694            {
5695              if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5696                {
5697 	          gimple prev_stmt =
5698		    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5699	          gimple rel_stmt =
5700		    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5701	          while (rel_stmt)
5702		    {
5703		      prev_stmt = rel_stmt;
5704		      rel_stmt =
5705                        STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5706		    }
5707
5708  	          STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5709                    new_stmt;
5710                }
5711            }
5712
5713	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5714	  gap_count = 1;
5715	  /* If NEXT_STMT accesses the same DR as the previous statement,
5716	     put the same TMP_DATA_REF as its vectorized statement; otherwise
5717	     get the next data-ref from RESULT_CHAIN.  */
5718	  if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5719	    break;
5720        }
5721    }
5722}
5723
5724/* Function vect_force_dr_alignment_p.
5725
5726   Returns whether the alignment of a DECL can be forced to be aligned
5727   on ALIGNMENT bit boundary.  */
5728
5729bool
5730vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5731{
5732  if (TREE_CODE (decl) != VAR_DECL)
5733    return false;
5734
5735  if (decl_in_symtab_p (decl)
5736      && !symtab_node::get (decl)->can_increase_alignment_p ())
5737    return false;
5738
5739  if (TREE_STATIC (decl))
5740    return (alignment <= MAX_OFILE_ALIGNMENT);
5741  else
5742    return (alignment <= MAX_STACK_ALIGNMENT);
5743}
5744
5745
5746/* Return whether the data reference DR is supported with respect to its
5747   alignment.
   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5749   it is aligned, i.e., check if it is possible to vectorize it with different
5750   alignment.  */
5751
5752enum dr_alignment_support
5753vect_supportable_dr_alignment (struct data_reference *dr,
5754                               bool check_aligned_accesses)
5755{
5756  gimple stmt = DR_STMT (dr);
5757  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5758  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5759  machine_mode mode = TYPE_MODE (vectype);
5760  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5761  struct loop *vect_loop = NULL;
5762  bool nested_in_vect_loop = false;
5763
5764  if (aligned_access_p (dr) && !check_aligned_accesses)
5765    return dr_aligned;
5766
5767  /* For now assume all conditional loads/stores support unaligned
5768     access without any special code.  */
5769  if (is_gimple_call (stmt)
5770      && gimple_call_internal_p (stmt)
5771      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5772	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5773    return dr_unaligned_supported;
5774
5775  if (loop_vinfo)
5776    {
5777      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5778      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5779    }
5780
5781  /* Possibly unaligned access.  */
5782
5783  /* We can choose between using the implicit realignment scheme (generating
5784     a misaligned_move stmt) and the explicit realignment scheme (generating
5785     aligned loads with a REALIGN_LOAD).  There are two variants to the
5786     explicit realignment scheme: optimized, and unoptimized.
5787     We can optimize the realignment only if the step between consecutive
5788     vector loads is equal to the vector size.  Since the vector memory
5789     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5790     is guaranteed that the misalignment amount remains the same throughout the
5791     execution of the vectorized loop.  Therefore, we can create the
5792     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5793     at the loop preheader.
5794
5795     However, in the case of outer-loop vectorization, when vectorizing a
5796     memory access in the inner-loop nested within the LOOP that is now being
5797     vectorized, while it is guaranteed that the misalignment of the
5798     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that it will remain the same throughout
5800     the execution of the inner-loop.  This is because the inner-loop advances
5801     with the original scalar step (and not in steps of VS).  If the inner-loop
5802     step happens to be a multiple of VS, then the misalignment remains fixed
5803     and we can use the optimized realignment scheme.  For example:
5804
5805      for (i=0; i<N; i++)
5806        for (j=0; j<M; j++)
5807          s += a[i+j];
5808
5809     When vectorizing the i-loop in the above example, the step between
5810     consecutive vector loads is 1, and so the misalignment does not remain
5811     fixed across the execution of the inner-loop, and the realignment cannot
5812     be optimized (as illustrated in the following pseudo vectorized loop):
5813
5814      for (i=0; i<N; i+=4)
5815        for (j=0; j<M; j++){
5816          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5817                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
5818                         // (assuming that we start from an aligned address).
5819          }
5820
5821     We therefore have to use the unoptimized realignment scheme:
5822
5823      for (i=0; i<N; i+=4)
5824          for (j=k; j<M; j+=4)
5825          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5826                           // that the misalignment of the initial address is
5827                           // 0).
5828
5829     The loop can then be vectorized as follows:
5830
5831      for (k=0; k<4; k++){
5832        rt = get_realignment_token (&vp[k]);
5833        for (i=0; i<N; i+=4){
5834          v1 = vp[i+k];
5835          for (j=k; j<M; j+=4){
5836            v2 = vp[i+j+VS-1];
5837            va = REALIGN_LOAD <v1,v2,rt>;
5838            vs += va;
5839            v1 = v2;
5840          }
5841        }
5842    } */
5843
5844  if (DR_IS_READ (dr))
5845    {
5846      bool is_packed = false;
5847      tree type = (TREE_TYPE (DR_REF (dr)));
5848
5849      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5850	  && (!targetm.vectorize.builtin_mask_for_load
5851	      || targetm.vectorize.builtin_mask_for_load ()))
5852	{
5853	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
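          /* The optimized scheme is usable only when loop information is
             available and the misalignment stays fixed: either the access
             is not in a nested inner loop, or the inner-loop step equals
             the vector size (see the big comment above).  */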
5854	  if ((nested_in_vect_loop
5855	       && (TREE_INT_CST_LOW (DR_STEP (dr))
5856	 	   != GET_MODE_SIZE (TYPE_MODE (vectype))))
5857              || !loop_vinfo)
5858	    return dr_explicit_realign;
5859	  else
5860	    return dr_explicit_realign_optimized;
5861	}
5862      if (!known_alignment_for_access_p (dr))
5863	is_packed = not_size_aligned (DR_REF (dr));
5864
5865      if ((TYPE_USER_ALIGN (type) && !is_packed)
5866	  || targetm.vectorize.
5867	       support_vector_misalignment (mode, type,
5868					    DR_MISALIGNMENT (dr), is_packed))
5869	/* Can't software pipeline the loads, but can at least do them.  */
5870	return dr_unaligned_supported;
5871    }
5872  else
5873    {
5874      bool is_packed = false;
5875      tree type = (TREE_TYPE (DR_REF (dr)));
5876
5877      if (!known_alignment_for_access_p (dr))
5878	is_packed = not_size_aligned (DR_REF (dr));
5879
      if ((TYPE_USER_ALIGN (type) && !is_packed)
          || targetm.vectorize.
               support_vector_misalignment (mode, type,
                                            DR_MISALIGNMENT (dr), is_packed))
        return dr_unaligned_supported;
5885    }
5886
5887  /* Unsupported.  */
5888  return dr_unaligned_unsupported;
5889}
5890