/* Array prefetching.
   Copyright (C) 2005-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
19
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "hash-set.h"
#include "machmode.h"
#include "vec.h"
#include "double-int.h"
#include "input.h"
#include "alias.h"
#include "symtab.h"
#include "wide-int.h"
#include "inchash.h"
#include "tree.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "tm_p.h"
#include "predict.h"
#include "hard-reg-set.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "basic-block.h"
#include "tree-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "cfgloop.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "diagnostic-core.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"


/* FIXME: Needed for optabs, but this should all be moved to a TBD interface
   between the GIMPLE and RTL worlds.  */
#include "hashtab.h"
#include "rtl.h"
#include "flags.h"
#include "statistics.h"
#include "real.h"
#include "fixed-value.h"
#include "expmed.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "emit-rtl.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "insn-codes.h"
#include "optabs.h"
#include "recog.h"
89
/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (5) is the same with probability only
	   7/32.
       (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance until the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.
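
      For instance (purely illustrative numbers, not taken from any real
      target): if a prefetch takes roughly 200 cycles to complete and one
      iteration of the loop executes in roughly 25 cycles, we would want to
      issue each prefetch about 200 / 25 = 8 iterations ahead of the access
      it covers.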

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   A cost model is implemented to determine whether or not prefetching is
   profitable for a given loop.  The cost model has three heuristics:

   1. Function trip_count_to_ahead_ratio_too_small_p implements a
      heuristic that determines whether or not the loop has too few
      iterations (compared to ahead).  Prefetching is not likely to be
      beneficial if the trip count to ahead ratio is below a certain
      minimum.

   2. Function mem_ref_count_reasonable_p implements a heuristic that
      determines whether the given loop has enough CPU ops that can be
      overlapped with cache missing memory ops.  If not, the loop
      won't benefit from prefetching.  In the implementation,
      prefetching is not considered beneficial if the ratio between
      the instruction count and the mem ref count is below a certain
      minimum.

   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
	 in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user has inserted his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */
186
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif
234
#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16
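
/* As an illustration (the cache size below is only an example, not a target
   value): with a 32 kB L1 cache, reuses closer than 32768 / 16 = 2048 bytes
   of accessed memory do not by themselves prevent the use of nontemporal
   prefetches.  */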
244
245/* In case we have to emit a memory fence instruction after the loop that
246   uses nontemporal stores, this defines the builtin to use.  */
247
248#ifndef FENCE_FOLLOWING_MOVNT
249#define FENCE_FOLLOWING_MOVNT NULL_TREE
250#endif
251
252/* It is not profitable to prefetch when the trip count is not at
253   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
254   For example, in a loop with a prefetch ahead distance of 10,
255   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
256   profitable to prefetch when the trip count is greater or equal to
257   40.  In that case, 30 out of the 40 iterations will benefit from
258   prefetching.  */
259
260#ifndef TRIP_COUNT_TO_AHEAD_RATIO
261#define TRIP_COUNT_TO_AHEAD_RATIO 4
262#endif
263
/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  tree step;			/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};
273
274/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
275
276#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)
277
278/* Do not generate a prefetch if the unroll factor is significantly less
279   than what is required by the prefetch.  This is to avoid redundant
280   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
281   2, prefetching requires unrolling the loop 16 times, but
282   the loop is actually unrolled twice.  In this case (ratio = 8),
283   prefetching is not likely to be beneficial.  */
284
285#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
286#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
287#endif
288
289/* Some of the prefetch computations have quadratic complexity.  We want to
290   avoid huge compile times and, therefore, want to limit the amount of
291   memory references per loop where we consider prefetching.  */
292
293#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
294#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
295#endif
296
/* The memory reference.  */

struct mem_ref
{
  gimple stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent of
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};
321
/* Dumps information about a memory reference with the given BASE, STEP,
   DELTA and WRITE_P to FILE.  */

323static void
324dump_mem_details (FILE *file, tree base, tree step,
325	    HOST_WIDE_INT delta, bool write_p)
326{
327  fprintf (file, "(base ");
328  print_generic_expr (file, base, TDF_SLIM);
329  fprintf (file, ", step ");
330  if (cst_and_fits_in_hwi (step))
331    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
332  else
333    print_generic_expr (file, step, TDF_TREE);
334  fprintf (file, ")\n");
335  fprintf (file, "  delta ");
336  fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
337  fprintf (file, "\n");
338  fprintf (file, "  %s\n", write_p ? "write" : "read");
339  fprintf (file, "\n");
340}
341
342/* Dumps information about reference REF to FILE.  */
343
344static void
345dump_mem_ref (FILE *file, struct mem_ref *ref)
346{
347  fprintf (file, "Reference %p:\n", (void *) ref);
348
349  fprintf (file, "  group %p ", (void *) ref->group);
350
351  dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
352                   ref->write_p);
353}
354
355/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
356   exist.  */
357
358static struct mem_ref_group *
359find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
360{
361  struct mem_ref_group *group;
362
363  for (; *groups; groups = &(*groups)->next)
364    {
365      if (operand_equal_p ((*groups)->step, step, 0)
366	  && operand_equal_p ((*groups)->base, base, 0))
367	return *groups;
368
      /* If step is an integer constant, keep the list of groups sorted
	 by decreasing step.  */
      if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
	  && int_cst_value ((*groups)->step) < int_cst_value (step))
	break;
374    }
375
376  group = XNEW (struct mem_ref_group);
377  group->base = base;
378  group->step = step;
379  group->refs = NULL;
380  group->next = *groups;
381  *groups = group;
382
383  return group;
384}
385
386/* Records a memory reference MEM in GROUP with offset DELTA and write status
387   WRITE_P.  The reference occurs in statement STMT.  */
388
389static void
390record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
391	    HOST_WIDE_INT delta, bool write_p)
392{
393  struct mem_ref **aref;
394
395  /* Do not record the same address twice.  */
396  for (aref = &group->refs; *aref; aref = &(*aref)->next)
397    {
      /* It may not be possible for a write reference to reuse a read
	 prefetch, or vice versa.  */
400      if (!WRITE_CAN_USE_READ_PREFETCH
401	  && write_p
402	  && !(*aref)->write_p)
403	continue;
404      if (!READ_CAN_USE_WRITE_PREFETCH
405	  && !write_p
406	  && (*aref)->write_p)
407	continue;
408
409      if ((*aref)->delta == delta)
410	return;
411    }
412
413  (*aref) = XNEW (struct mem_ref);
414  (*aref)->stmt = stmt;
415  (*aref)->mem = mem;
416  (*aref)->delta = delta;
417  (*aref)->write_p = write_p;
418  (*aref)->prefetch_before = PREFETCH_ALL;
419  (*aref)->prefetch_mod = 1;
420  (*aref)->reuse_distance = 0;
421  (*aref)->issue_prefetch_p = false;
422  (*aref)->group = group;
423  (*aref)->next = NULL;
424  (*aref)->independent_p = false;
425  (*aref)->storent_p = false;
426
427  if (dump_file && (dump_flags & TDF_DETAILS))
428    dump_mem_ref (dump_file, *aref);
429}
430
431/* Release memory references in GROUPS.  */
432
433static void
434release_mem_refs (struct mem_ref_group *groups)
435{
436  struct mem_ref_group *next_g;
437  struct mem_ref *ref, *next_r;
438
439  for (; groups; groups = next_g)
440    {
441      next_g = groups->next;
442      for (ref = groups->refs; ref; ref = next_r)
443	{
444	  next_r = ref->next;
445	  free (ref);
446	}
447      free (groups);
448    }
449}
450
451/* A structure used to pass arguments to idx_analyze_ref.  */
452
453struct ar_data
454{
455  struct loop *loop;			/* Loop of the reference.  */
456  gimple stmt;				/* Statement of the reference.  */
457  tree *step;				/* Step of the memory reference.  */
458  HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
459};
460
/* Analyzes a single INDEX of a memory reference to obtain the information
   described in analyze_ref.  Callback for for_each_index.  */
463
464static bool
465idx_analyze_ref (tree base, tree *index, void *data)
466{
467  struct ar_data *ar_data = (struct ar_data *) data;
468  tree ibase, step, stepsize;
469  HOST_WIDE_INT idelta = 0, imult = 1;
470  affine_iv iv;
471
472  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
473		  *index, &iv, true))
474    return false;
475  ibase = iv.base;
476  step = iv.step;
477
478  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
479      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
480    {
481      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
482      ibase = TREE_OPERAND (ibase, 0);
483    }
484  if (cst_and_fits_in_hwi (ibase))
485    {
486      idelta += int_cst_value (ibase);
487      ibase = build_int_cst (TREE_TYPE (ibase), 0);
488    }
489
490  if (TREE_CODE (base) == ARRAY_REF)
491    {
492      stepsize = array_ref_element_size (base);
493      if (!cst_and_fits_in_hwi (stepsize))
494	return false;
495      imult = int_cst_value (stepsize);
496      step = fold_build2 (MULT_EXPR, sizetype,
497			  fold_convert (sizetype, step),
498			  fold_convert (sizetype, stepsize));
499      idelta *= imult;
500    }
501
502  if (*ar_data->step == NULL_TREE)
503    *ar_data->step = step;
504  else
505    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
506				  fold_convert (sizetype, *ar_data->step),
507				  fold_convert (sizetype, step));
508  *ar_data->delta += idelta;
509  *index = ibase;
510
511  return true;
512}
513
/* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA
   and STEP are integer constants and iter is the number of iterations of
   LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */
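/* For instance (an illustrative example, not an exhaustive description of
   the possible shapes): for the reference a[i + 4] in LOOP, with induction
   variable i starting at 0 and 4-byte array elements, we end up with
   &BASE = &a[0], STEP = 4 and DELTA = 16.  */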
518
519static bool
520analyze_ref (struct loop *loop, tree *ref_p, tree *base,
521	     tree *step, HOST_WIDE_INT *delta,
522	     gimple stmt)
523{
524  struct ar_data ar_data;
525  tree off;
526  HOST_WIDE_INT bit_offset;
527  tree ref = *ref_p;
528
529  *step = NULL_TREE;
530  *delta = 0;
531
  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imaginary parts of a complex, so that
     they can have the same base.  */
535  if (TREE_CODE (ref) == REALPART_EXPR
536      || TREE_CODE (ref) == IMAGPART_EXPR
537      || (TREE_CODE (ref) == COMPONENT_REF
538          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
539    {
540      if (TREE_CODE (ref) == IMAGPART_EXPR)
541        *delta += int_size_in_bytes (TREE_TYPE (ref));
542      ref = TREE_OPERAND (ref, 0);
543    }
544
545  *ref_p = ref;
546
547  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
548    {
549      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
550      bit_offset = TREE_INT_CST_LOW (off);
551      gcc_assert (bit_offset % BITS_PER_UNIT == 0);
552
553      *delta += bit_offset / BITS_PER_UNIT;
554    }
555
556  *base = unshare_expr (ref);
557  ar_data.loop = loop;
558  ar_data.stmt = stmt;
559  ar_data.step = step;
560  ar_data.delta = delta;
561  return for_each_index (base, idx_analyze_ref, &ar_data);
562}
563
/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P is true.  Returns true
   if the reference was recorded, false otherwise.  */
567
568static bool
569gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
570			      tree ref, bool write_p, gimple stmt)
571{
572  tree base, step;
573  HOST_WIDE_INT delta;
574  struct mem_ref_group *agrp;
575
576  if (get_base_address (ref) == NULL)
577    return false;
578
579  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
580    return false;
  /* If analyze_ref left STEP at its default NULL_TREE, we can stop here.  */
582  if (step == NULL_TREE)
583    return false;
584
585  /* Stop if the address of BASE could not be taken.  */
586  if (may_be_nonaddressable_p (base))
587    return false;
588
  /* Limit non-constant step prefetching only to the innermost loops and
     only when the step is loop invariant in the entire loop nest.  */
  if (!cst_and_fits_in_hwi (step))
    {
      if (loop->inner != NULL)
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "Memory expression %p\n", (void *) ref);
              print_generic_expr (dump_file, ref, TDF_TREE);
              fprintf (dump_file, ":");
              dump_mem_details (dump_file, base, step, delta, write_p);
              fprintf (dump_file,
                       "Ignoring %p, non-constant step prefetching is "
                       "limited to innermost loops\n",
                       (void *) ref);
            }
          return false;
        }
      else
        {
          if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
            {
              if (dump_file && (dump_flags & TDF_DETAILS))
                {
                  fprintf (dump_file, "Memory expression %p\n", (void *) ref);
                  print_generic_expr (dump_file, ref, TDF_TREE);
                  fprintf (dump_file, ":");
                  dump_mem_details (dump_file, base, step, delta, write_p);
                  fprintf (dump_file,
                           "Not prefetching, ignoring %p due to "
                           "loop variant step\n",
                           (void *) ref);
                }
              return false;
            }
        }
    }
627
628  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
629     are integer constants.  */
630  agrp = find_or_create_group (refs, base, step);
631  record_ref (agrp, stmt, ref, delta, write_p);
632
633  return true;
634}
635
636/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
637   true if there are no other memory references inside the loop.  */
638
639static struct mem_ref_group *
640gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
641{
642  basic_block *body = get_loop_body_in_dom_order (loop);
643  basic_block bb;
644  unsigned i;
645  gimple_stmt_iterator bsi;
646  gimple stmt;
647  tree lhs, rhs;
648  struct mem_ref_group *refs = NULL;
649
650  *no_other_refs = true;
651  *ref_count = 0;
652
653  /* Scan the loop body in order, so that the former references precede the
654     later ones.  */
655  for (i = 0; i < loop->num_nodes; i++)
656    {
657      bb = body[i];
658      if (bb->loop_father != loop)
659	continue;
660
661      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
662	{
663	  stmt = gsi_stmt (bsi);
664
665	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
666	    {
667	      if (gimple_vuse (stmt)
668		  || (is_gimple_call (stmt)
669		      && !(gimple_call_flags (stmt) & ECF_CONST)))
670		*no_other_refs = false;
671	      continue;
672	    }
673
674	  lhs = gimple_assign_lhs (stmt);
675	  rhs = gimple_assign_rhs1 (stmt);
676
677	  if (REFERENCE_CLASS_P (rhs))
678	    {
679	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
680							    rhs, false, stmt);
681	    *ref_count += 1;
682	    }
683	  if (REFERENCE_CLASS_P (lhs))
684	    {
685	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
686							    lhs, true, stmt);
687	    *ref_count += 1;
688	    }
689	}
690    }
691  free (body);
692
693  return refs;
694}
695
/* Prune the prefetch candidate REF using self-reuse.  */
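
/* For example (purely illustrative, assuming no hardware sequential
   prefetch): with a constant step of 16 bytes and a 64-byte PREFETCH_BLOCK,
   four consecutive iterations of the loop touch the same cache line, so
   PREFETCH_MOD is set to 64 / 16 = 4 and only every fourth iteration needs
   a prefetch.  */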
697
698static void
699prune_ref_by_self_reuse (struct mem_ref *ref)
700{
701  HOST_WIDE_INT step;
702  bool backward;
703
  /* If the step size is non-constant, we cannot calculate prefetch_mod.  */
705  if (!cst_and_fits_in_hwi (ref->group->step))
706    return;
707
708  step = int_cst_value (ref->group->step);
709
710  backward = step < 0;
711
712  if (step == 0)
713    {
714      /* Prefetch references to invariant address just once.  */
715      ref->prefetch_before = 1;
716      return;
717    }
718
719  if (backward)
720    step = -step;
721
722  if (step > PREFETCH_BLOCK)
723    return;
724
725  if ((backward && HAVE_BACKWARD_PREFETCH)
726      || (!backward && HAVE_FORWARD_PREFETCH))
727    {
728      ref->prefetch_before = 1;
729      return;
730    }
731
732  ref->prefetch_mod = PREFETCH_BLOCK / step;
733}
734
735/* Divides X by BY, rounding down.  */
736
737static HOST_WIDE_INT
738ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
739{
740  gcc_assert (by > 0);
741
742  if (x >= 0)
743    return x / by;
744  else
745    return (x + by - 1) / by;
746}
747
748/* Given a CACHE_LINE_SIZE and two inductive memory references
749   with a common STEP greater than CACHE_LINE_SIZE and an address
750   difference DELTA, compute the probability that they will fall
751   in different cache lines.  Return true if the computed miss rate
752   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
753   number of distinct iterations after which the pattern repeats itself.
754   ALIGN_UNIT is the unit of alignment in bytes.  */
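
/* As a worked example (illustrative numbers, and a typical alignment unit
   that divides the line size): with a 64-byte cache line and DELTA = 32,
   the second address falls into a different line whenever the first one
   lies in the upper half of its line, i.e. for half of the possible
   alignments.  That is a miss rate of 500 per thousand, far above
   ACCEPTABLE_MISS_RATE, so the function returns false and both references
   remain prefetch candidates.  */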
755
756static bool
757is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
758		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
759		   unsigned HOST_WIDE_INT distinct_iters,
760		   int align_unit)
761{
762  unsigned align, iter;
763  int total_positions, miss_positions, max_allowed_miss_positions;
764  int address1, address2, cache_line1, cache_line2;
765
766  /* It always misses if delta is greater than or equal to the cache
767     line size.  */
768  if (delta >= (HOST_WIDE_INT) cache_line_size)
769    return false;
770
771  miss_positions = 0;
772  total_positions = (cache_line_size / align_unit) * distinct_iters;
773  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
774
775  /* Iterate through all possible alignments of the first
776     memory reference within its cache line.  */
777  for (align = 0; align < cache_line_size; align += align_unit)
778
779    /* Iterate through all distinct iterations.  */
780    for (iter = 0; iter < distinct_iters; iter++)
781      {
782	address1 = align + step * iter;
783	address2 = address1 + delta;
784	cache_line1 = address1 / cache_line_size;
785	cache_line2 = address2 / cache_line_size;
786	if (cache_line1 != cache_line2)
787	  {
788	    miss_positions += 1;
789            if (miss_positions > max_allowed_miss_positions)
790	      return false;
791          }
792      }
793  return true;
794}
795
796/* Prune the prefetch candidate REF using the reuse with BY.
797   If BY_IS_BEFORE is true, BY is before REF in the loop.  */
798
799static void
800prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
801			  bool by_is_before)
802{
803  HOST_WIDE_INT step;
804  bool backward;
805  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
806  HOST_WIDE_INT delta = delta_b - delta_r;
807  HOST_WIDE_INT hit_from;
808  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
809  HOST_WIDE_INT reduced_step;
810  unsigned HOST_WIDE_INT reduced_prefetch_block;
811  tree ref_type;
812  int align_unit;
813
  /* If the step is non-constant we cannot calculate prefetch_before.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;
818
819  step = int_cst_value (ref->group->step);
820
821  backward = step < 0;
822
823
824  if (delta == 0)
825    {
      /* If the references have the same address, only prefetch the
	 former.  */
828      if (by_is_before)
829	ref->prefetch_before = 0;
830
831      return;
832    }
833
834  if (!step)
835    {
836      /* If the reference addresses are invariant and fall into the
837	 same cache line, prefetch just the first one.  */
838      if (!by_is_before)
839	return;
840
841      if (ddown (ref->delta, PREFETCH_BLOCK)
842	  != ddown (by->delta, PREFETCH_BLOCK))
843	return;
844
845      ref->prefetch_before = 0;
846      return;
847    }
848
849  /* Only prune the reference that is behind in the array.  */
850  if (backward)
851    {
852      if (delta > 0)
853	return;
854
855      /* Transform the data so that we may assume that the accesses
856	 are forward.  */
857      delta = - delta;
858      step = -step;
859      delta_r = PREFETCH_BLOCK - 1 - delta_r;
860      delta_b = PREFETCH_BLOCK - 1 - delta_b;
861    }
862  else
863    {
864      if (delta < 0)
865	return;
866    }
867
  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which it occurs are from
     each other.  */
871
872  if (step <= PREFETCH_BLOCK)
873    {
874      /* The accesses are sure to meet.  Let us check when.  */
875      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
876      prefetch_before = (hit_from - delta_r + step - 1) / step;
877
878      /* Do not reduce prefetch_before if we meet beyond cache size.  */
879      if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
880        prefetch_before = PREFETCH_ALL;
881      if (prefetch_before < ref->prefetch_before)
882	ref->prefetch_before = prefetch_before;
883
884      return;
885    }
886
887  /* A more complicated case with step > prefetch_block.  First reduce
888     the ratio between the step and the cache line size to its simplest
889     terms.  The resulting denominator will then represent the number of
890     distinct iterations after which each address will go back to its
891     initial location within the cache line.  This computation assumes
892     that PREFETCH_BLOCK is a power of two.  */
893  prefetch_block = PREFETCH_BLOCK;
894  reduced_prefetch_block = prefetch_block;
895  reduced_step = step;
896  while ((reduced_step & 1) == 0
897	 && reduced_prefetch_block > 1)
898    {
899      reduced_step >>= 1;
900      reduced_prefetch_block >>= 1;
901    }
902
903  prefetch_before = delta / step;
904  delta %= step;
905  ref_type = TREE_TYPE (ref->mem);
906  align_unit = TYPE_ALIGN (ref_type) / 8;
907  if (is_miss_rate_acceptable (prefetch_block, step, delta,
908			       reduced_prefetch_block, align_unit))
909    {
910      /* Do not reduce prefetch_before if we meet beyond cache size.  */
911      if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
912        prefetch_before = PREFETCH_ALL;
913      if (prefetch_before < ref->prefetch_before)
914	ref->prefetch_before = prefetch_before;
915
916      return;
917    }
918
919  /* Try also the following iteration.  */
920  prefetch_before++;
921  delta = step - delta;
922  if (is_miss_rate_acceptable (prefetch_block, step, delta,
923			       reduced_prefetch_block, align_unit))
924    {
925      if (prefetch_before < ref->prefetch_before)
926	ref->prefetch_before = prefetch_before;
927
928      return;
929    }
930
  /* REF probably does not reuse BY.  */
932  return;
933}
934
935/* Prune the prefetch candidate REF using the reuses with other references
936   in REFS.  */
937
938static void
939prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
940{
941  struct mem_ref *prune_by;
942  bool before = true;
943
944  prune_ref_by_self_reuse (ref);
945
946  for (prune_by = refs; prune_by; prune_by = prune_by->next)
947    {
948      if (prune_by == ref)
949	{
950	  before = false;
951	  continue;
952	}
953
954      if (!WRITE_CAN_USE_READ_PREFETCH
955	  && ref->write_p
956	  && !prune_by->write_p)
957	continue;
958      if (!READ_CAN_USE_WRITE_PREFETCH
959	  && !ref->write_p
960	  && prune_by->write_p)
961	continue;
962
963      prune_ref_by_group_reuse (ref, prune_by, before);
964    }
965}
966
967/* Prune the prefetch candidates in GROUP using the reuse analysis.  */
968
969static void
970prune_group_by_reuse (struct mem_ref_group *group)
971{
972  struct mem_ref *ref_pruned;
973
974  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
975    {
976      prune_ref_by_reuse (ref_pruned, group->refs);
977
978      if (dump_file && (dump_flags & TDF_DETAILS))
979	{
980	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
981
982	  if (ref_pruned->prefetch_before == PREFETCH_ALL
983	      && ref_pruned->prefetch_mod == 1)
984	    fprintf (dump_file, " no restrictions");
985	  else if (ref_pruned->prefetch_before == 0)
986	    fprintf (dump_file, " do not prefetch");
987	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
988	    fprintf (dump_file, " prefetch once");
989	  else
990	    {
991	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
992		{
993		  fprintf (dump_file, " prefetch before ");
994		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
995			   ref_pruned->prefetch_before);
996		}
997	      if (ref_pruned->prefetch_mod != 1)
998		{
999		  fprintf (dump_file, " prefetch mod ");
1000		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
1001			   ref_pruned->prefetch_mod);
1002		}
1003	    }
1004	  fprintf (dump_file, "\n");
1005	}
1006    }
1007}
1008
1009/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
1010
1011static void
1012prune_by_reuse (struct mem_ref_group *groups)
1013{
1014  for (; groups; groups = groups->next)
1015    prune_group_by_reuse (groups);
1016}
1017
1018/* Returns true if we should issue prefetch for REF.  */
1019
1020static bool
1021should_issue_prefetch_p (struct mem_ref *ref)
1022{
  /* For now do not issue prefetches that are useful only in the first
     few iterations of the loop.  */
1025  if (ref->prefetch_before != PREFETCH_ALL)
1026    {
1027      if (dump_file && (dump_flags & TDF_DETAILS))
1028        fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
1029		 (void *) ref);
1030      return false;
1031    }
1032
1033  /* Do not prefetch nontemporal stores.  */
1034  if (ref->storent_p)
1035    {
1036      if (dump_file && (dump_flags & TDF_DETAILS))
1037        fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1038      return false;
1039    }
1040
1041  return true;
1042}
1043
/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */
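
/* As an illustrative example (SIMULTANEOUS_PREFETCHES = 6 is a made-up
   value): with AHEAD = 8 and UNROLL_FACTOR = 4, each prefetch instruction
   occupies (8 + 4 / 2) / 4 = 2 slots, and a reference with PREFETCH_MOD 1
   needs 4 prefetches per unrolled body, i.e. 8 slots.  With only 6 slots
   available, that reference is still issued (fewer than half of its
   prefetches would be lost), but it exhausts the budget for the remaining
   references.  */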
1049
1050static bool
1051schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1052		     unsigned ahead)
1053{
1054  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1055  unsigned slots_per_prefetch;
1056  struct mem_ref *ref;
1057  bool any = false;
1058
1059  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
1060  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1061
1062  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1063     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
1064     it will need a prefetch slot.  */
1065  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1066  if (dump_file && (dump_flags & TDF_DETAILS))
1067    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1068	     slots_per_prefetch);
1069
1070  /* For now we just take memory references one by one and issue
1071     prefetches for as many as possible.  The groups are sorted
1072     starting with the largest step, since the references with
1073     large step are more likely to cause many cache misses.  */
1074
1075  for (; groups; groups = groups->next)
1076    for (ref = groups->refs; ref; ref = ref->next)
1077      {
1078	if (!should_issue_prefetch_p (ref))
1079	  continue;
1080
        /* The loop is far from being sufficiently unrolled for this
           prefetch.  Do not generate the prefetch to avoid many redundant
           prefetches.  */
1084        if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1085          continue;
1086
1087	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
1088	   and we unroll the loop UNROLL_FACTOR times, we need to insert
1089	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1090	   iteration.  */
1091	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1092			/ ref->prefetch_mod);
1093	prefetch_slots = n_prefetches * slots_per_prefetch;
1094
1095	/* If more than half of the prefetches would be lost anyway, do not
1096	   issue the prefetch.  */
1097	if (2 * remaining_prefetch_slots < prefetch_slots)
1098	  continue;
1099
1100	ref->issue_prefetch_p = true;
1101
1102	if (remaining_prefetch_slots <= prefetch_slots)
1103	  return true;
1104	remaining_prefetch_slots -= prefetch_slots;
1105	any = true;
1106      }
1107
1108  return any;
1109}
1110
1111/* Return TRUE if no prefetch is going to be generated in the given
1112   GROUPS.  */
1113
1114static bool
1115nothing_to_prefetch_p (struct mem_ref_group *groups)
1116{
1117  struct mem_ref *ref;
1118
1119  for (; groups; groups = groups->next)
1120    for (ref = groups->refs; ref; ref = ref->next)
1121      if (should_issue_prefetch_p (ref))
1122	return false;
1123
1124  return true;
1125}
1126
1127/* Estimate the number of prefetches in the given GROUPS.
1128   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1129
1130static int
1131estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1132{
1133  struct mem_ref *ref;
1134  unsigned n_prefetches;
1135  int prefetch_count = 0;
1136
1137  for (; groups; groups = groups->next)
1138    for (ref = groups->refs; ref; ref = ref->next)
1139      if (should_issue_prefetch_p (ref))
1140	{
1141	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1142			  / ref->prefetch_mod);
1143	  prefetch_count += n_prefetches;
1144	}
1145
1146  return prefetch_count;
1147}
1148
/* Issue prefetches for the reference REF into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */
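
/* As an illustration (the array name and numbers are only an example): for
   a reference A[i] with 4-byte elements, PREFETCH_MOD 1, UNROLL_FACTOR 4
   and AHEAD 8, the code below emits the equivalent of

     __builtin_prefetch (&A[i + 8], write_p, 3);
     __builtin_prefetch (&A[i + 9], write_p, 3);
     __builtin_prefetch (&A[i + 10], write_p, 3);
     __builtin_prefetch (&A[i + 11], write_p, 3);

   i.e. one prefetch per unrolled copy of the reference.  */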
1152
1153static void
1154issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1155{
1156  HOST_WIDE_INT delta;
1157  tree addr, addr_base, write_p, local, forward;
1158  gcall *prefetch;
1159  gimple_stmt_iterator bsi;
1160  unsigned n_prefetches, ap;
1161  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1162
1163  if (dump_file && (dump_flags & TDF_DETAILS))
1164    fprintf (dump_file, "Issued%s prefetch for %p.\n",
1165	     nontemporal ? " nontemporal" : "",
1166	     (void *) ref);
1167
1168  bsi = gsi_for_stmt (ref->stmt);
1169
1170  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1171		  / ref->prefetch_mod);
1172  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1173  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1174					true, NULL, true, GSI_SAME_STMT);
1175  write_p = ref->write_p ? integer_one_node : integer_zero_node;
1176  local = nontemporal ? integer_zero_node : integer_three_node;
1177
1178  for (ap = 0; ap < n_prefetches; ap++)
1179    {
1180      if (cst_and_fits_in_hwi (ref->group->step))
1181        {
1182          /* Determine the address to prefetch.  */
1183          delta = (ahead + ap * ref->prefetch_mod) *
1184		   int_cst_value (ref->group->step);
1185          addr = fold_build_pointer_plus_hwi (addr_base, delta);
1186          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1187                                           true, GSI_SAME_STMT);
1188        }
1189      else
1190        {
          /* The step size is non-constant but loop-invariant.  We use the
             heuristic of simply prefetching AHEAD iterations ahead.  */
1193          forward = fold_build2 (MULT_EXPR, sizetype,
1194                                 fold_convert (sizetype, ref->group->step),
1195                                 fold_convert (sizetype, size_int (ahead)));
1196          addr = fold_build_pointer_plus (addr_base, forward);
1197          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1198					   NULL, true, GSI_SAME_STMT);
1199      }
1200      /* Create the prefetch instruction.  */
1201      prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1202				    3, addr, write_p, local);
1203      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1204    }
1205}
1206
/* Issue prefetches for the references in GROUPS into loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */
1210
1211static void
1212issue_prefetches (struct mem_ref_group *groups,
1213		  unsigned unroll_factor, unsigned ahead)
1214{
1215  struct mem_ref *ref;
1216
1217  for (; groups; groups = groups->next)
1218    for (ref = groups->refs; ref; ref = ref->next)
1219      if (ref->issue_prefetch_p)
1220	issue_prefetch_ref (ref, unroll_factor, ahead);
1221}
1222
/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */
1225
1226static bool
1227nontemporal_store_p (struct mem_ref *ref)
1228{
1229  machine_mode mode;
1230  enum insn_code code;
1231
  /* REF must be a write that is not reused.  We require it to be independent
     of all other memory references in the loop, as the nontemporal stores may
     be reordered with respect to other memory references.  */
1235  if (!ref->write_p
1236      || !ref->independent_p
1237      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1238    return false;
1239
1240  /* Check that we have the storent instruction for the mode.  */
1241  mode = TYPE_MODE (TREE_TYPE (ref->mem));
1242  if (mode == BLKmode)
1243    return false;
1244
1245  code = optab_handler (storent_optab, mode);
1246  return code != CODE_FOR_nothing;
1247}
1248
1249/* If REF is a nontemporal store, we mark the corresponding modify statement
1250   and return true.  Otherwise, we return false.  */
1251
1252static bool
1253mark_nontemporal_store (struct mem_ref *ref)
1254{
1255  if (!nontemporal_store_p (ref))
1256    return false;
1257
1258  if (dump_file && (dump_flags & TDF_DETAILS))
1259    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1260	     (void *) ref);
1261
1262  gimple_assign_set_nontemporal_move (ref->stmt, true);
1263  ref->storent_p = true;
1264
1265  return true;
1266}
1267
1268/* Issue a memory fence instruction after LOOP.  */
1269
1270static void
1271emit_mfence_after_loop (struct loop *loop)
1272{
1273  vec<edge> exits = get_loop_exit_edges (loop);
1274  edge exit;
1275  gcall *call;
1276  gimple_stmt_iterator bsi;
1277  unsigned i;
1278
1279  FOR_EACH_VEC_ELT (exits, i, exit)
1280    {
1281      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1282
1283      if (!single_pred_p (exit->dest)
	  /* If possible, we prefer not to insert the fence on other paths
	     in the CFG.  */
1286	  && !(exit->flags & EDGE_ABNORMAL))
1287	split_loop_exit_edge (exit);
1288      bsi = gsi_after_labels (exit->dest);
1289
1290      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1291    }
1292
1293  exits.release ();
1294  update_ssa (TODO_update_ssa_only_virtuals);
1295}
1296
1297/* Returns true if we can use storent in loop, false otherwise.  */
1298
1299static bool
1300may_use_storent_in_loop_p (struct loop *loop)
1301{
1302  bool ret = true;
1303
1304  if (loop->inner != NULL)
1305    return false;
1306
  /* If we must issue an mfence insn after using storent, check that there
     is a suitable place for it at each of the loop exits.  */
1309  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1310    {
1311      vec<edge> exits = get_loop_exit_edges (loop);
1312      unsigned i;
1313      edge exit;
1314
1315      FOR_EACH_VEC_ELT (exits, i, exit)
1316	if ((exit->flags & EDGE_ABNORMAL)
1317	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1318	  ret = false;
1319
1320      exits.release ();
1321    }
1322
1323  return ret;
1324}
1325
1326/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1327   references in the loop.  */
1328
1329static void
1330mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1331{
1332  struct mem_ref *ref;
1333  bool any = false;
1334
1335  if (!may_use_storent_in_loop_p (loop))
1336    return;
1337
1338  for (; groups; groups = groups->next)
1339    for (ref = groups->refs; ref; ref = ref->next)
1340      any |= mark_nontemporal_store (ref);
1341
1342  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1343    emit_mfence_after_loop (loop);
1344}
1345
/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fills in DESC with the description of the number of
   iterations.  */
1349
1350static bool
1351should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1352		      unsigned factor)
1353{
1354  if (!can_unroll_loop_p (loop, factor, desc))
1355    return false;
1356
1357  /* We only consider loops without control flow for unrolling.  This is not
1358     a hard restriction -- tree_unroll_loop works with arbitrary loops
1359     as well; but the unrolling/prefetching is usually more profitable for
1360     loops consisting of a single basic block, and we want to limit the
1361     code growth.  */
1362  if (loop->num_nodes > 2)
1363    return false;
1364
1365  return true;
1366}
1367
/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  Description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */
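
/* For instance (illustrative values): if the loop contains references with
   PREFETCH_MOD 4 and PREFETCH_MOD 8 and the size and iteration-count bounds
   permit it, the factor chosen below is their least common multiple 8, so
   that every reference needs a whole number of prefetches per unrolled body
   (one for the mod-8 reference, two for the mod-4 one).  */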
1373
1374static unsigned
1375determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1376			 unsigned ninsns, struct tree_niter_desc *desc,
1377			 HOST_WIDE_INT est_niter)
1378{
1379  unsigned upper_bound;
1380  unsigned nfactor, factor, mod_constraint;
1381  struct mem_ref_group *agp;
1382  struct mem_ref *ref;
1383
1384  /* First check whether the loop is not too large to unroll.  We ignore
1385     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1386     from unrolling them enough to make exactly one cache line covered by each
1387     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1388     us from unrolling the loops too many times in cases where we only expect
1389     gains from better scheduling and decreasing loop overhead, which is not
1390     the case here.  */
1391  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1392
1393  /* If we unrolled the loop more times than it iterates, the unrolled version
1394     of the loop would be never entered.  */
1395  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1396    upper_bound = est_niter;
1397
1398  if (upper_bound <= 1)
1399    return 1;
1400
  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
1403  factor = 1;
1404  for (agp = refs; agp; agp = agp->next)
1405    for (ref = agp->refs; ref; ref = ref->next)
1406      if (should_issue_prefetch_p (ref))
1407	{
1408	  mod_constraint = ref->prefetch_mod;
1409	  nfactor = least_common_multiple (mod_constraint, factor);
1410	  if (nfactor <= upper_bound)
1411	    factor = nfactor;
1412	}
1413
1414  if (!should_unroll_loop_p (loop, desc, factor))
1415    return 1;
1416
1417  return factor;
1418}
1419
1420/* Returns the total volume of the memory references REFS, taking into account
1421   reuses in the innermost loop and cache line size.  TODO -- we should also
1422   take into account reuses across the iterations of the loops in the loop
1423   nest.  */
1424
1425static unsigned
1426volume_of_references (struct mem_ref_group *refs)
1427{
1428  unsigned volume = 0;
1429  struct mem_ref_group *gr;
1430  struct mem_ref *ref;
1431
1432  for (gr = refs; gr; gr = gr->next)
1433    for (ref = gr->refs; ref; ref = ref->next)
1434      {
1435	/* Almost always reuses another value?  */
1436	if (ref->prefetch_before != PREFETCH_ALL)
1437	  continue;
1438
1439	/* If several iterations access the same cache line, use the size of
1440	   the line divided by this number.  Otherwise, a cache line is
1441	   accessed in each iteration.  TODO -- in the latter case, we should
	   take the size of the reference into account, rounding it up to a
	   multiple of the cache line size.  */
1444	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1445      }
1446  return volume;
1447}
1448
1449/* Returns the volume of memory references accessed across VEC iterations of
1450   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1451   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1452
1453static unsigned
1454volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1455{
1456  unsigned i;
1457
1458  for (i = 0; i < n; i++)
1459    if (vec[i] != 0)
1460      break;
1461
1462  if (i == n)
1463    return 0;
1464
1465  gcc_assert (vec[i] > 0);
1466
1467  /* We ignore the parts of the distance vector in subloops, since usually
1468     the numbers of iterations are much smaller.  */
1469  return loop_sizes[i] * vec[i];
1470}
1471
/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */
1475
1476static void
1477add_subscript_strides (tree access_fn, unsigned stride,
1478		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1479{
1480  struct loop *aloop;
1481  tree step;
1482  HOST_WIDE_INT astep;
1483  unsigned min_depth = loop_depth (loop) - n;
1484
1485  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1486    {
1487      aloop = get_chrec_loop (access_fn);
1488      step = CHREC_RIGHT (access_fn);
1489      access_fn = CHREC_LEFT (access_fn);
1490
1491      if ((unsigned) loop_depth (aloop) <= min_depth)
1492	continue;
1493
1494      if (tree_fits_shwi_p (step))
1495	astep = tree_to_shwi (step);
1496      else
1497	astep = L1_CACHE_LINE_SIZE;
1498
1499      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1500
1501    }
1502}
1503
1504/* Returns the volume of memory references accessed between two consecutive
1505   self-reuses of the reference DR.  We consider the subscripts of DR in N
1506   loops, and LOOP_SIZES contains the volumes of accesses in each of the
1507   loops.  LOOP is the innermost loop of the current loop nest.  */
1508
1509static unsigned
1510self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1511		     struct loop *loop)
1512{
1513  tree stride, access_fn;
1514  HOST_WIDE_INT *strides, astride;
1515  vec<tree> access_fns;
1516  tree ref = DR_REF (dr);
1517  unsigned i, ret = ~0u;
1518
1519  /* In the following example:
1520
1521     for (i = 0; i < N; i++)
1522       for (j = 0; j < N; j++)
1523         use (a[j][i]);
1524     the same cache line is accessed each N steps (except if the change from
1525     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1526     we cannot rely purely on the results of the data dependence analysis.
1527
1528     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than cache size.  */
1530
1531  strides = XCNEWVEC (HOST_WIDE_INT, n);
1532  access_fns = DR_ACCESS_FNS (dr);
1533
1534  FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1535    {
1536      /* Keep track of the reference corresponding to the subscript, so that we
1537	 know its stride.  */
1538      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1539	ref = TREE_OPERAND (ref, 0);
1540
1541      if (TREE_CODE (ref) == ARRAY_REF)
1542	{
1543	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1544	  if (tree_fits_uhwi_p (stride))
1545	    astride = tree_to_uhwi (stride);
1546	  else
1547	    astride = L1_CACHE_LINE_SIZE;
1548
1549	  ref = TREE_OPERAND (ref, 0);
1550	}
1551      else
1552	astride = 1;
1553
1554      add_subscript_strides (access_fn, astride, strides, n, loop);
1555    }
1556
1557  for (i = n; i-- > 0; )
1558    {
1559      unsigned HOST_WIDE_INT s;
1560
1561      s = strides[i] < 0 ?  -strides[i] : strides[i];
1562
1563      if (s < (unsigned) L1_CACHE_LINE_SIZE
1564	  && (loop_sizes[i]
1565	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1566	{
1567	  ret = loop_sizes[i];
1568	  break;
1569	}
1570    }
1571
1572  free (strides);
1573  return ret;
1574}
1575
1576/* Determines the distance till the first reuse of each reference in REFS
1577   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1578   memory references in the loop.  Return false if the analysis fails.  */
1579
1580static bool
1581determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1582			   bool no_other_refs)
1583{
1584  struct loop *nest, *aloop;
1585  vec<data_reference_p> datarefs = vNULL;
1586  vec<ddr_p> dependences = vNULL;
1587  struct mem_ref_group *gr;
1588  struct mem_ref *ref, *refb;
1589  vec<loop_p> vloops = vNULL;
1590  unsigned *loop_data_size;
1591  unsigned i, j, n;
1592  unsigned volume, dist, adist;
1593  HOST_WIDE_INT vol;
1594  data_reference_p dr;
1595  ddr_p dep;
1596
1597  if (loop->inner)
1598    return true;
1599
1600  /* Find the outermost loop of the loop nest of loop (we require that
1601     there are no sibling loops inside the nest).  */
1602  nest = loop;
1603  while (1)
1604    {
1605      aloop = loop_outer (nest);
1606
1607      if (aloop == current_loops->tree_root
1608	  || aloop->inner->next)
1609	break;
1610
1611      nest = aloop;
1612    }
1613
1614  /* For each loop, determine the amount of data accessed in each iteration.
1615     We use this to estimate whether the reference is evicted from the
1616     cache before its reuse.  */
1617  find_loop_nest (nest, &vloops);
1618  n = vloops.length ();
1619  loop_data_size = XNEWVEC (unsigned, n);
1620  volume = volume_of_references (refs);
1621  i = n;
1622  while (i-- != 0)
1623    {
1624      loop_data_size[i] = volume;
1625      /* Bound the volume by the L2 cache size, since above this bound,
1626	 all dependence distances are equivalent.  */
1627      if (volume > L2_CACHE_SIZE_BYTES)
1628	continue;
1629
1630      aloop = vloops[i];
1631      vol = estimated_stmt_executions_int (aloop);
1632      if (vol == -1)
1633	vol = expected_loop_iterations (aloop);
1634      volume *= vol;
1635    }
1636
1637  /* Prepare the references in the form suitable for data dependence
1638     analysis.  We ignore unanalyzable data references (the results
     are used just as a heuristic to estimate temporality of the
1640     references, hence we do not need to worry about correctness).  */
1641  for (gr = refs; gr; gr = gr->next)
1642    for (ref = gr->refs; ref; ref = ref->next)
1643      {
1644	dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1645			      ref->mem, ref->stmt, !ref->write_p);
1646
1647	if (dr)
1648	  {
1649	    ref->reuse_distance = volume;
1650	    dr->aux = ref;
1651	    datarefs.safe_push (dr);
1652	  }
1653	else
1654	  no_other_refs = false;
1655      }
1656
1657  FOR_EACH_VEC_ELT (datarefs, i, dr)
1658    {
1659      dist = self_reuse_distance (dr, loop_data_size, n, loop);
1660      ref = (struct mem_ref *) dr->aux;
1661      if (ref->reuse_distance > dist)
1662	ref->reuse_distance = dist;
1663
1664      if (no_other_refs)
1665	ref->independent_p = true;
1666    }
1667
1668  if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1669    return false;
1670
1671  FOR_EACH_VEC_ELT (dependences, i, dep)
1672    {
1673      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1674	continue;
1675
1676      ref = (struct mem_ref *) DDR_A (dep)->aux;
1677      refb = (struct mem_ref *) DDR_B (dep)->aux;
1678
1679      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1680	  || DDR_NUM_DIST_VECTS (dep) == 0)
1681	{
1682	  /* If the dependence cannot be analyzed, assume that there might be
1683	     a reuse.  */
1684	  dist = 0;
1685
1686	  ref->independent_p = false;
1687	  refb->independent_p = false;
1688	}
1689      else
1690	{
1691	  /* The distance vectors are normalized to be always lexicographically
1692	     positive, hence we cannot tell just from them whether DDR_A comes
1693	     before DDR_B or vice versa.  However, it is not important,
1694	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
1695	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1696	     in cache (and marking it as nontemporal would not affect
1697	     anything).  */
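	  /* E.g. (illustrative): for accesses a[i] and a[i + 5] made in the
	     same loop, the normalized vectors do not tell us which statement
	     touches a given location first, but the estimated reuse distance
	     is the same either way, and that is all we use below.  */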
1698
1699	  dist = volume;
1700	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1701	    {
1702	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1703					     loop_data_size, n);
1704
1705	      /* If this is a dependence in the innermost loop (i.e., the
1706		 distances in all superloops are zero) and it is not
1707		 the trivial self-dependence with distance zero, record that
1708		 the references are not completely independent.  */
1709	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1710		  && (ref != refb
1711		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1712		{
1713		  ref->independent_p = false;
1714		  refb->independent_p = false;
1715		}
1716
1717	      /* Ignore accesses closer than
1718		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1719		 so that we use nontemporal prefetches e.g. if a single memory
1720		 location is accessed several times in a single iteration of
1721		 the loop.  */
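	      /* For example (illustrative values only): with a 32 kB L1
		 cache and NONTEMPORAL_FRACTION of 16, reuses occurring
		 within about 2 kB worth of accessed data are skipped here,
		 so such short-range reuse alone does not keep the reference
		 from being treated as nontemporal.  */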
1722	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1723		continue;
1724
1725	      if (adist < dist)
1726		dist = adist;
1727	    }
1728	}
1729
1730      if (ref->reuse_distance > dist)
1731	ref->reuse_distance = dist;
1732      if (refb->reuse_distance > dist)
1733	refb->reuse_distance = dist;
1734    }
1735
1736  free_dependence_relations (dependences);
1737  free_data_refs (datarefs);
1738  free (loop_data_size);
1739
1740  if (dump_file && (dump_flags & TDF_DETAILS))
1741    {
1742      fprintf (dump_file, "Reuse distances:\n");
1743      for (gr = refs; gr; gr = gr->next)
1744	for (ref = gr->refs; ref; ref = ref->next)
1745	  fprintf (dump_file, " ref %p distance %u\n",
1746		   (void *) ref, ref->reuse_distance);
1747    }
1748
1749  return true;
1750}
1751
1752/* Determine whether or not the trip count to ahead ratio is too small based
1753   on profitability considerations.
1754   AHEAD: the iteration ahead distance,
1755   EST_NITER: the estimated trip count.  */
1756
1757static bool
1758trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1759{
1760  /* Assume the trip count to ahead ratio is big enough if the trip count
1761     could not be estimated at compile time.  */
1762  if (est_niter < 0)
1763    return false;
1764
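  /* E.g. (purely illustrative numbers): with AHEAD of 4 and
     TRIP_COUNT_TO_AHEAD_RATIO of 4, a loop estimated to roll fewer than
     16 times is rejected below as too short to benefit from prefetching.  */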
1765  if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1766    {
1767      if (dump_file && (dump_flags & TDF_DETAILS))
1768	fprintf (dump_file,
1769		 "Not prefetching -- loop estimated to roll only %d times\n",
1770		 (int) est_niter);
1771      return true;
1772    }
1773
1774  return false;
1775}
1776
1777/* Determine whether or not the number of memory references in the loop is
1778   reasonable based on profitability and compilation time considerations.
1779   NINSNS: estimated number of instructions in the loop,
1780   MEM_REF_COUNT: total number of memory references in the loop.  */
1781
1782static bool
1783mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1784{
1785  int insn_to_mem_ratio;
1786
1787  if (mem_ref_count == 0)
1788    return false;
1789
1790  /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1791     (compute_all_dependences) have high costs based on quadratic complexity.
1792     To avoid huge compilation time, we give up prefetching if mem_ref_count
1793     is too large.  */
1794  if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1795    return false;
1796
1797  /* Prefetching improves performance by overlapping memory accesses
1798     that miss the cache with CPU operations.  If the loop does not have
1799     enough CPU operations to overlap with memory operations, prefetching
1800     won't give a significant benefit.  One approximate way of checking
1801     this is to require the ratio of instructions to memory references to
1802     be above a certain limit.  This approximation works well in practice.
1803     TODO: Implement a more precise computation by estimating the time
1804     for each CPU or memory op in the loop.  Time estimates for memory ops
1805     should account for cache misses.  */
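  /* A sketch with made-up numbers: 40 insns and 20 memory references give
     a ratio of 2; if PREFETCH_MIN_INSN_TO_MEM_RATIO were 3, we would give
     up here, as there is too little computation to hide the memory latency
     behind.  */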
1806  insn_to_mem_ratio = ninsns / mem_ref_count;
1807
1808  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1809    {
1810      if (dump_file && (dump_flags & TDF_DETAILS))
1811        fprintf (dump_file,
1812		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1813		 insn_to_mem_ratio);
1814      return false;
1815    }
1816
1817  return true;
1818}
1819
1820/* Determine whether or not the instruction to prefetch ratio in the loop is
1821   too small based on profitability considerations.
1822   NINSNS: estimated number of instructions in the loop,
1823   PREFETCH_COUNT: an estimate of the number of prefetches,
1824   UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
1825
1826static bool
1827insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1828                                     unsigned unroll_factor)
1829{
1830  int insn_to_prefetch_ratio;
1831
1832  /* Prefetching most likely causes performance degradation when the instruction
1833     to prefetch ratio is too small.  Too many prefetch instructions in a loop
1834     may reduce the I-cache performance.
1835     (unroll_factor * ninsns) is used to estimate the number of instructions in
1836     the unrolled loop.  This implementation is a bit simplistic -- the number
1837     of issued prefetch instructions is also affected by unrolling.  So,
1838     prefetch_mod and the unroll factor should be taken into account when
1839     determining prefetch_count.  Also, the number of insns of the unrolled
1840     loop will usually be significantly smaller than the number of insns of the
1841     original loop * unroll_factor (at least the induction variable increments
1842     and the exit branches will get eliminated), so it might be better to use
1843     tree_estimate_loop_size + estimated_unrolled_size.  */
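  /* Illustrative numbers only: with UNROLL_FACTOR 4, NINSNS 10 and
     PREFETCH_COUNT 5, the estimate below is (4 * 10) / 5 = 8, which is
     then compared against MIN_INSN_TO_PREFETCH_RATIO.  */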
1844  insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1845  if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1846    {
1847      if (dump_file && (dump_flags & TDF_DETAILS))
1848        fprintf (dump_file,
1849		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1850		 insn_to_prefetch_ratio);
1851      return true;
1852    }
1853
1854  return false;
1855}
1856
1857
1858/* Issue prefetch instructions for array references in LOOP.  Returns
1859   true if the LOOP was unrolled.  */
1860
1861static bool
1862loop_prefetch_arrays (struct loop *loop)
1863{
1864  struct mem_ref_group *refs;
1865  unsigned ahead, ninsns, time, unroll_factor;
1866  HOST_WIDE_INT est_niter;
1867  struct tree_niter_desc desc;
1868  bool unrolled = false, no_other_refs;
1869  unsigned prefetch_count;
1870  unsigned mem_ref_count;
1871
1872  if (optimize_loop_nest_for_size_p (loop))
1873    {
1874      if (dump_file && (dump_flags & TDF_DETAILS))
1875	fprintf (dump_file, "  ignored (cold area)\n");
1876      return false;
1877    }
1878
1879  /* FIXME: the time should be weighted by the probabilities of the blocks in
1880     the loop body.  */
1881  time = tree_num_loop_insns (loop, &eni_time_weights);
1882  if (time == 0)
1883    return false;
1884
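  /* AHEAD is the number of iterations a prefetch has to be issued in
     advance for the data to arrive in time, computed as a ceiling
     division -- e.g. (illustrative numbers) PREFETCH_LATENCY 200 and a
     loop body costing 60 time units give ahead = (200 + 60 - 1) / 60 = 4.  */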
1885  ahead = (PREFETCH_LATENCY + time - 1) / time;
1886  est_niter = estimated_stmt_executions_int (loop);
1887  if (est_niter == -1)
1888    est_niter = max_stmt_executions_int (loop);
1889
1890  /* Prefetching is not likely to be profitable if the trip count to ahead
1891     ratio is too small.  */
1892  if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1893    return false;
1894
1895  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1896
1897  /* Step 1: gather the memory references.  */
1898  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1899
1900  /* Give up prefetching if the number of memory references in the
1901     loop is not reasonable based on profitability and compilation time
1902     considerations.  */
1903  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1904    goto fail;
1905
1906  /* Step 2: estimate the reuse effects.  */
1907  prune_by_reuse (refs);
1908
1909  if (nothing_to_prefetch_p (refs))
1910    goto fail;
1911
1912  if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1913    goto fail;
1914
1915  /* Step 3: determine unroll factor.  */
1916  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1917					   est_niter);
1918
1919  /* Estimate prefetch count for the unrolled loop.  */
1920  prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1921  if (prefetch_count == 0)
1922    goto fail;
1923
1924  if (dump_file && (dump_flags & TDF_DETAILS))
1925    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1926	     HOST_WIDE_INT_PRINT_DEC "\n"
1927	     "insn count %d, mem ref count %d, prefetch count %d\n",
1928	     ahead, unroll_factor, est_niter,
1929	     ninsns, mem_ref_count, prefetch_count);
1930
1931  /* Prefetching is not likely to be profitable if the instruction to prefetch
1932     ratio is too small.  */
1933  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1934					  unroll_factor))
1935    goto fail;
1936
1937  mark_nontemporal_stores (loop, refs);
1938
1939  /* Step 4: what to prefetch?  */
1940  if (!schedule_prefetches (refs, unroll_factor, ahead))
1941    goto fail;
1942
1943  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
1944     iterations so that we do not issue superfluous prefetches.  */
1945  if (unroll_factor != 1)
1946    {
1947      tree_unroll_loop (loop, unroll_factor,
1948			single_dom_exit (loop), &desc);
1949      unrolled = true;
1950    }
1951
1952  /* Step 6: issue the prefetches.  */
1953  issue_prefetches (refs, unroll_factor, ahead);
1954
1955fail:
1956  release_mem_refs (refs);
1957  return unrolled;
1958}
1959
1960/* Issue prefetch instructions for array references in loops.  */
1961
1962unsigned int
1963tree_ssa_prefetch_arrays (void)
1964{
1965  struct loop *loop;
1966  bool unrolled = false;
1967  int todo_flags = 0;
1968
1969  if (!HAVE_prefetch
1970      /* It is possible to ask the compiler for, say, -mtune=i486
1971	 -march=pentium4.  -mtune=i486 gives us PREFETCH_BLOCK 0, since this
1972	 is part of the processor costs and i486 does not have prefetch, but
1973	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1974      || PREFETCH_BLOCK == 0)
1975    return 0;
1976
1977  if (dump_file && (dump_flags & TDF_DETAILS))
1978    {
1979      fprintf (dump_file, "Prefetching parameters:\n");
1980      fprintf (dump_file, "    simultaneous prefetches: %d\n",
1981	       SIMULTANEOUS_PREFETCHES);
1982      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1983      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1984      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1985	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1986      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1987      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1988      fprintf (dump_file, "    min insn-to-prefetch ratio: %d\n",
1989	       MIN_INSN_TO_PREFETCH_RATIO);
1990      fprintf (dump_file, "    min insn-to-mem ratio: %d\n",
1991	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
1992      fprintf (dump_file, "\n");
1993    }
1994
1995  initialize_original_copy_tables ();
1996
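  /* Make sure a declaration of __builtin_prefetch is available; some
     front ends do not register this builtin, and the prefetch insertion
     done later relies on it.  */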
1997  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1998    {
1999      tree type = build_function_type_list (void_type_node,
2000					    const_ptr_type_node, NULL_TREE);
2001      tree decl = add_builtin_function ("__builtin_prefetch", type,
2002					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
2003					NULL, NULL_TREE);
2004      DECL_IS_NOVOPS (decl) = true;
2005      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2006    }
2007
2008  /* We assume that the cache line size is a power of two, so verify this
2009     here.  */
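  /* E.g. 64 & 63 == 0 passes, whereas a non-power-of-two such as 48 gives
     48 & 47 == 32, which is nonzero and would trip the assertion below.  */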
2010  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
2011
2012  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
2013    {
2014      if (dump_file && (dump_flags & TDF_DETAILS))
2015	fprintf (dump_file, "Processing loop %d:\n", loop->num);
2016
2017      unrolled |= loop_prefetch_arrays (loop);
2018
2019      if (dump_file && (dump_flags & TDF_DETAILS))
2020	fprintf (dump_file, "\n\n");
2021    }
2022
2023  if (unrolled)
2024    {
2025      scev_reset ();
2026      todo_flags |= TODO_cleanup_cfg;
2027    }
2028
2029  free_original_copy_tables ();
2030  return todo_flags;
2031}
2032
2033/* Prefetching.  */
2034
2035namespace {
2036
2037const pass_data pass_data_loop_prefetch =
2038{
2039  GIMPLE_PASS, /* type */
2040  "aprefetch", /* name */
2041  OPTGROUP_LOOP, /* optinfo_flags */
2042  TV_TREE_PREFETCH, /* tv_id */
2043  ( PROP_cfg | PROP_ssa ), /* properties_required */
2044  0, /* properties_provided */
2045  0, /* properties_destroyed */
2046  0, /* todo_flags_start */
2047  0, /* todo_flags_finish */
2048};
2049
2050class pass_loop_prefetch : public gimple_opt_pass
2051{
2052public:
2053  pass_loop_prefetch (gcc::context *ctxt)
2054    : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2055  {}
2056
2057  /* opt_pass methods: */
2058  virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2059  virtual unsigned int execute (function *);
2060
2061}; // class pass_loop_prefetch
2062
2063unsigned int
2064pass_loop_prefetch::execute (function *fun)
2065{
2066  if (number_of_loops (fun) <= 1)
2067    return 0;
2068
2069  return tree_ssa_prefetch_arrays ();
2070}
2071
2072} // anon namespace
2073
2074gimple_opt_pass *
2075make_pass_loop_prefetch (gcc::context *ctxt)
2076{
2077  return new pass_loop_prefetch (ctxt);
2078}
2079
2080
2081