/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in a single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do this, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming the cache line size is 64 bytes, char has size 1 byte and
      there is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
          location 64 iterations before it, and PREFETCH_MOD 64 (since
          it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
          the cache line accessed by (4) is the same as the one accessed
          by (5) with probability only 7/32.
      (5) has PREFETCH_MOD 1 as well.

   3) We determine how far ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with the lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODOs:
      -- write and use more general reuse analysis (that could also be used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
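
/* To make step 3) above concrete, a minimal worked example (the numbers
   are illustrative, using the default PREFETCH_LATENCY of 200 defined
   below): for a hypothetical loop body of 40 insns we would prefetch

     ahead = ceil (200 / 40) = 5

   iterations ahead; the actual computation is in loop_prefetch_arrays.  */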

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  HOST_WIDE_INT step;           /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;                    /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  bool write_p;                 /* Is it a write?  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only first PREFETCH_BEFORE
                                   iterations.  */
  bool issue_prefetch_p;        /* Should we really issue the prefetch?  */
  struct mem_ref *next;         /* The next reference in the group.  */
};
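
/* As an illustration (schematic, not used by the code): for the example
   loop at the head of the file, gather_memory_references below would
   collect one group per (base, step) pair, kept sorted by decreasing
   step, roughly:

     step 187: deltas 0 and 50   (refs (4) and (5))
     step 16:  delta 0           (ref (3))
     step 1:   deltas 0 and 64   (refs (1) and (2))
     step 0:   delta 255         (ref (0))

   schedule_prefetches later relies on this sort order.  */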

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
                      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
          && operand_equal_p ((*groups)->base, base, 0))
        return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
        break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
            HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It need not be possible for a write reference to reuse a read
         prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
          && write_p
          && !(*aref)->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !write_p
          && (*aref)->write_p)
        continue;

      if ((*aref)->delta == delta)
        return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
        {
          next_r = ref->next;
          free (ref);
        }
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;            /* Loop of the reference.  */
  tree stmt;                    /* Statement of the reference.  */
  HOST_WIDE_INT *step;          /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;         /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
        return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
        return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA
   and STEP are integer constants and iter is the number of iterations of
   LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
             HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
             tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
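
/* An illustrative decomposition (not code used by the pass): for
   `int a[N];' and the reference a[i + 5] in a loop whose induction
   variable i starts at 0 and increments by 1, idx_analyze_ref finds the
   index iv with base 5 and step 1 and scales both by the 4-byte element
   size, so analyze_ref expresses the address of a[i + 5] as

     &a[0] + 4 * iter + 20,

   i.e. BASE = a[0], STEP = 4 and DELTA = 20.  */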

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
                              tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that earlier references precede
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
        continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
        {
          stmt = bsi_stmt (bsi);
          if (TREE_CODE (stmt) != MODIFY_EXPR)
            continue;

          lhs = TREE_OPERAND (stmt, 0);
          rhs = TREE_OPERAND (stmt, 1);

          if (REFERENCE_CLASS_P (rhs))
            gather_memory_references_ref (loop, &refs, rhs, false, stmt);
          if (REFERENCE_CLASS_P (lhs))
            gather_memory_references_ref (loop, &refs, lhs, true, stmt);
        }
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to an invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
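
/* A worked self-reuse example (illustrative only): with the default
   32-byte PREFETCH_BLOCK and no hardware sequential prefetch, a reference
   with step 4 makes eight accesses to each cache line, so the code above
   sets prefetch_mod = 32 / 4 = 8 and we prefetch on every eighth
   iteration only.  A reference with step 64 (> PREFETCH_BLOCK) touches a
   new line every iteration and is left unrestricted.  */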

/* Divides X by BY, rounding down (towards negative infinity).  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / (HOST_WIDE_INT) by;
  else
    /* For negative X, round the magnitude up so that the result is
       floor (X / BY); the negation is done in unsigned arithmetic to
       avoid overflow on the most negative value.  */
    return -(HOST_WIDE_INT) ((-(unsigned HOST_WIDE_INT) x + by - 1) / by);
}
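
/* Illustrative values: ddown (7, 4) == 1, ddown (4, 4) == 1,
   ddown (-1, 4) == -1 and ddown (-5, 4) == -2; i.e. the result is
   floor (x / by), which is what the cache line computations below
   rely on for negative deltas.  */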

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
                          bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
         former.  */
      if (by_is_before)
        ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
         same cache line, prefetch just the first one.  */
      if (!by_is_before)
        return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
          != ddown (by->delta, PREFETCH_BLOCK))
        return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
        return;

      /* Transform the data so that we may assume that the accesses
         are forward.  */
      delta = -delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
        return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which that occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that the size of the
     cache line and the step are coprime (here we assume that
     PREFETCH_BLOCK is a power of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
         && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}
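
/* Continuing the example from the head of the file (illustrative only,
   with a 64-byte PREFETCH_BLOCK): pruning ref (1) a[i] by (2) a[i + 64]
   takes the step <= PREFETCH_BLOCK path above with step 1, delta_r 0 and
   delta_b 64, so hit_from = ddown (64, 64) * 64 = 64 and

     prefetch_before = (64 - 0 + 1 - 1) / 1 = 64,

   matching the PREFETCH_BEFORE 64 claimed there.  */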

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
        {
          before = false;
          continue;
        }

      if (!WRITE_CAN_USE_READ_PREFETCH
          && ref->write_p
          && !prune_by->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !ref->write_p
          && prune_by->write_p)
        continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

          if (ref_pruned->prefetch_before == PREFETCH_ALL
              && ref_pruned->prefetch_mod == 1)
            fprintf (dump_file, " no restrictions");
          else if (ref_pruned->prefetch_before == 0)
            fprintf (dump_file, " do not prefetch");
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
            fprintf (dump_file, " prefetch once");
          else
            {
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
                {
                  fprintf (dump_file, " prefetch before ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_before);
                }
              if (ref_pruned->prefetch_mod != 1)
                {
                  fprintf (dump_file, " prefetch mod ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_mod);
                }
            }
          fprintf (dump_file, "\n");
        }
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches that would only be useful in the
     first few iterations of the loop.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
                     unsigned ahead)
{
  unsigned max_prefetches, n_prefetches;
  struct mem_ref *ref;
  bool any = false;

  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
    max_prefetches = SIMULTANEOUS_PREFETCHES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);

  if (!max_prefetches)
    return false;

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large steps are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
        if (!should_issue_prefetch_p (ref))
          continue;

        ref->issue_prefetch_p = true;

        /* If prefetch_mod is less than unroll_factor, we need to insert
           several prefetches for the reference.  */
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                        / ref->prefetch_mod);
        if (max_prefetches <= n_prefetches)
          return true;

        max_prefetches -= n_prefetches;
        any = true;
      }

  return any;
}
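
/* Budget arithmetic for the code above (illustrative numbers): with the
   default SIMULTANEOUS_PREFETCHES of 3, unroll_factor 4 and ahead 2, we
   get max_prefetches = min ((3 * 4) / 2, 3) = 3.  A reference with
   prefetch_mod 1 then costs ceil (4 / 1) = 4 prefetches per unrolled
   body, exhausting the budget after the first scheduled reference,
   whereas references with prefetch_mod 4 cost one slot each.  */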

/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        return true;

  return false;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base),
                                        true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
                          addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
                          tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
                                           params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}
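
/* At the source level, the effect of the code above is roughly the
   following (illustrative; the actual statements are GIMPLE).  For a read
   of a[i] with a 4-byte step, ahead 8, unroll_factor 1 and prefetch_mod 1,
   one call is inserted before the reference:

     __builtin_prefetch (&a[i] + 32, 0);

   i.e. the address eight iterations (8 * 4 bytes) ahead, with the second
   argument 0 meaning a read prefetch.  */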

/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
                  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
        issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fills in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
                      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of
   the number of iterations of LOOP is stored to DESC.  AHEAD is the number
   of iterations ahead that we need to prefetch.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
                         unsigned ahead, unsigned ninsns,
                         struct tree_niter_desc *desc)
{
  unsigned upper_bound, size_factor, constraint_factor;
  unsigned factor, max_mod_constraint, ahead_factor;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* First check whether the loop is not too large to unroll.  */
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (size_factor <= 1)
    return 1;

  if (size_factor < upper_bound)
    upper_bound = size_factor;

  max_mod_constraint = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref)
          && ref->prefetch_mod > max_mod_constraint)
        max_mod_constraint = ref->prefetch_mod;

  /* Set constraint_factor as large as needed to be able to satisfy the
     largest modulo constraint.  */
  constraint_factor = max_mod_constraint;

  /* If ahead is too large in comparison with the number of available
     prefetches, unroll the loop as much as needed to be able to prefetch
     at least partially some of the references in the loop.  */
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
                  / SIMULTANEOUS_PREFETCHES);

  /* Unroll as much as useful, but bound the code size growth.  */
  if (constraint_factor < ahead_factor)
    factor = ahead_factor;
  else
    factor = constraint_factor;
  if (factor > upper_bound)
    factor = upper_bound;

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
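
/* A worked instance of the above (all numbers illustrative): suppose
   ninsns = 20 and PARAM_MAX_UNROLLED_INSNS = 120, so size_factor = 6.
   With a largest prefetch_mod of 8 among the scheduled references,
   constraint_factor = 8; with ahead = 4 and SIMULTANEOUS_PREFETCHES = 3,
   ahead_factor = ceil (4 / 3) = 2.  The resulting factor is
   min (max (8, 2), 6) = 6, assuming PARAM_MAX_UNROLL_TIMES is at least 6
   and should_unroll_loop_p agrees.  */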

/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the array containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
                                           &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}
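
/* Schematically (illustrative C rather than the GIMPLE actually produced),
   for a simple copy loop with ahead 4, unroll_factor 1 and prefetch_mod 1
   for both references, the pass transforms

     for (i = 0; i < n; i++)
       b[i] = a[i];

   into approximately

     for (i = 0; i < n; i++)
       {
         __builtin_prefetch (&b[i + 4], 1);
         __builtin_prefetch (&a[i + 4], 0);
         b[i] = a[i];
       }

   possibly unrolling the loop first when the modulo or ahead constraints
   require it.  */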
Description of 896169689Skan umber of iterations of LOOP is stored to DESC. AHEAD is the number 897169689Skan of iterations ahead that we need to prefetch. NINSNS is number of 898169689Skan insns of the LOOP. */ 899169689Skan 900169689Skanstatic unsigned 901169689Skandetermine_unroll_factor (struct loop *loop, struct mem_ref_group *refs, 902169689Skan unsigned ahead, unsigned ninsns, 903169689Skan struct tree_niter_desc *desc) 904169689Skan{ 905169689Skan unsigned upper_bound, size_factor, constraint_factor; 906169689Skan unsigned factor, max_mod_constraint, ahead_factor; 907169689Skan struct mem_ref_group *agp; 908169689Skan struct mem_ref *ref; 909169689Skan 910169689Skan upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); 911169689Skan 912169689Skan /* First check whether the loop is not too large to unroll. */ 913169689Skan size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns; 914169689Skan if (size_factor <= 1) 915169689Skan return 1; 916169689Skan 917169689Skan if (size_factor < upper_bound) 918169689Skan upper_bound = size_factor; 919169689Skan 920169689Skan max_mod_constraint = 1; 921169689Skan for (agp = refs; agp; agp = agp->next) 922169689Skan for (ref = agp->refs; ref; ref = ref->next) 923169689Skan if (should_issue_prefetch_p (ref) 924169689Skan && ref->prefetch_mod > max_mod_constraint) 925169689Skan max_mod_constraint = ref->prefetch_mod; 926169689Skan 927169689Skan /* Set constraint_factor as large as needed to be able to satisfy the 928169689Skan largest modulo constraint. */ 929169689Skan constraint_factor = max_mod_constraint; 930169689Skan 931169689Skan /* If ahead is too large in comparison with the number of available 932169689Skan prefetches, unroll the loop as much as needed to be able to prefetch 933169689Skan at least partially some of the references in the loop. */ 934169689Skan ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1) 935169689Skan / SIMULTANEOUS_PREFETCHES); 936169689Skan 937169689Skan /* Unroll as much as useful, but bound the code size growth. */ 938169689Skan if (constraint_factor < ahead_factor) 939169689Skan factor = ahead_factor; 940169689Skan else 941169689Skan factor = constraint_factor; 942169689Skan if (factor > upper_bound) 943169689Skan factor = upper_bound; 944169689Skan 945169689Skan if (!should_unroll_loop_p (loop, desc, factor)) 946169689Skan return 1; 947169689Skan 948169689Skan return factor; 949169689Skan} 950169689Skan 951169689Skan/* Issue prefetch instructions for array references in LOOP. Returns 952169689Skan true if the LOOP was unrolled. LOOPS is the array containing all 953169689Skan loops. */ 954169689Skan 955169689Skanstatic bool 956169689Skanloop_prefetch_arrays (struct loops *loops, struct loop *loop) 957169689Skan{ 958169689Skan struct mem_ref_group *refs; 959169689Skan unsigned ahead, ninsns, unroll_factor; 960169689Skan struct tree_niter_desc desc; 961169689Skan bool unrolled = false; 962169689Skan 963169689Skan /* Step 1: gather the memory references. */ 964169689Skan refs = gather_memory_references (loop); 965169689Skan 966169689Skan /* Step 2: estimate the reuse effects. */ 967169689Skan prune_by_reuse (refs); 968169689Skan 969169689Skan if (!anything_to_prefetch_p (refs)) 970169689Skan goto fail; 971169689Skan 972169689Skan /* Step 3: determine the ahead and unroll factor. */ 973169689Skan 974169689Skan /* FIXME: We should use not size of the loop, but the average number of 975169689Skan instructions executed per iteration of the loop. 

  /* We assume that the cache line size is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];
      if (!loop)
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loops, loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}
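
/* Usage note: this pass runs only when prefetching is enabled (the
   -fprefetch-loop-arrays option) on a target that provides a prefetch
   instruction.  Its decisions can be inspected via the tree dump for
   this pass (e.g. -fdump-tree-aprefetch-details, assuming the pass is
   registered under the name "aprefetch"), which prints the gathered
   references, the pruning results and the chosen ahead/unroll factors.  */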