1169689Skan/* Swing Modulo Scheduling implementation. 2169689Skan Copyright (C) 2004, 2005, 2006 3169689Skan Free Software Foundation, Inc. 4169689Skan Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com> 5169689Skan 6169689SkanThis file is part of GCC. 7169689Skan 8169689SkanGCC is free software; you can redistribute it and/or modify it under 9169689Skanthe terms of the GNU General Public License as published by the Free 10169689SkanSoftware Foundation; either version 2, or (at your option) any later 11169689Skanversion. 12169689Skan 13169689SkanGCC is distributed in the hope that it will be useful, but WITHOUT ANY 14169689SkanWARRANTY; without even the implied warranty of MERCHANTABILITY or 15169689SkanFITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16169689Skanfor more details. 17169689Skan 18169689SkanYou should have received a copy of the GNU General Public License 19169689Skanalong with GCC; see the file COPYING. If not, write to the Free 20169689SkanSoftware Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 21169689Skan02110-1301, USA. */ 22169689Skan 23169689Skan 24169689Skan#include "config.h" 25169689Skan#include "system.h" 26169689Skan#include "coretypes.h" 27169689Skan#include "tm.h" 28169689Skan#include "toplev.h" 29169689Skan#include "rtl.h" 30169689Skan#include "tm_p.h" 31169689Skan#include "hard-reg-set.h" 32169689Skan#include "regs.h" 33169689Skan#include "function.h" 34169689Skan#include "flags.h" 35169689Skan#include "insn-config.h" 36169689Skan#include "insn-attr.h" 37169689Skan#include "except.h" 38169689Skan#include "toplev.h" 39169689Skan#include "recog.h" 40169689Skan#include "sched-int.h" 41169689Skan#include "target.h" 42169689Skan#include "cfglayout.h" 43169689Skan#include "cfgloop.h" 44169689Skan#include "cfghooks.h" 45169689Skan#include "expr.h" 46169689Skan#include "params.h" 47169689Skan#include "gcov-io.h" 48169689Skan#include "df.h" 49169689Skan#include "ddg.h" 50169689Skan#include "timevar.h" 51169689Skan#include "tree-pass.h" 52169689Skan 53169689Skan#ifdef INSN_SCHEDULING 54169689Skan 55169689Skan/* This file contains the implementation of the Swing Modulo Scheduler, 56169689Skan described in the following references: 57169689Skan [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt. 58169689Skan Lifetime--sensitive modulo scheduling in a production environment. 59169689Skan IEEE Trans. on Comps., 50(3), March 2001 60169689Skan [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero. 61169689Skan Swing Modulo Scheduling: A Lifetime Sensitive Approach. 62169689Skan PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA). 63169689Skan 64169689Skan The basic structure is: 65169689Skan 1. Build a data-dependence graph (DDG) for each loop. 66169689Skan 2. Use the DDG to order the insns of a loop (not in topological order 67169689Skan necessarily, but rather) trying to place each insn after all its 68169689Skan predecessors _or_ after all its successors. 69169689Skan 3. Compute MII: a lower bound on the number of cycles to schedule the loop. 70169689Skan 4. Use the ordering to perform list-scheduling of the loop: 71169689Skan 1. Set II = MII. We will try to schedule the loop within II cycles. 72169689Skan 2. Try to schedule the insns one by one according to the ordering. 73169689Skan For each insn compute an interval of cycles by considering already- 74169689Skan scheduled preds and succs (and associated latencies); try to place 75169689Skan the insn in the cycles of this window checking for potential 76169689Skan resource conflicts (using the DFA interface). 77169689Skan Note: this is different from the cycle-scheduling of schedule_insns; 78169689Skan here the insns are not scheduled monotonically top-down (nor bottom- 79169689Skan up). 80169689Skan 3. If failed in scheduling all insns - bump II++ and try again, unless 81169689Skan II reaches an upper bound MaxII, in which case report failure. 82169689Skan 5. If we succeeded in scheduling the loop within II cycles, we now 83169689Skan generate prolog and epilog, decrease the counter of the loop, and 84169689Skan perform modulo variable expansion for live ranges that span more than 85169689Skan II cycles (i.e. use register copies to prevent a def from overwriting 86169689Skan itself before reaching the use). 87169689Skan*/ 88169689Skan 89169689Skan 90169689Skan/* This page defines partial-schedule structures and functions for 91169689Skan modulo scheduling. */ 92169689Skan 93169689Skantypedef struct partial_schedule *partial_schedule_ptr; 94169689Skantypedef struct ps_insn *ps_insn_ptr; 95169689Skan 96169689Skan/* The minimum (absolute) cycle that a node of ps was scheduled in. */ 97169689Skan#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle) 98169689Skan 99169689Skan/* The maximum (absolute) cycle that a node of ps was scheduled in. */ 100169689Skan#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle) 101169689Skan 102169689Skan/* Perform signed modulo, always returning a non-negative value. */ 103169689Skan#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y)) 104169689Skan 105169689Skan/* The number of different iterations the nodes in ps span, assuming 106169689Skan the stage boundaries are placed efficiently. */ 107169689Skan#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \ 108169689Skan + 1 + (ps)->ii - 1) / (ps)->ii) 109169689Skan 110169689Skan/* A single instruction in the partial schedule. */ 111169689Skanstruct ps_insn 112169689Skan{ 113169689Skan /* The corresponding DDG_NODE. */ 114169689Skan ddg_node_ptr node; 115169689Skan 116169689Skan /* The (absolute) cycle in which the PS instruction is scheduled. 117169689Skan Same as SCHED_TIME (node). */ 118169689Skan int cycle; 119169689Skan 120169689Skan /* The next/prev PS_INSN in the same row. */ 121169689Skan ps_insn_ptr next_in_row, 122169689Skan prev_in_row; 123169689Skan 124169689Skan /* The number of nodes in the same row that come after this node. */ 125169689Skan int row_rest_count; 126169689Skan}; 127169689Skan 128169689Skan/* Holds the partial schedule as an array of II rows. Each entry of the 129169689Skan array points to a linked list of PS_INSNs, which represents the 130169689Skan instructions that are scheduled for that row. */ 131169689Skanstruct partial_schedule 132169689Skan{ 133169689Skan int ii; /* Number of rows in the partial schedule. */ 134169689Skan int history; /* Threshold for conflict checking using DFA. */ 135169689Skan 136169689Skan /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */ 137169689Skan ps_insn_ptr *rows; 138169689Skan 139169689Skan /* The earliest absolute cycle of an insn in the partial schedule. */ 140169689Skan int min_cycle; 141169689Skan 142169689Skan /* The latest absolute cycle of an insn in the partial schedule. */ 143169689Skan int max_cycle; 144169689Skan 145169689Skan ddg_ptr g; /* The DDG of the insns in the partial schedule. */ 146169689Skan}; 147169689Skan 148169689Skan/* We use this to record all the register replacements we do in 149169689Skan the kernel so we can undo SMS if it is not profitable. */ 150169689Skanstruct undo_replace_buff_elem 151169689Skan{ 152169689Skan rtx insn; 153169689Skan rtx orig_reg; 154169689Skan rtx new_reg; 155169689Skan struct undo_replace_buff_elem *next; 156169689Skan}; 157169689Skan 158169689Skan 159169689Skan 160169689Skanstatic partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history); 161169689Skanstatic void free_partial_schedule (partial_schedule_ptr); 162169689Skanstatic void reset_partial_schedule (partial_schedule_ptr, int new_ii); 163169689Skanvoid print_partial_schedule (partial_schedule_ptr, FILE *); 164169689Skanstatic int kernel_number_of_cycles (rtx first_insn, rtx last_insn); 165169689Skanstatic ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr, 166169689Skan ddg_node_ptr node, int cycle, 167169689Skan sbitmap must_precede, 168169689Skan sbitmap must_follow); 169169689Skanstatic void rotate_partial_schedule (partial_schedule_ptr, int); 170169689Skanvoid set_row_column_for_ps (partial_schedule_ptr); 171169689Skanstatic bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr ); 172169689Skan 173169689Skan 174169689Skan/* This page defines constants and structures for the modulo scheduling 175169689Skan driver. */ 176169689Skan 177169689Skan/* As in haifa-sched.c: */ 178169689Skan/* issue_rate is the number of insns that can be scheduled in the same 179169689Skan machine cycle. It can be defined in the config/mach/mach.h file, 180169689Skan otherwise we set it to 1. */ 181169689Skan 182169689Skanstatic int issue_rate; 183169689Skan 184169689Skanstatic int sms_order_nodes (ddg_ptr, int, int * result); 185169689Skanstatic void set_node_sched_params (ddg_ptr); 186169689Skanstatic partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *); 187169689Skanstatic void permute_partial_schedule (partial_schedule_ptr ps, rtx last); 188169689Skanstatic void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx); 189169689Skanstatic void duplicate_insns_of_cycles (partial_schedule_ptr ps, 190169689Skan int from_stage, int to_stage, 191169689Skan int is_prolog); 192169689Skan 193169689Skan#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap) 194169689Skan#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time) 195169689Skan#define SCHED_FIRST_REG_MOVE(x) \ 196169689Skan (((node_sched_params_ptr)(x)->aux.info)->first_reg_move) 197169689Skan#define SCHED_NREG_MOVES(x) \ 198169689Skan (((node_sched_params_ptr)(x)->aux.info)->nreg_moves) 199169689Skan#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row) 200169689Skan#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage) 201169689Skan#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column) 202169689Skan 203169689Skan/* The scheduling parameters held for each node. */ 204169689Skantypedef struct node_sched_params 205169689Skan{ 206169689Skan int asap; /* A lower-bound on the absolute scheduling cycle. */ 207169689Skan int time; /* The absolute scheduling cycle (time >= asap). */ 208169689Skan 209169689Skan /* The following field (first_reg_move) is a pointer to the first 210169689Skan register-move instruction added to handle the modulo-variable-expansion 211169689Skan of the register defined by this node. This register-move copies the 212169689Skan original register defined by the node. */ 213169689Skan rtx first_reg_move; 214169689Skan 215169689Skan /* The number of register-move instructions added, immediately preceding 216169689Skan first_reg_move. */ 217169689Skan int nreg_moves; 218169689Skan 219169689Skan int row; /* Holds time % ii. */ 220169689Skan int stage; /* Holds time / ii. */ 221169689Skan 222169689Skan /* The column of a node inside the ps. If nodes u, v are on the same row, 223169689Skan u will precede v if column (u) < column (v). */ 224169689Skan int column; 225169689Skan} *node_sched_params_ptr; 226169689Skan 227169689Skan 228169689Skan/* The following three functions are copied from the current scheduler 229169689Skan code in order to use sched_analyze() for computing the dependencies. 230169689Skan They are used when initializing the sched_info structure. */ 231169689Skanstatic const char * 232169689Skansms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED) 233169689Skan{ 234169689Skan static char tmp[80]; 235169689Skan 236169689Skan sprintf (tmp, "i%4d", INSN_UID (insn)); 237169689Skan return tmp; 238169689Skan} 239169689Skan 240169689Skanstatic void 241169689Skancompute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED, 242169689Skan regset cond_exec ATTRIBUTE_UNUSED, 243169689Skan regset used ATTRIBUTE_UNUSED, 244169689Skan regset set ATTRIBUTE_UNUSED) 245169689Skan{ 246169689Skan} 247169689Skan 248169689Skanstatic struct sched_info sms_sched_info = 249169689Skan{ 250169689Skan NULL, 251169689Skan NULL, 252169689Skan NULL, 253169689Skan NULL, 254169689Skan NULL, 255169689Skan sms_print_insn, 256169689Skan NULL, 257169689Skan compute_jump_reg_dependencies, 258169689Skan NULL, NULL, 259169689Skan NULL, NULL, 260169689Skan 0, 0, 0, 261169689Skan 262169689Skan NULL, NULL, NULL, NULL, NULL, 263169689Skan#ifdef ENABLE_CHECKING 264169689Skan NULL, 265169689Skan#endif 266169689Skan 0 267169689Skan}; 268169689Skan 269169689Skan 270169689Skan/* Return the register decremented and tested in INSN, 271169689Skan or zero if it is not a decrement-and-branch insn. */ 272169689Skan 273169689Skanstatic rtx 274169689Skandoloop_register_get (rtx insn ATTRIBUTE_UNUSED) 275169689Skan{ 276169689Skan#ifdef HAVE_doloop_end 277169689Skan rtx pattern, reg, condition; 278169689Skan 279169689Skan if (! JUMP_P (insn)) 280169689Skan return NULL_RTX; 281169689Skan 282169689Skan pattern = PATTERN (insn); 283169689Skan condition = doloop_condition_get (pattern); 284169689Skan if (! condition) 285169689Skan return NULL_RTX; 286169689Skan 287169689Skan if (REG_P (XEXP (condition, 0))) 288169689Skan reg = XEXP (condition, 0); 289169689Skan else if (GET_CODE (XEXP (condition, 0)) == PLUS 290169689Skan && REG_P (XEXP (XEXP (condition, 0), 0))) 291169689Skan reg = XEXP (XEXP (condition, 0), 0); 292169689Skan else 293169689Skan gcc_unreachable (); 294169689Skan 295169689Skan return reg; 296169689Skan#else 297169689Skan return NULL_RTX; 298169689Skan#endif 299169689Skan} 300169689Skan 301169689Skan/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so 302169689Skan that the number of iterations is a compile-time constant. If so, 303169689Skan return the rtx that sets COUNT_REG to a constant, and set COUNT to 304169689Skan this constant. Otherwise return 0. */ 305169689Skanstatic rtx 306169689Skanconst_iteration_count (rtx count_reg, basic_block pre_header, 307169689Skan HOST_WIDEST_INT * count) 308169689Skan{ 309169689Skan rtx insn; 310169689Skan rtx head, tail; 311169689Skan 312169689Skan if (! pre_header) 313169689Skan return NULL_RTX; 314169689Skan 315169689Skan get_ebb_head_tail (pre_header, pre_header, &head, &tail); 316169689Skan 317169689Skan for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn)) 318169689Skan if (INSN_P (insn) && single_set (insn) && 319169689Skan rtx_equal_p (count_reg, SET_DEST (single_set (insn)))) 320169689Skan { 321169689Skan rtx pat = single_set (insn); 322169689Skan 323169689Skan if (GET_CODE (SET_SRC (pat)) == CONST_INT) 324169689Skan { 325169689Skan *count = INTVAL (SET_SRC (pat)); 326169689Skan return insn; 327169689Skan } 328169689Skan 329169689Skan return NULL_RTX; 330169689Skan } 331169689Skan 332169689Skan return NULL_RTX; 333169689Skan} 334169689Skan 335169689Skan/* A very simple resource-based lower bound on the initiation interval. 336169689Skan ??? Improve the accuracy of this bound by considering the 337169689Skan utilization of various units. */ 338169689Skanstatic int 339169689Skanres_MII (ddg_ptr g) 340169689Skan{ 341169689Skan return (g->num_nodes / issue_rate); 342169689Skan} 343169689Skan 344169689Skan 345169689Skan/* Points to the array that contains the sched data for each node. */ 346169689Skanstatic node_sched_params_ptr node_sched_params; 347169689Skan 348169689Skan/* Allocate sched_params for each node and initialize it. Assumes that 349169689Skan the aux field of each node contain the asap bound (computed earlier), 350169689Skan and copies it into the sched_params field. */ 351169689Skanstatic void 352169689Skanset_node_sched_params (ddg_ptr g) 353169689Skan{ 354169689Skan int i; 355169689Skan 356169689Skan /* Allocate for each node in the DDG a place to hold the "sched_data". */ 357169689Skan /* Initialize ASAP/ALAP/HIGHT to zero. */ 358169689Skan node_sched_params = (node_sched_params_ptr) 359169689Skan xcalloc (g->num_nodes, 360169689Skan sizeof (struct node_sched_params)); 361169689Skan 362169689Skan /* Set the pointer of the general data of the node to point to the 363169689Skan appropriate sched_params structure. */ 364169689Skan for (i = 0; i < g->num_nodes; i++) 365169689Skan { 366169689Skan /* Watch out for aliasing problems? */ 367169689Skan node_sched_params[i].asap = g->nodes[i].aux.count; 368169689Skan g->nodes[i].aux.info = &node_sched_params[i]; 369169689Skan } 370169689Skan} 371169689Skan 372169689Skanstatic void 373169689Skanprint_node_sched_params (FILE * file, int num_nodes) 374169689Skan{ 375169689Skan int i; 376169689Skan 377169689Skan if (! file) 378169689Skan return; 379169689Skan for (i = 0; i < num_nodes; i++) 380169689Skan { 381169689Skan node_sched_params_ptr nsp = &node_sched_params[i]; 382169689Skan rtx reg_move = nsp->first_reg_move; 383169689Skan int j; 384169689Skan 385169689Skan fprintf (file, "Node %d:\n", i); 386169689Skan fprintf (file, " asap = %d:\n", nsp->asap); 387169689Skan fprintf (file, " time = %d:\n", nsp->time); 388169689Skan fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves); 389169689Skan for (j = 0; j < nsp->nreg_moves; j++) 390169689Skan { 391169689Skan fprintf (file, " reg_move = "); 392169689Skan print_rtl_single (file, reg_move); 393169689Skan reg_move = PREV_INSN (reg_move); 394169689Skan } 395169689Skan } 396169689Skan} 397169689Skan 398169689Skan/* Calculate an upper bound for II. SMS should not schedule the loop if it 399169689Skan requires more cycles than this bound. Currently set to the sum of the 400169689Skan longest latency edge for each node. Reset based on experiments. */ 401169689Skanstatic int 402169689Skancalculate_maxii (ddg_ptr g) 403169689Skan{ 404169689Skan int i; 405169689Skan int maxii = 0; 406169689Skan 407169689Skan for (i = 0; i < g->num_nodes; i++) 408169689Skan { 409169689Skan ddg_node_ptr u = &g->nodes[i]; 410169689Skan ddg_edge_ptr e; 411169689Skan int max_edge_latency = 0; 412169689Skan 413169689Skan for (e = u->out; e; e = e->next_out) 414169689Skan max_edge_latency = MAX (max_edge_latency, e->latency); 415169689Skan 416169689Skan maxii += max_edge_latency; 417169689Skan } 418169689Skan return maxii; 419169689Skan} 420169689Skan 421169689Skan/* 422169689Skan Breaking intra-loop register anti-dependences: 423169689Skan Each intra-loop register anti-dependence implies a cross-iteration true 424169689Skan dependence of distance 1. Therefore, we can remove such false dependencies 425169689Skan and figure out if the partial schedule broke them by checking if (for a 426169689Skan true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and 427169689Skan if so generate a register move. The number of such moves is equal to: 428169689Skan SCHED_TIME (use) - SCHED_TIME (def) { 0 broken 429169689Skan nreg_moves = ----------------------------------- + 1 - { dependence. 430169689Skan ii { 1 if not. 431169689Skan*/ 432169689Skanstatic struct undo_replace_buff_elem * 433169689Skangenerate_reg_moves (partial_schedule_ptr ps) 434169689Skan{ 435169689Skan ddg_ptr g = ps->g; 436169689Skan int ii = ps->ii; 437169689Skan int i; 438169689Skan struct undo_replace_buff_elem *reg_move_replaces = NULL; 439169689Skan 440169689Skan for (i = 0; i < g->num_nodes; i++) 441169689Skan { 442169689Skan ddg_node_ptr u = &g->nodes[i]; 443169689Skan ddg_edge_ptr e; 444169689Skan int nreg_moves = 0, i_reg_move; 445169689Skan sbitmap *uses_of_defs; 446169689Skan rtx last_reg_move; 447169689Skan rtx prev_reg, old_reg; 448169689Skan 449169689Skan /* Compute the number of reg_moves needed for u, by looking at life 450169689Skan ranges started at u (excluding self-loops). */ 451169689Skan for (e = u->out; e; e = e->next_out) 452169689Skan if (e->type == TRUE_DEP && e->dest != e->src) 453169689Skan { 454169689Skan int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii; 455169689Skan 456169689Skan if (e->distance == 1) 457169689Skan nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii; 458169689Skan 459169689Skan /* If dest precedes src in the schedule of the kernel, then dest 460169689Skan will read before src writes and we can save one reg_copy. */ 461169689Skan if (SCHED_ROW (e->dest) == SCHED_ROW (e->src) 462169689Skan && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src)) 463169689Skan nreg_moves4e--; 464169689Skan 465169689Skan nreg_moves = MAX (nreg_moves, nreg_moves4e); 466169689Skan } 467169689Skan 468169689Skan if (nreg_moves == 0) 469169689Skan continue; 470169689Skan 471169689Skan /* Every use of the register defined by node may require a different 472169689Skan copy of this register, depending on the time the use is scheduled. 473169689Skan Set a bitmap vector, telling which nodes use each copy of this 474169689Skan register. */ 475169689Skan uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes); 476169689Skan sbitmap_vector_zero (uses_of_defs, nreg_moves); 477169689Skan for (e = u->out; e; e = e->next_out) 478169689Skan if (e->type == TRUE_DEP && e->dest != e->src) 479169689Skan { 480169689Skan int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii; 481169689Skan 482169689Skan if (e->distance == 1) 483169689Skan dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii; 484169689Skan 485169689Skan if (SCHED_ROW (e->dest) == SCHED_ROW (e->src) 486169689Skan && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src)) 487169689Skan dest_copy--; 488169689Skan 489169689Skan if (dest_copy) 490169689Skan SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid); 491169689Skan } 492169689Skan 493169689Skan /* Now generate the reg_moves, attaching relevant uses to them. */ 494169689Skan SCHED_NREG_MOVES (u) = nreg_moves; 495169689Skan old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn))); 496169689Skan last_reg_move = u->insn; 497169689Skan 498169689Skan for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++) 499169689Skan { 500169689Skan unsigned int i_use = 0; 501169689Skan rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg)); 502169689Skan rtx reg_move = gen_move_insn (new_reg, prev_reg); 503169689Skan sbitmap_iterator sbi; 504169689Skan 505169689Skan add_insn_before (reg_move, last_reg_move); 506169689Skan last_reg_move = reg_move; 507169689Skan 508169689Skan if (!SCHED_FIRST_REG_MOVE (u)) 509169689Skan SCHED_FIRST_REG_MOVE (u) = reg_move; 510169689Skan 511169689Skan EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi) 512169689Skan { 513169689Skan struct undo_replace_buff_elem *rep; 514169689Skan 515169689Skan rep = (struct undo_replace_buff_elem *) 516169689Skan xcalloc (1, sizeof (struct undo_replace_buff_elem)); 517169689Skan rep->insn = g->nodes[i_use].insn; 518169689Skan rep->orig_reg = old_reg; 519169689Skan rep->new_reg = new_reg; 520169689Skan 521169689Skan if (! reg_move_replaces) 522169689Skan reg_move_replaces = rep; 523169689Skan else 524169689Skan { 525169689Skan rep->next = reg_move_replaces; 526169689Skan reg_move_replaces = rep; 527169689Skan } 528169689Skan 529169689Skan replace_rtx (g->nodes[i_use].insn, old_reg, new_reg); 530169689Skan } 531169689Skan 532169689Skan prev_reg = new_reg; 533169689Skan } 534169689Skan sbitmap_vector_free (uses_of_defs); 535169689Skan } 536169689Skan return reg_move_replaces; 537169689Skan} 538169689Skan 539169689Skan/* We call this when we want to undo the SMS schedule for a given loop. 540169689Skan One of the things that we do is to delete the register moves generated 541169689Skan for the sake of SMS; this function deletes the register move instructions 542169689Skan recorded in the undo buffer. */ 543169689Skanstatic void 544169689Skanundo_generate_reg_moves (partial_schedule_ptr ps, 545169689Skan struct undo_replace_buff_elem *reg_move_replaces) 546169689Skan{ 547169689Skan int i,j; 548169689Skan 549169689Skan for (i = 0; i < ps->g->num_nodes; i++) 550169689Skan { 551169689Skan ddg_node_ptr u = &ps->g->nodes[i]; 552169689Skan rtx prev; 553169689Skan rtx crr = SCHED_FIRST_REG_MOVE (u); 554169689Skan 555169689Skan for (j = 0; j < SCHED_NREG_MOVES (u); j++) 556169689Skan { 557169689Skan prev = PREV_INSN (crr); 558169689Skan delete_insn (crr); 559169689Skan crr = prev; 560169689Skan } 561169689Skan SCHED_FIRST_REG_MOVE (u) = NULL_RTX; 562169689Skan } 563169689Skan 564169689Skan while (reg_move_replaces) 565169689Skan { 566169689Skan struct undo_replace_buff_elem *rep = reg_move_replaces; 567169689Skan 568169689Skan reg_move_replaces = reg_move_replaces->next; 569169689Skan replace_rtx (rep->insn, rep->new_reg, rep->orig_reg); 570169689Skan } 571169689Skan} 572169689Skan 573169689Skan/* Free memory allocated for the undo buffer. */ 574169689Skanstatic void 575169689Skanfree_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces) 576169689Skan{ 577169689Skan 578169689Skan while (reg_move_replaces) 579169689Skan { 580169689Skan struct undo_replace_buff_elem *rep = reg_move_replaces; 581169689Skan 582169689Skan reg_move_replaces = reg_move_replaces->next; 583169689Skan free (rep); 584169689Skan } 585169689Skan} 586169689Skan 587169689Skan/* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values 588169689Skan of SCHED_ROW and SCHED_STAGE. */ 589169689Skanstatic void 590169689Skannormalize_sched_times (partial_schedule_ptr ps) 591169689Skan{ 592169689Skan int i; 593169689Skan ddg_ptr g = ps->g; 594169689Skan int amount = PS_MIN_CYCLE (ps); 595169689Skan int ii = ps->ii; 596169689Skan 597169689Skan /* Don't include the closing branch assuming that it is the last node. */ 598169689Skan for (i = 0; i < g->num_nodes - 1; i++) 599169689Skan { 600169689Skan ddg_node_ptr u = &g->nodes[i]; 601169689Skan int normalized_time = SCHED_TIME (u) - amount; 602169689Skan 603169689Skan gcc_assert (normalized_time >= 0); 604169689Skan 605169689Skan SCHED_TIME (u) = normalized_time; 606169689Skan SCHED_ROW (u) = normalized_time % ii; 607169689Skan SCHED_STAGE (u) = normalized_time / ii; 608169689Skan } 609169689Skan} 610169689Skan 611169689Skan/* Set SCHED_COLUMN of each node according to its position in PS. */ 612169689Skanstatic void 613169689Skanset_columns_for_ps (partial_schedule_ptr ps) 614169689Skan{ 615169689Skan int row; 616169689Skan 617169689Skan for (row = 0; row < ps->ii; row++) 618169689Skan { 619169689Skan ps_insn_ptr cur_insn = ps->rows[row]; 620169689Skan int column = 0; 621169689Skan 622169689Skan for (; cur_insn; cur_insn = cur_insn->next_in_row) 623169689Skan SCHED_COLUMN (cur_insn->node) = column++; 624169689Skan } 625169689Skan} 626169689Skan 627169689Skan/* Permute the insns according to their order in PS, from row 0 to 628169689Skan row ii-1, and position them right before LAST. This schedules 629169689Skan the insns of the loop kernel. */ 630169689Skanstatic void 631169689Skanpermute_partial_schedule (partial_schedule_ptr ps, rtx last) 632169689Skan{ 633169689Skan int ii = ps->ii; 634169689Skan int row; 635169689Skan ps_insn_ptr ps_ij; 636169689Skan 637169689Skan for (row = 0; row < ii ; row++) 638169689Skan for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row) 639169689Skan if (PREV_INSN (last) != ps_ij->node->insn) 640169689Skan reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn, 641169689Skan PREV_INSN (last)); 642169689Skan} 643169689Skan 644169689Skan/* As part of undoing SMS we return to the original ordering of the 645169689Skan instructions inside the loop kernel. Given the partial schedule PS, this 646169689Skan function returns the ordering of the instruction according to their CUID 647169689Skan in the DDG (PS->G), which is the original order of the instruction before 648169689Skan performing SMS. */ 649169689Skanstatic void 650169689Skanundo_permute_partial_schedule (partial_schedule_ptr ps, rtx last) 651169689Skan{ 652169689Skan int i; 653169689Skan 654169689Skan for (i = 0 ; i < ps->g->num_nodes; i++) 655169689Skan if (last == ps->g->nodes[i].insn 656169689Skan || last == ps->g->nodes[i].first_note) 657169689Skan break; 658169689Skan else if (PREV_INSN (last) != ps->g->nodes[i].insn) 659169689Skan reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn, 660169689Skan PREV_INSN (last)); 661169689Skan} 662169689Skan 663169689Skan/* Used to generate the prologue & epilogue. Duplicate the subset of 664169689Skan nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive 665169689Skan of both), together with a prefix/suffix of their reg_moves. */ 666169689Skanstatic void 667169689Skanduplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, 668169689Skan int to_stage, int for_prolog) 669169689Skan{ 670169689Skan int row; 671169689Skan ps_insn_ptr ps_ij; 672169689Skan 673169689Skan for (row = 0; row < ps->ii; row++) 674169689Skan for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row) 675169689Skan { 676169689Skan ddg_node_ptr u_node = ps_ij->node; 677169689Skan int j, i_reg_moves; 678169689Skan rtx reg_move = NULL_RTX; 679169689Skan 680169689Skan if (for_prolog) 681169689Skan { 682169689Skan /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing 683169689Skan number of reg_moves starting with the second occurrence of 684169689Skan u_node, which is generated if its SCHED_STAGE <= to_stage. */ 685169689Skan i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1; 686169689Skan i_reg_moves = MAX (i_reg_moves, 0); 687169689Skan i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node)); 688169689Skan 689169689Skan /* The reg_moves start from the *first* reg_move backwards. */ 690169689Skan if (i_reg_moves) 691169689Skan { 692169689Skan reg_move = SCHED_FIRST_REG_MOVE (u_node); 693169689Skan for (j = 1; j < i_reg_moves; j++) 694169689Skan reg_move = PREV_INSN (reg_move); 695169689Skan } 696169689Skan } 697169689Skan else /* It's for the epilog. */ 698169689Skan { 699169689Skan /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves, 700169689Skan starting to decrease one stage after u_node no longer occurs; 701169689Skan that is, generate all reg_moves until 702169689Skan SCHED_STAGE (u_node) == from_stage - 1. */ 703169689Skan i_reg_moves = SCHED_NREG_MOVES (u_node) 704169689Skan - (from_stage - SCHED_STAGE (u_node) - 1); 705169689Skan i_reg_moves = MAX (i_reg_moves, 0); 706169689Skan i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node)); 707169689Skan 708169689Skan /* The reg_moves start from the *last* reg_move forwards. */ 709169689Skan if (i_reg_moves) 710169689Skan { 711169689Skan reg_move = SCHED_FIRST_REG_MOVE (u_node); 712169689Skan for (j = 1; j < SCHED_NREG_MOVES (u_node); j++) 713169689Skan reg_move = PREV_INSN (reg_move); 714169689Skan } 715169689Skan } 716169689Skan 717169689Skan for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move)) 718169689Skan emit_insn (copy_rtx (PATTERN (reg_move))); 719169689Skan if (SCHED_STAGE (u_node) >= from_stage 720169689Skan && SCHED_STAGE (u_node) <= to_stage) 721169689Skan duplicate_insn_chain (u_node->first_note, u_node->insn); 722169689Skan } 723169689Skan} 724169689Skan 725169689Skan 726169689Skan/* Generate the instructions (including reg_moves) for prolog & epilog. */ 727169689Skanstatic void 728169689Skangenerate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg) 729169689Skan{ 730169689Skan int i; 731169689Skan int last_stage = PS_STAGE_COUNT (ps) - 1; 732169689Skan edge e; 733169689Skan 734169689Skan /* Generate the prolog, inserting its insns on the loop-entry edge. */ 735169689Skan start_sequence (); 736169689Skan 737169689Skan if (count_reg) 738169689Skan /* Generate a subtract instruction at the beginning of the prolog to 739169689Skan adjust the loop count by STAGE_COUNT. */ 740169689Skan emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage))); 741169689Skan 742169689Skan for (i = 0; i < last_stage; i++) 743169689Skan duplicate_insns_of_cycles (ps, 0, i, 1); 744169689Skan 745169689Skan /* Put the prolog , on the one and only entry edge. */ 746169689Skan e = loop_preheader_edge (loop); 747169689Skan loop_split_edge_with(e , get_insns()); 748169689Skan 749169689Skan end_sequence (); 750169689Skan 751169689Skan /* Generate the epilog, inserting its insns on the loop-exit edge. */ 752169689Skan start_sequence (); 753169689Skan 754169689Skan for (i = 0; i < last_stage; i++) 755169689Skan duplicate_insns_of_cycles (ps, i + 1, last_stage, 0); 756169689Skan 757169689Skan /* Put the epilogue on the one and only one exit edge. */ 758169689Skan gcc_assert (loop->single_exit); 759169689Skan e = loop->single_exit; 760169689Skan loop_split_edge_with(e , get_insns()); 761169689Skan end_sequence (); 762169689Skan} 763169689Skan 764169689Skan/* Return the line note insn preceding INSN, for debugging. Taken from 765169689Skan emit-rtl.c. */ 766169689Skanstatic rtx 767169689Skanfind_line_note (rtx insn) 768169689Skan{ 769169689Skan for (; insn; insn = PREV_INSN (insn)) 770169689Skan if (NOTE_P (insn) 771169689Skan && NOTE_LINE_NUMBER (insn) >= 0) 772169689Skan break; 773169689Skan 774169689Skan return insn; 775169689Skan} 776169689Skan 777169689Skan/* Return true if all the BBs of the loop are empty except the 778169689Skan loop header. */ 779169689Skanstatic bool 780169689Skanloop_single_full_bb_p (struct loop *loop) 781169689Skan{ 782169689Skan unsigned i; 783169689Skan basic_block *bbs = get_loop_body (loop); 784169689Skan 785169689Skan for (i = 0; i < loop->num_nodes ; i++) 786169689Skan { 787169689Skan rtx head, tail; 788169689Skan bool empty_bb = true; 789169689Skan 790169689Skan if (bbs[i] == loop->header) 791169689Skan continue; 792169689Skan 793169689Skan /* Make sure that basic blocks other than the header 794169689Skan have only notes labels or jumps. */ 795169689Skan get_ebb_head_tail (bbs[i], bbs[i], &head, &tail); 796169689Skan for (; head != NEXT_INSN (tail); head = NEXT_INSN (head)) 797169689Skan { 798169689Skan if (NOTE_P (head) || LABEL_P (head) 799169689Skan || (INSN_P (head) && JUMP_P (head))) 800169689Skan continue; 801169689Skan empty_bb = false; 802169689Skan break; 803169689Skan } 804169689Skan 805169689Skan if (! empty_bb) 806169689Skan { 807169689Skan free (bbs); 808169689Skan return false; 809169689Skan } 810169689Skan } 811169689Skan free (bbs); 812169689Skan return true; 813169689Skan} 814169689Skan 815169689Skan/* A simple loop from SMS point of view; it is a loop that is composed of 816169689Skan either a single basic block or two BBs - a header and a latch. */ 817169689Skan#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \ 818169689Skan && (EDGE_COUNT (loop->latch->preds) == 1) \ 819169689Skan && (EDGE_COUNT (loop->latch->succs) == 1)) 820169689Skan 821169689Skan/* Return true if the loop is in its canonical form and false if not. 822169689Skan i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */ 823169689Skanstatic bool 824169689Skanloop_canon_p (struct loop *loop) 825169689Skan{ 826169689Skan 827169689Skan if (loop->inner || ! loop->outer) 828169689Skan return false; 829169689Skan 830169689Skan if (!loop->single_exit) 831169689Skan { 832169689Skan if (dump_file) 833169689Skan { 834169689Skan rtx line_note = find_line_note (BB_END (loop->header)); 835169689Skan 836169689Skan fprintf (dump_file, "SMS loop many exits "); 837169689Skan if (line_note) 838169689Skan { 839169689Skan expanded_location xloc; 840169689Skan NOTE_EXPANDED_LOCATION (xloc, line_note); 841169689Skan fprintf (dump_file, " %s %d (file, line)\n", 842169689Skan xloc.file, xloc.line); 843169689Skan } 844169689Skan } 845169689Skan return false; 846169689Skan } 847169689Skan 848169689Skan if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop)) 849169689Skan { 850169689Skan if (dump_file) 851169689Skan { 852169689Skan rtx line_note = find_line_note (BB_END (loop->header)); 853169689Skan 854169689Skan fprintf (dump_file, "SMS loop many BBs. "); 855169689Skan if (line_note) 856169689Skan { 857169689Skan expanded_location xloc; 858169689Skan NOTE_EXPANDED_LOCATION (xloc, line_note); 859169689Skan fprintf (dump_file, " %s %d (file, line)\n", 860169689Skan xloc.file, xloc.line); 861169689Skan } 862169689Skan } 863169689Skan return false; 864169689Skan } 865169689Skan 866169689Skan return true; 867169689Skan} 868169689Skan 869169689Skan/* If there are more than one entry for the loop, 870169689Skan make it one by splitting the first entry edge and 871169689Skan redirecting the others to the new BB. */ 872169689Skanstatic void 873169689Skancanon_loop (struct loop *loop) 874169689Skan{ 875169689Skan edge e; 876169689Skan edge_iterator i; 877169689Skan 878169689Skan /* Avoid annoying special cases of edges going to exit 879169689Skan block. */ 880169689Skan FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds) 881169689Skan if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1)) 882169689Skan loop_split_edge_with (e, NULL_RTX); 883169689Skan 884169689Skan if (loop->latch == loop->header 885169689Skan || EDGE_COUNT (loop->latch->succs) > 1) 886169689Skan { 887169689Skan FOR_EACH_EDGE (e, i, loop->header->preds) 888169689Skan if (e->src == loop->latch) 889169689Skan break; 890169689Skan loop_split_edge_with (e, NULL_RTX); 891169689Skan } 892169689Skan} 893169689Skan 894169689Skan/* Main entry point, perform SMS scheduling on the loops of the function 895169689Skan that consist of single basic blocks. */ 896169689Skanstatic void 897169689Skansms_schedule (void) 898169689Skan{ 899169689Skan static int passes = 0; 900169689Skan rtx insn; 901169689Skan ddg_ptr *g_arr, g; 902169689Skan int * node_order; 903169689Skan int maxii; 904169689Skan unsigned i,num_loops; 905169689Skan partial_schedule_ptr ps; 906169689Skan struct df *df; 907169689Skan struct loops *loops; 908169689Skan basic_block bb = NULL; 909169689Skan /* vars to the versioning only if needed*/ 910169689Skan struct loop * nloop; 911169689Skan basic_block condition_bb = NULL; 912169689Skan edge latch_edge; 913169689Skan gcov_type trip_count = 0; 914169689Skan 915169689Skan loops = loop_optimizer_init (LOOPS_HAVE_PREHEADERS 916169689Skan | LOOPS_HAVE_MARKED_SINGLE_EXITS); 917169689Skan if (!loops) 918169689Skan return; /* There is no loops to schedule. */ 919169689Skan 920169689Skan /* Initialize issue_rate. */ 921169689Skan if (targetm.sched.issue_rate) 922169689Skan { 923169689Skan int temp = reload_completed; 924169689Skan 925169689Skan reload_completed = 1; 926169689Skan issue_rate = targetm.sched.issue_rate (); 927169689Skan reload_completed = temp; 928169689Skan } 929169689Skan else 930169689Skan issue_rate = 1; 931169689Skan 932169689Skan /* Initialize the scheduler. */ 933169689Skan current_sched_info = &sms_sched_info; 934169689Skan sched_init (); 935169689Skan 936169689Skan /* Init Data Flow analysis, to be used in interloop dep calculation. */ 937169689Skan df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS); 938169689Skan df_rd_add_problem (df, 0); 939169689Skan df_ru_add_problem (df, 0); 940169689Skan df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN); 941169689Skan df_analyze (df); 942169689Skan 943169689Skan if (dump_file) 944169689Skan df_dump (df, dump_file); 945169689Skan 946169689Skan /* Allocate memory to hold the DDG array one entry for each loop. 947169689Skan We use loop->num as index into this array. */ 948169689Skan g_arr = XCNEWVEC (ddg_ptr, loops->num); 949169689Skan 950169689Skan 951169689Skan /* Build DDGs for all the relevant loops and hold them in G_ARR 952169689Skan indexed by the loop index. */ 953169689Skan for (i = 0; i < loops->num; i++) 954169689Skan { 955169689Skan rtx head, tail; 956169689Skan rtx count_reg; 957169689Skan struct loop *loop = loops->parray[i]; 958169689Skan 959169689Skan /* For debugging. */ 960169689Skan if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1)) 961169689Skan { 962169689Skan if (dump_file) 963169689Skan fprintf (dump_file, "SMS reached MAX_PASSES... \n"); 964169689Skan 965169689Skan break; 966169689Skan } 967169689Skan 968169689Skan if (! loop_canon_p (loop)) 969169689Skan continue; 970169689Skan 971169689Skan if (! loop_single_full_bb_p (loop)) 972169689Skan continue; 973169689Skan 974169689Skan bb = loop->header; 975169689Skan 976169689Skan get_ebb_head_tail (bb, bb, &head, &tail); 977169689Skan latch_edge = loop_latch_edge (loop); 978169689Skan gcc_assert (loop->single_exit); 979169689Skan if (loop->single_exit->count) 980169689Skan trip_count = latch_edge->count / loop->single_exit->count; 981169689Skan 982169689Skan /* Perfrom SMS only on loops that their average count is above threshold. */ 983169689Skan 984169689Skan if ( latch_edge->count 985169689Skan && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD)) 986169689Skan { 987169689Skan if (dump_file) 988169689Skan { 989169689Skan rtx line_note = find_line_note (tail); 990169689Skan 991169689Skan if (line_note) 992169689Skan { 993169689Skan expanded_location xloc; 994169689Skan NOTE_EXPANDED_LOCATION (xloc, line_note); 995169689Skan fprintf (dump_file, "SMS bb %s %d (file, line)\n", 996169689Skan xloc.file, xloc.line); 997169689Skan } 998169689Skan fprintf (dump_file, "SMS single-bb-loop\n"); 999169689Skan if (profile_info && flag_branch_probabilities) 1000169689Skan { 1001169689Skan fprintf (dump_file, "SMS loop-count "); 1002169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1003169689Skan (HOST_WIDEST_INT) bb->count); 1004169689Skan fprintf (dump_file, "\n"); 1005169689Skan fprintf (dump_file, "SMS trip-count "); 1006169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1007169689Skan (HOST_WIDEST_INT) trip_count); 1008169689Skan fprintf (dump_file, "\n"); 1009169689Skan fprintf (dump_file, "SMS profile-sum-max "); 1010169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1011169689Skan (HOST_WIDEST_INT) profile_info->sum_max); 1012169689Skan fprintf (dump_file, "\n"); 1013169689Skan } 1014169689Skan } 1015169689Skan continue; 1016169689Skan } 1017169689Skan 1018169689Skan /* Make sure this is a doloop. */ 1019169689Skan if ( !(count_reg = doloop_register_get (tail))) 1020169689Skan continue; 1021169689Skan 1022169689Skan /* Don't handle BBs with calls or barriers, or !single_set insns. */ 1023169689Skan for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn)) 1024169689Skan if (CALL_P (insn) 1025169689Skan || BARRIER_P (insn) 1026169689Skan || (INSN_P (insn) && !JUMP_P (insn) 1027169689Skan && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)) 1028169689Skan break; 1029169689Skan 1030169689Skan if (insn != NEXT_INSN (tail)) 1031169689Skan { 1032169689Skan if (dump_file) 1033169689Skan { 1034169689Skan if (CALL_P (insn)) 1035169689Skan fprintf (dump_file, "SMS loop-with-call\n"); 1036169689Skan else if (BARRIER_P (insn)) 1037169689Skan fprintf (dump_file, "SMS loop-with-barrier\n"); 1038169689Skan else 1039169689Skan fprintf (dump_file, "SMS loop-with-not-single-set\n"); 1040169689Skan print_rtl_single (dump_file, insn); 1041169689Skan } 1042169689Skan 1043169689Skan continue; 1044169689Skan } 1045169689Skan 1046169689Skan if (! (g = create_ddg (bb, df, 0))) 1047169689Skan { 1048169689Skan if (dump_file) 1049169689Skan fprintf (dump_file, "SMS doloop\n"); 1050169689Skan continue; 1051169689Skan } 1052169689Skan 1053169689Skan g_arr[i] = g; 1054169689Skan } 1055169689Skan 1056169689Skan /* Release Data Flow analysis data structures. */ 1057169689Skan df_finish (df); 1058169689Skan df = NULL; 1059169689Skan 1060169689Skan /* We don't want to perform SMS on new loops - created by versioning. */ 1061169689Skan num_loops = loops->num; 1062169689Skan /* Go over the built DDGs and perfrom SMS for each one of them. */ 1063169689Skan for (i = 0; i < num_loops; i++) 1064169689Skan { 1065169689Skan rtx head, tail; 1066169689Skan rtx count_reg, count_init; 1067169689Skan int mii, rec_mii; 1068169689Skan unsigned stage_count = 0; 1069169689Skan HOST_WIDEST_INT loop_count = 0; 1070169689Skan struct loop *loop = loops->parray[i]; 1071169689Skan 1072169689Skan if (! (g = g_arr[i])) 1073169689Skan continue; 1074169689Skan 1075169689Skan if (dump_file) 1076169689Skan print_ddg (dump_file, g); 1077169689Skan 1078169689Skan get_ebb_head_tail (loop->header, loop->header, &head, &tail); 1079169689Skan 1080169689Skan latch_edge = loop_latch_edge (loop); 1081169689Skan gcc_assert (loop->single_exit); 1082169689Skan if (loop->single_exit->count) 1083169689Skan trip_count = latch_edge->count / loop->single_exit->count; 1084169689Skan 1085169689Skan if (dump_file) 1086169689Skan { 1087169689Skan rtx line_note = find_line_note (tail); 1088169689Skan 1089169689Skan if (line_note) 1090169689Skan { 1091169689Skan expanded_location xloc; 1092169689Skan NOTE_EXPANDED_LOCATION (xloc, line_note); 1093169689Skan fprintf (dump_file, "SMS bb %s %d (file, line)\n", 1094169689Skan xloc.file, xloc.line); 1095169689Skan } 1096169689Skan fprintf (dump_file, "SMS single-bb-loop\n"); 1097169689Skan if (profile_info && flag_branch_probabilities) 1098169689Skan { 1099169689Skan fprintf (dump_file, "SMS loop-count "); 1100169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1101169689Skan (HOST_WIDEST_INT) bb->count); 1102169689Skan fprintf (dump_file, "\n"); 1103169689Skan fprintf (dump_file, "SMS profile-sum-max "); 1104169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1105169689Skan (HOST_WIDEST_INT) profile_info->sum_max); 1106169689Skan fprintf (dump_file, "\n"); 1107169689Skan } 1108169689Skan fprintf (dump_file, "SMS doloop\n"); 1109169689Skan fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes); 1110169689Skan fprintf (dump_file, "SMS num-loads %d\n", g->num_loads); 1111169689Skan fprintf (dump_file, "SMS num-stores %d\n", g->num_stores); 1112169689Skan } 1113169689Skan 1114169689Skan 1115169689Skan /* In case of th loop have doloop register it gets special 1116169689Skan handling. */ 1117169689Skan count_init = NULL_RTX; 1118169689Skan if ((count_reg = doloop_register_get (tail))) 1119169689Skan { 1120169689Skan basic_block pre_header; 1121169689Skan 1122169689Skan pre_header = loop_preheader_edge (loop)->src; 1123169689Skan count_init = const_iteration_count (count_reg, pre_header, 1124169689Skan &loop_count); 1125169689Skan } 1126169689Skan gcc_assert (count_reg); 1127169689Skan 1128169689Skan if (dump_file && count_init) 1129169689Skan { 1130169689Skan fprintf (dump_file, "SMS const-doloop "); 1131169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, 1132169689Skan loop_count); 1133169689Skan fprintf (dump_file, "\n"); 1134169689Skan } 1135169689Skan 1136169689Skan node_order = XNEWVEC (int, g->num_nodes); 1137169689Skan 1138169689Skan mii = 1; /* Need to pass some estimate of mii. */ 1139169689Skan rec_mii = sms_order_nodes (g, mii, node_order); 1140169689Skan mii = MAX (res_MII (g), rec_mii); 1141169689Skan maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100; 1142169689Skan 1143169689Skan if (dump_file) 1144169689Skan fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n", 1145169689Skan rec_mii, mii, maxii); 1146169689Skan 1147169689Skan /* After sms_order_nodes and before sms_schedule_by_order, to copy over 1148169689Skan ASAP. */ 1149169689Skan set_node_sched_params (g); 1150169689Skan 1151169689Skan ps = sms_schedule_by_order (g, mii, maxii, node_order); 1152169689Skan 1153169689Skan if (ps) 1154169689Skan stage_count = PS_STAGE_COUNT (ps); 1155169689Skan 1156169689Skan /* Stage count of 1 means that there is no interleaving between 1157169689Skan iterations, let the scheduling passes do the job. */ 1158169689Skan if (stage_count < 1 1159169689Skan || (count_init && (loop_count <= stage_count)) 1160169689Skan || (flag_branch_probabilities && (trip_count <= stage_count))) 1161169689Skan { 1162169689Skan if (dump_file) 1163169689Skan { 1164169689Skan fprintf (dump_file, "SMS failed... \n"); 1165169689Skan fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count); 1166169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count); 1167169689Skan fprintf (dump_file, ", trip-count="); 1168169689Skan fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count); 1169169689Skan fprintf (dump_file, ")\n"); 1170169689Skan } 1171169689Skan continue; 1172169689Skan } 1173169689Skan else 1174169689Skan { 1175169689Skan int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb)); 1176169689Skan int new_cycles; 1177169689Skan struct undo_replace_buff_elem *reg_move_replaces; 1178169689Skan 1179169689Skan if (dump_file) 1180169689Skan { 1181169689Skan fprintf (dump_file, 1182169689Skan "SMS succeeded %d %d (with ii, sc)\n", ps->ii, 1183169689Skan stage_count); 1184169689Skan print_partial_schedule (ps, dump_file); 1185169689Skan fprintf (dump_file, 1186169689Skan "SMS Branch (%d) will later be scheduled at cycle %d.\n", 1187169689Skan g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1); 1188169689Skan } 1189169689Skan 1190169689Skan /* Set the stage boundaries. If the DDG is built with closing_branch_deps, 1191169689Skan the closing_branch was scheduled and should appear in the last (ii-1) 1192169689Skan row. Otherwise, we are free to schedule the branch, and we let nodes 1193169689Skan that were scheduled at the first PS_MIN_CYCLE cycle appear in the first 1194169689Skan row; this should reduce stage_count to minimum. */ 1195169689Skan normalize_sched_times (ps); 1196169689Skan rotate_partial_schedule (ps, PS_MIN_CYCLE (ps)); 1197169689Skan set_columns_for_ps (ps); 1198169689Skan 1199169689Skan /* Generate the kernel just to be able to measure its cycles. */ 1200169689Skan permute_partial_schedule (ps, g->closing_branch->first_note); 1201169689Skan reg_move_replaces = generate_reg_moves (ps); 1202169689Skan 1203169689Skan /* Get the number of cycles the new kernel expect to execute in. */ 1204169689Skan new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb)); 1205169689Skan 1206169689Skan /* Get back to the original loop so we can do loop versioning. */ 1207169689Skan undo_permute_partial_schedule (ps, g->closing_branch->first_note); 1208169689Skan if (reg_move_replaces) 1209169689Skan undo_generate_reg_moves (ps, reg_move_replaces); 1210169689Skan 1211169689Skan if ( new_cycles >= orig_cycles) 1212169689Skan { 1213169689Skan /* SMS is not profitable so undo the permutation and reg move generation 1214169689Skan and return the kernel to its original state. */ 1215169689Skan if (dump_file) 1216169689Skan fprintf (dump_file, "Undoing SMS because it is not profitable.\n"); 1217169689Skan 1218169689Skan } 1219169689Skan else 1220169689Skan { 1221169689Skan canon_loop (loop); 1222169689Skan 1223169689Skan /* case the BCT count is not known , Do loop-versioning */ 1224169689Skan if (count_reg && ! count_init) 1225169689Skan { 1226169689Skan rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg, 1227169689Skan GEN_INT(stage_count)); 1228169689Skan 1229169689Skan nloop = loop_version (loops, loop, comp_rtx, &condition_bb, 1230169689Skan true); 1231169689Skan } 1232169689Skan 1233169689Skan /* Set new iteration count of loop kernel. */ 1234169689Skan if (count_reg && count_init) 1235169689Skan SET_SRC (single_set (count_init)) = GEN_INT (loop_count 1236169689Skan - stage_count + 1); 1237169689Skan 1238169689Skan /* Now apply the scheduled kernel to the RTL of the loop. */ 1239169689Skan permute_partial_schedule (ps, g->closing_branch->first_note); 1240169689Skan 1241169689Skan /* Mark this loop as software pipelined so the later 1242169689Skan scheduling passes doesn't touch it. */ 1243169689Skan if (! flag_resched_modulo_sched) 1244169689Skan g->bb->flags |= BB_DISABLE_SCHEDULE; 1245169689Skan /* The life-info is not valid any more. */ 1246169689Skan g->bb->flags |= BB_DIRTY; 1247169689Skan 1248169689Skan reg_move_replaces = generate_reg_moves (ps); 1249169689Skan if (dump_file) 1250169689Skan print_node_sched_params (dump_file, g->num_nodes); 1251169689Skan /* Generate prolog and epilog. */ 1252169689Skan if (count_reg && !count_init) 1253169689Skan generate_prolog_epilog (ps, loop, count_reg); 1254169689Skan else 1255169689Skan generate_prolog_epilog (ps, loop, NULL_RTX); 1256169689Skan } 1257169689Skan free_undo_replace_buff (reg_move_replaces); 1258169689Skan } 1259169689Skan 1260169689Skan free_partial_schedule (ps); 1261169689Skan free (node_sched_params); 1262169689Skan free (node_order); 1263169689Skan free_ddg (g); 1264169689Skan } 1265169689Skan 1266169689Skan free (g_arr); 1267169689Skan 1268169689Skan /* Release scheduler data, needed until now because of DFA. */ 1269169689Skan sched_finish (); 1270169689Skan loop_optimizer_finalize (loops); 1271169689Skan} 1272169689Skan 1273169689Skan/* The SMS scheduling algorithm itself 1274169689Skan ----------------------------------- 1275169689Skan Input: 'O' an ordered list of insns of a loop. 1276169689Skan Output: A scheduling of the loop - kernel, prolog, and epilogue. 1277169689Skan 1278169689Skan 'Q' is the empty Set 1279169689Skan 'PS' is the partial schedule; it holds the currently scheduled nodes with 1280169689Skan their cycle/slot. 1281169689Skan 'PSP' previously scheduled predecessors. 1282169689Skan 'PSS' previously scheduled successors. 1283169689Skan 't(u)' the cycle where u is scheduled. 1284169689Skan 'l(u)' is the latency of u. 1285169689Skan 'd(v,u)' is the dependence distance from v to u. 1286169689Skan 'ASAP(u)' the earliest time at which u could be scheduled as computed in 1287169689Skan the node ordering phase. 1288169689Skan 'check_hardware_resources_conflicts(u, PS, c)' 1289169689Skan run a trace around cycle/slot through DFA model 1290169689Skan to check resource conflicts involving instruction u 1291169689Skan at cycle c given the partial schedule PS. 1292169689Skan 'add_to_partial_schedule_at_time(u, PS, c)' 1293169689Skan Add the node/instruction u to the partial schedule 1294169689Skan PS at time c. 1295169689Skan 'calculate_register_pressure(PS)' 1296169689Skan Given a schedule of instructions, calculate the register 1297169689Skan pressure it implies. One implementation could be the 1298169689Skan maximum number of overlapping live ranges. 1299169689Skan 'maxRP' The maximum allowed register pressure, it is usually derived from the number 1300169689Skan registers available in the hardware. 1301169689Skan 1302169689Skan 1. II = MII. 1303169689Skan 2. PS = empty list 1304169689Skan 3. for each node u in O in pre-computed order 1305169689Skan 4. if (PSP(u) != Q && PSS(u) == Q) then 1306169689Skan 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u). 1307169689Skan 6. start = Early_start; end = Early_start + II - 1; step = 1 1308169689Skan 11. else if (PSP(u) == Q && PSS(u) != Q) then 1309169689Skan 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u). 1310169689Skan 13. start = Late_start; end = Late_start - II + 1; step = -1 1311169689Skan 14. else if (PSP(u) != Q && PSS(u) != Q) then 1312169689Skan 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u). 1313169689Skan 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u). 1314169689Skan 17. start = Early_start; 1315169689Skan 18. end = min(Early_start + II - 1 , Late_start); 1316169689Skan 19. step = 1 1317169689Skan 20. else "if (PSP(u) == Q && PSS(u) == Q)" 1318169689Skan 21. start = ASAP(u); end = start + II - 1; step = 1 1319169689Skan 22. endif 1320169689Skan 1321169689Skan 23. success = false 1322169689Skan 24. for (c = start ; c != end ; c += step) 1323169689Skan 25. if check_hardware_resources_conflicts(u, PS, c) then 1324169689Skan 26. add_to_partial_schedule_at_time(u, PS, c) 1325169689Skan 27. success = true 1326169689Skan 28. break 1327169689Skan 29. endif 1328169689Skan 30. endfor 1329169689Skan 31. if (success == false) then 1330169689Skan 32. II = II + 1 1331169689Skan 33. if (II > maxII) then 1332169689Skan 34. finish - failed to schedule 1333169689Skan 35. endif 1334169689Skan 36. goto 2. 1335169689Skan 37. endif 1336169689Skan 38. endfor 1337169689Skan 39. if (calculate_register_pressure(PS) > maxRP) then 1338169689Skan 40. goto 32. 1339169689Skan 41. endif 1340169689Skan 42. compute epilogue & prologue 1341169689Skan 43. finish - succeeded to schedule 1342169689Skan*/ 1343169689Skan 1344169689Skan/* A limit on the number of cycles that resource conflicts can span. ??? Should 1345169689Skan be provided by DFA, and be dependent on the type of insn scheduled. Currently 1346169689Skan set to 0 to save compile time. */ 1347169689Skan#define DFA_HISTORY SMS_DFA_HISTORY 1348169689Skan 1349169689Skan/* Given the partial schedule PS, this function calculates and returns the 1350169689Skan cycles in which we can schedule the node with the given index I. 1351169689Skan NOTE: Here we do the backtracking in SMS, in some special cases. We have 1352169689Skan noticed that there are several cases in which we fail to SMS the loop 1353169689Skan because the sched window of a node is empty due to tight data-deps. In 1354169689Skan such cases we want to unschedule some of the predecessors/successors 1355169689Skan until we get non-empty scheduling window. It returns -1 if the 1356169689Skan scheduling window is empty and zero otherwise. */ 1357169689Skan 1358169689Skanstatic int 1359169689Skanget_sched_window (partial_schedule_ptr ps, int *nodes_order, int i, 1360169689Skan sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p) 1361169689Skan{ 1362169689Skan int start, step, end; 1363169689Skan ddg_edge_ptr e; 1364169689Skan int u = nodes_order [i]; 1365169689Skan ddg_node_ptr u_node = &ps->g->nodes[u]; 1366169689Skan sbitmap psp = sbitmap_alloc (ps->g->num_nodes); 1367169689Skan sbitmap pss = sbitmap_alloc (ps->g->num_nodes); 1368169689Skan sbitmap u_node_preds = NODE_PREDECESSORS (u_node); 1369169689Skan sbitmap u_node_succs = NODE_SUCCESSORS (u_node); 1370169689Skan int psp_not_empty; 1371169689Skan int pss_not_empty; 1372169689Skan 1373169689Skan /* 1. compute sched window for u (start, end, step). */ 1374169689Skan sbitmap_zero (psp); 1375169689Skan sbitmap_zero (pss); 1376169689Skan psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes); 1377169689Skan pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes); 1378169689Skan 1379169689Skan if (psp_not_empty && !pss_not_empty) 1380169689Skan { 1381169689Skan int early_start = INT_MIN; 1382169689Skan 1383169689Skan end = INT_MAX; 1384169689Skan for (e = u_node->in; e != 0; e = e->next_in) 1385169689Skan { 1386169689Skan ddg_node_ptr v_node = e->src; 1387169689Skan if (TEST_BIT (sched_nodes, v_node->cuid)) 1388169689Skan { 1389169689Skan int node_st = SCHED_TIME (v_node) 1390169689Skan + e->latency - (e->distance * ii); 1391169689Skan 1392169689Skan early_start = MAX (early_start, node_st); 1393169689Skan 1394169689Skan if (e->data_type == MEM_DEP) 1395169689Skan end = MIN (end, SCHED_TIME (v_node) + ii - 1); 1396169689Skan } 1397169689Skan } 1398169689Skan start = early_start; 1399169689Skan end = MIN (end, early_start + ii); 1400169689Skan step = 1; 1401169689Skan } 1402169689Skan 1403169689Skan else if (!psp_not_empty && pss_not_empty) 1404169689Skan { 1405169689Skan int late_start = INT_MAX; 1406169689Skan 1407169689Skan end = INT_MIN; 1408169689Skan for (e = u_node->out; e != 0; e = e->next_out) 1409169689Skan { 1410169689Skan ddg_node_ptr v_node = e->dest; 1411169689Skan if (TEST_BIT (sched_nodes, v_node->cuid)) 1412169689Skan { 1413169689Skan late_start = MIN (late_start, 1414169689Skan SCHED_TIME (v_node) - e->latency 1415169689Skan + (e->distance * ii)); 1416169689Skan if (e->data_type == MEM_DEP) 1417169689Skan end = MAX (end, SCHED_TIME (v_node) - ii + 1); 1418169689Skan } 1419169689Skan } 1420169689Skan start = late_start; 1421169689Skan end = MAX (end, late_start - ii); 1422169689Skan step = -1; 1423169689Skan } 1424169689Skan 1425169689Skan else if (psp_not_empty && pss_not_empty) 1426169689Skan { 1427169689Skan int early_start = INT_MIN; 1428169689Skan int late_start = INT_MAX; 1429169689Skan 1430169689Skan start = INT_MIN; 1431169689Skan end = INT_MAX; 1432169689Skan for (e = u_node->in; e != 0; e = e->next_in) 1433169689Skan { 1434169689Skan ddg_node_ptr v_node = e->src; 1435169689Skan 1436169689Skan if (TEST_BIT (sched_nodes, v_node->cuid)) 1437169689Skan { 1438169689Skan early_start = MAX (early_start, 1439169689Skan SCHED_TIME (v_node) + e->latency 1440169689Skan - (e->distance * ii)); 1441169689Skan if (e->data_type == MEM_DEP) 1442169689Skan end = MIN (end, SCHED_TIME (v_node) + ii - 1); 1443169689Skan } 1444169689Skan } 1445169689Skan for (e = u_node->out; e != 0; e = e->next_out) 1446169689Skan { 1447169689Skan ddg_node_ptr v_node = e->dest; 1448169689Skan 1449169689Skan if (TEST_BIT (sched_nodes, v_node->cuid)) 1450169689Skan { 1451169689Skan late_start = MIN (late_start, 1452169689Skan SCHED_TIME (v_node) - e->latency 1453169689Skan + (e->distance * ii)); 1454169689Skan if (e->data_type == MEM_DEP) 1455169689Skan start = MAX (start, SCHED_TIME (v_node) - ii + 1); 1456169689Skan } 1457169689Skan } 1458169689Skan start = MAX (start, early_start); 1459169689Skan end = MIN (end, MIN (early_start + ii, late_start + 1)); 1460169689Skan step = 1; 1461169689Skan } 1462169689Skan else /* psp is empty && pss is empty. */ 1463169689Skan { 1464169689Skan start = SCHED_ASAP (u_node); 1465169689Skan end = start + ii; 1466169689Skan step = 1; 1467169689Skan } 1468169689Skan 1469169689Skan *start_p = start; 1470169689Skan *step_p = step; 1471169689Skan *end_p = end; 1472169689Skan sbitmap_free (psp); 1473169689Skan sbitmap_free (pss); 1474169689Skan 1475169689Skan if ((start >= end && step == 1) || (start <= end && step == -1)) 1476169689Skan return -1; 1477169689Skan else 1478169689Skan return 0; 1479169689Skan} 1480169689Skan 1481169689Skan/* This function implements the scheduling algorithm for SMS according to the 1482169689Skan above algorithm. */ 1483169689Skanstatic partial_schedule_ptr 1484169689Skansms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order) 1485169689Skan{ 1486169689Skan int ii = mii; 1487169689Skan int i, c, success; 1488169689Skan int try_again_with_larger_ii = true; 1489169689Skan int num_nodes = g->num_nodes; 1490169689Skan ddg_edge_ptr e; 1491169689Skan int start, end, step; /* Place together into one struct? */ 1492169689Skan sbitmap sched_nodes = sbitmap_alloc (num_nodes); 1493169689Skan sbitmap must_precede = sbitmap_alloc (num_nodes); 1494169689Skan sbitmap must_follow = sbitmap_alloc (num_nodes); 1495169689Skan sbitmap tobe_scheduled = sbitmap_alloc (num_nodes); 1496169689Skan 1497169689Skan partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY); 1498169689Skan 1499169689Skan sbitmap_ones (tobe_scheduled); 1500169689Skan sbitmap_zero (sched_nodes); 1501169689Skan 1502169689Skan while ((! sbitmap_equal (tobe_scheduled, sched_nodes) 1503169689Skan || try_again_with_larger_ii ) && ii < maxii) 1504169689Skan { 1505169689Skan int j; 1506169689Skan bool unscheduled_nodes = false; 1507169689Skan 1508169689Skan if (dump_file) 1509169689Skan fprintf(dump_file, "Starting with ii=%d\n", ii); 1510169689Skan if (try_again_with_larger_ii) 1511169689Skan { 1512169689Skan try_again_with_larger_ii = false; 1513169689Skan sbitmap_zero (sched_nodes); 1514169689Skan } 1515169689Skan 1516169689Skan for (i = 0; i < num_nodes; i++) 1517169689Skan { 1518169689Skan int u = nodes_order[i]; 1519169689Skan ddg_node_ptr u_node = &ps->g->nodes[u]; 1520169689Skan rtx insn = u_node->insn; 1521169689Skan 1522169689Skan if (!INSN_P (insn)) 1523169689Skan { 1524169689Skan RESET_BIT (tobe_scheduled, u); 1525169689Skan continue; 1526169689Skan } 1527169689Skan 1528169689Skan if (JUMP_P (insn)) /* Closing branch handled later. */ 1529169689Skan { 1530169689Skan RESET_BIT (tobe_scheduled, u); 1531169689Skan continue; 1532169689Skan } 1533169689Skan 1534169689Skan if (TEST_BIT (sched_nodes, u)) 1535169689Skan continue; 1536169689Skan 1537169689Skan /* Try to get non-empty scheduling window. */ 1538169689Skan j = i; 1539169689Skan while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0 1540169689Skan && j > 0) 1541169689Skan { 1542169689Skan unscheduled_nodes = true; 1543169689Skan if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1]) 1544169689Skan || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1])) 1545169689Skan { 1546169689Skan ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]); 1547169689Skan RESET_BIT (sched_nodes, nodes_order [j - 1]); 1548169689Skan } 1549169689Skan j--; 1550169689Skan } 1551169689Skan if (j < 0) 1552169689Skan { 1553169689Skan /* ??? Try backtracking instead of immediately ii++? */ 1554169689Skan ii++; 1555169689Skan try_again_with_larger_ii = true; 1556169689Skan reset_partial_schedule (ps, ii); 1557169689Skan break; 1558169689Skan } 1559169689Skan /* 2. Try scheduling u in window. */ 1560169689Skan if (dump_file) 1561169689Skan fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n", 1562169689Skan u, start, end, step); 1563169689Skan 1564169689Skan /* use must_follow & must_precede bitmaps to determine order 1565169689Skan of nodes within the cycle. */ 1566169689Skan sbitmap_zero (must_precede); 1567169689Skan sbitmap_zero (must_follow); 1568169689Skan for (e = u_node->in; e != 0; e = e->next_in) 1569169689Skan if (TEST_BIT (sched_nodes, e->src->cuid) 1570169689Skan && e->latency == (ii * e->distance) 1571169689Skan && start == SCHED_TIME (e->src)) 1572169689Skan SET_BIT (must_precede, e->src->cuid); 1573169689Skan 1574169689Skan for (e = u_node->out; e != 0; e = e->next_out) 1575169689Skan if (TEST_BIT (sched_nodes, e->dest->cuid) 1576169689Skan && e->latency == (ii * e->distance) 1577169689Skan && end == SCHED_TIME (e->dest)) 1578169689Skan SET_BIT (must_follow, e->dest->cuid); 1579169689Skan 1580169689Skan success = 0; 1581169689Skan if ((step > 0 && start < end) || (step < 0 && start > end)) 1582169689Skan for (c = start; c != end; c += step) 1583169689Skan { 1584169689Skan ps_insn_ptr psi; 1585169689Skan 1586169689Skan psi = ps_add_node_check_conflicts (ps, u_node, c, 1587169689Skan must_precede, 1588169689Skan must_follow); 1589169689Skan 1590169689Skan if (psi) 1591169689Skan { 1592169689Skan SCHED_TIME (u_node) = c; 1593169689Skan SET_BIT (sched_nodes, u); 1594169689Skan success = 1; 1595169689Skan if (dump_file) 1596169689Skan fprintf(dump_file, "Schedule in %d\n", c); 1597169689Skan break; 1598169689Skan } 1599169689Skan } 1600169689Skan if (!success) 1601169689Skan { 1602169689Skan /* ??? Try backtracking instead of immediately ii++? */ 1603169689Skan ii++; 1604169689Skan try_again_with_larger_ii = true; 1605169689Skan reset_partial_schedule (ps, ii); 1606169689Skan break; 1607169689Skan } 1608169689Skan if (unscheduled_nodes) 1609169689Skan break; 1610169689Skan 1611169689Skan /* ??? If (success), check register pressure estimates. */ 1612169689Skan } /* Continue with next node. */ 1613169689Skan } /* While try_again_with_larger_ii. */ 1614169689Skan 1615169689Skan sbitmap_free (sched_nodes); 1616169689Skan sbitmap_free (must_precede); 1617169689Skan sbitmap_free (must_follow); 1618169689Skan sbitmap_free (tobe_scheduled); 1619169689Skan 1620169689Skan if (ii >= maxii) 1621169689Skan { 1622169689Skan free_partial_schedule (ps); 1623169689Skan ps = NULL; 1624169689Skan } 1625169689Skan return ps; 1626169689Skan} 1627169689Skan 1628169689Skan 1629169689Skan/* This page implements the algorithm for ordering the nodes of a DDG 1630169689Skan for modulo scheduling, activated through the 1631169689Skan "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */ 1632169689Skan 1633169689Skan#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info) 1634169689Skan#define ASAP(x) (ORDER_PARAMS ((x))->asap) 1635169689Skan#define ALAP(x) (ORDER_PARAMS ((x))->alap) 1636169689Skan#define HEIGHT(x) (ORDER_PARAMS ((x))->height) 1637169689Skan#define MOB(x) (ALAP ((x)) - ASAP ((x))) 1638169689Skan#define DEPTH(x) (ASAP ((x))) 1639169689Skan 1640169689Skantypedef struct node_order_params * nopa; 1641169689Skan 1642169689Skanstatic void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result); 1643169689Skanstatic int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int); 1644169689Skanstatic nopa calculate_order_params (ddg_ptr, int mii); 1645169689Skanstatic int find_max_asap (ddg_ptr, sbitmap); 1646169689Skanstatic int find_max_hv_min_mob (ddg_ptr, sbitmap); 1647169689Skanstatic int find_max_dv_min_mob (ddg_ptr, sbitmap); 1648169689Skan 1649169689Skanenum sms_direction {BOTTOMUP, TOPDOWN}; 1650169689Skan 1651169689Skanstruct node_order_params 1652169689Skan{ 1653169689Skan int asap; 1654169689Skan int alap; 1655169689Skan int height; 1656169689Skan}; 1657169689Skan 1658169689Skan/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */ 1659169689Skanstatic void 1660169689Skancheck_nodes_order (int *node_order, int num_nodes) 1661169689Skan{ 1662169689Skan int i; 1663169689Skan sbitmap tmp = sbitmap_alloc (num_nodes); 1664169689Skan 1665169689Skan sbitmap_zero (tmp); 1666169689Skan 1667169689Skan for (i = 0; i < num_nodes; i++) 1668169689Skan { 1669169689Skan int u = node_order[i]; 1670169689Skan 1671169689Skan gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u)); 1672169689Skan 1673169689Skan SET_BIT (tmp, u); 1674169689Skan } 1675169689Skan 1676169689Skan sbitmap_free (tmp); 1677169689Skan} 1678169689Skan 1679169689Skan/* Order the nodes of G for scheduling and pass the result in 1680169689Skan NODE_ORDER. Also set aux.count of each node to ASAP. 1681169689Skan Return the recMII for the given DDG. */ 1682169689Skanstatic int 1683169689Skansms_order_nodes (ddg_ptr g, int mii, int * node_order) 1684169689Skan{ 1685169689Skan int i; 1686169689Skan int rec_mii = 0; 1687169689Skan ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g); 1688169689Skan 1689169689Skan nopa nops = calculate_order_params (g, mii); 1690169689Skan 1691169689Skan order_nodes_of_sccs (sccs, node_order); 1692169689Skan 1693169689Skan if (sccs->num_sccs > 0) 1694169689Skan /* First SCC has the largest recurrence_length. */ 1695169689Skan rec_mii = sccs->sccs[0]->recurrence_length; 1696169689Skan 1697169689Skan /* Save ASAP before destroying node_order_params. */ 1698169689Skan for (i = 0; i < g->num_nodes; i++) 1699169689Skan { 1700169689Skan ddg_node_ptr v = &g->nodes[i]; 1701169689Skan v->aux.count = ASAP (v); 1702169689Skan } 1703169689Skan 1704169689Skan free (nops); 1705169689Skan free_ddg_all_sccs (sccs); 1706169689Skan check_nodes_order (node_order, g->num_nodes); 1707169689Skan 1708169689Skan return rec_mii; 1709169689Skan} 1710169689Skan 1711169689Skanstatic void 1712169689Skanorder_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order) 1713169689Skan{ 1714169689Skan int i, pos = 0; 1715169689Skan ddg_ptr g = all_sccs->ddg; 1716169689Skan int num_nodes = g->num_nodes; 1717169689Skan sbitmap prev_sccs = sbitmap_alloc (num_nodes); 1718169689Skan sbitmap on_path = sbitmap_alloc (num_nodes); 1719169689Skan sbitmap tmp = sbitmap_alloc (num_nodes); 1720169689Skan sbitmap ones = sbitmap_alloc (num_nodes); 1721169689Skan 1722169689Skan sbitmap_zero (prev_sccs); 1723169689Skan sbitmap_ones (ones); 1724169689Skan 1725169689Skan /* Perfrom the node ordering starting from the SCC with the highest recMII. 1726169689Skan For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */ 1727169689Skan for (i = 0; i < all_sccs->num_sccs; i++) 1728169689Skan { 1729169689Skan ddg_scc_ptr scc = all_sccs->sccs[i]; 1730169689Skan 1731169689Skan /* Add nodes on paths from previous SCCs to the current SCC. */ 1732169689Skan find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes); 1733169689Skan sbitmap_a_or_b (tmp, scc->nodes, on_path); 1734169689Skan 1735169689Skan /* Add nodes on paths from the current SCC to previous SCCs. */ 1736169689Skan find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs); 1737169689Skan sbitmap_a_or_b (tmp, tmp, on_path); 1738169689Skan 1739169689Skan /* Remove nodes of previous SCCs from current extended SCC. */ 1740169689Skan sbitmap_difference (tmp, tmp, prev_sccs); 1741169689Skan 1742169689Skan pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos); 1743169689Skan /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */ 1744169689Skan } 1745169689Skan 1746169689Skan /* Handle the remaining nodes that do not belong to any scc. Each call 1747169689Skan to order_nodes_in_scc handles a single connected component. */ 1748169689Skan while (pos < g->num_nodes) 1749169689Skan { 1750169689Skan sbitmap_difference (tmp, ones, prev_sccs); 1751169689Skan pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos); 1752169689Skan } 1753169689Skan sbitmap_free (prev_sccs); 1754169689Skan sbitmap_free (on_path); 1755169689Skan sbitmap_free (tmp); 1756169689Skan sbitmap_free (ones); 1757169689Skan} 1758169689Skan 1759169689Skan/* MII is needed if we consider backarcs (that do not close recursive cycles). */ 1760169689Skanstatic struct node_order_params * 1761169689Skancalculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED) 1762169689Skan{ 1763169689Skan int u; 1764169689Skan int max_asap; 1765169689Skan int num_nodes = g->num_nodes; 1766169689Skan ddg_edge_ptr e; 1767169689Skan /* Allocate a place to hold ordering params for each node in the DDG. */ 1768169689Skan nopa node_order_params_arr; 1769169689Skan 1770169689Skan /* Initialize of ASAP/ALAP/HEIGHT to zero. */ 1771169689Skan node_order_params_arr = (nopa) xcalloc (num_nodes, 1772169689Skan sizeof (struct node_order_params)); 1773169689Skan 1774169689Skan /* Set the aux pointer of each node to point to its order_params structure. */ 1775169689Skan for (u = 0; u < num_nodes; u++) 1776169689Skan g->nodes[u].aux.info = &node_order_params_arr[u]; 1777169689Skan 1778169689Skan /* Disregarding a backarc from each recursive cycle to obtain a DAG, 1779169689Skan calculate ASAP, ALAP, mobility, distance, and height for each node 1780169689Skan in the dependence (direct acyclic) graph. */ 1781169689Skan 1782169689Skan /* We assume that the nodes in the array are in topological order. */ 1783169689Skan 1784169689Skan max_asap = 0; 1785169689Skan for (u = 0; u < num_nodes; u++) 1786169689Skan { 1787169689Skan ddg_node_ptr u_node = &g->nodes[u]; 1788169689Skan 1789169689Skan ASAP (u_node) = 0; 1790169689Skan for (e = u_node->in; e; e = e->next_in) 1791169689Skan if (e->distance == 0) 1792169689Skan ASAP (u_node) = MAX (ASAP (u_node), 1793169689Skan ASAP (e->src) + e->latency); 1794169689Skan max_asap = MAX (max_asap, ASAP (u_node)); 1795169689Skan } 1796169689Skan 1797169689Skan for (u = num_nodes - 1; u > -1; u--) 1798169689Skan { 1799169689Skan ddg_node_ptr u_node = &g->nodes[u]; 1800169689Skan 1801169689Skan ALAP (u_node) = max_asap; 1802169689Skan HEIGHT (u_node) = 0; 1803169689Skan for (e = u_node->out; e; e = e->next_out) 1804169689Skan if (e->distance == 0) 1805169689Skan { 1806169689Skan ALAP (u_node) = MIN (ALAP (u_node), 1807169689Skan ALAP (e->dest) - e->latency); 1808169689Skan HEIGHT (u_node) = MAX (HEIGHT (u_node), 1809169689Skan HEIGHT (e->dest) + e->latency); 1810169689Skan } 1811169689Skan } 1812169689Skan 1813169689Skan return node_order_params_arr; 1814169689Skan} 1815169689Skan 1816169689Skanstatic int 1817169689Skanfind_max_asap (ddg_ptr g, sbitmap nodes) 1818169689Skan{ 1819169689Skan unsigned int u = 0; 1820169689Skan int max_asap = -1; 1821169689Skan int result = -1; 1822169689Skan sbitmap_iterator sbi; 1823169689Skan 1824169689Skan EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi) 1825169689Skan { 1826169689Skan ddg_node_ptr u_node = &g->nodes[u]; 1827169689Skan 1828169689Skan if (max_asap < ASAP (u_node)) 1829169689Skan { 1830169689Skan max_asap = ASAP (u_node); 1831169689Skan result = u; 1832169689Skan } 1833169689Skan } 1834169689Skan return result; 1835169689Skan} 1836169689Skan 1837169689Skanstatic int 1838169689Skanfind_max_hv_min_mob (ddg_ptr g, sbitmap nodes) 1839169689Skan{ 1840169689Skan unsigned int u = 0; 1841169689Skan int max_hv = -1; 1842169689Skan int min_mob = INT_MAX; 1843169689Skan int result = -1; 1844169689Skan sbitmap_iterator sbi; 1845169689Skan 1846169689Skan EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi) 1847169689Skan { 1848169689Skan ddg_node_ptr u_node = &g->nodes[u]; 1849169689Skan 1850169689Skan if (max_hv < HEIGHT (u_node)) 1851169689Skan { 1852169689Skan max_hv = HEIGHT (u_node); 1853169689Skan min_mob = MOB (u_node); 1854169689Skan result = u; 1855169689Skan } 1856169689Skan else if ((max_hv == HEIGHT (u_node)) 1857169689Skan && (min_mob > MOB (u_node))) 1858169689Skan { 1859169689Skan min_mob = MOB (u_node); 1860169689Skan result = u; 1861169689Skan } 1862169689Skan } 1863169689Skan return result; 1864169689Skan} 1865169689Skan 1866169689Skanstatic int 1867169689Skanfind_max_dv_min_mob (ddg_ptr g, sbitmap nodes) 1868169689Skan{ 1869169689Skan unsigned int u = 0; 1870169689Skan int max_dv = -1; 1871169689Skan int min_mob = INT_MAX; 1872169689Skan int result = -1; 1873169689Skan sbitmap_iterator sbi; 1874169689Skan 1875169689Skan EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi) 1876169689Skan { 1877169689Skan ddg_node_ptr u_node = &g->nodes[u]; 1878169689Skan 1879169689Skan if (max_dv < DEPTH (u_node)) 1880169689Skan { 1881169689Skan max_dv = DEPTH (u_node); 1882169689Skan min_mob = MOB (u_node); 1883169689Skan result = u; 1884169689Skan } 1885169689Skan else if ((max_dv == DEPTH (u_node)) 1886169689Skan && (min_mob > MOB (u_node))) 1887169689Skan { 1888169689Skan min_mob = MOB (u_node); 1889169689Skan result = u; 1890169689Skan } 1891169689Skan } 1892169689Skan return result; 1893169689Skan} 1894169689Skan 1895169689Skan/* Places the nodes of SCC into the NODE_ORDER array starting 1896169689Skan at position POS, according to the SMS ordering algorithm. 1897169689Skan NODES_ORDERED (in&out parameter) holds the bitset of all nodes in 1898169689Skan the NODE_ORDER array, starting from position zero. */ 1899169689Skanstatic int 1900169689Skanorder_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc, 1901169689Skan int * node_order, int pos) 1902169689Skan{ 1903169689Skan enum sms_direction dir; 1904169689Skan int num_nodes = g->num_nodes; 1905169689Skan sbitmap workset = sbitmap_alloc (num_nodes); 1906169689Skan sbitmap tmp = sbitmap_alloc (num_nodes); 1907169689Skan sbitmap zero_bitmap = sbitmap_alloc (num_nodes); 1908169689Skan sbitmap predecessors = sbitmap_alloc (num_nodes); 1909169689Skan sbitmap successors = sbitmap_alloc (num_nodes); 1910169689Skan 1911169689Skan sbitmap_zero (predecessors); 1912169689Skan find_predecessors (predecessors, g, nodes_ordered); 1913169689Skan 1914169689Skan sbitmap_zero (successors); 1915169689Skan find_successors (successors, g, nodes_ordered); 1916169689Skan 1917169689Skan sbitmap_zero (tmp); 1918169689Skan if (sbitmap_a_and_b_cg (tmp, predecessors, scc)) 1919169689Skan { 1920169689Skan sbitmap_copy (workset, tmp); 1921169689Skan dir = BOTTOMUP; 1922169689Skan } 1923169689Skan else if (sbitmap_a_and_b_cg (tmp, successors, scc)) 1924169689Skan { 1925169689Skan sbitmap_copy (workset, tmp); 1926169689Skan dir = TOPDOWN; 1927169689Skan } 1928169689Skan else 1929169689Skan { 1930169689Skan int u; 1931169689Skan 1932169689Skan sbitmap_zero (workset); 1933169689Skan if ((u = find_max_asap (g, scc)) >= 0) 1934169689Skan SET_BIT (workset, u); 1935169689Skan dir = BOTTOMUP; 1936169689Skan } 1937169689Skan 1938169689Skan sbitmap_zero (zero_bitmap); 1939169689Skan while (!sbitmap_equal (workset, zero_bitmap)) 1940169689Skan { 1941169689Skan int v; 1942169689Skan ddg_node_ptr v_node; 1943169689Skan sbitmap v_node_preds; 1944169689Skan sbitmap v_node_succs; 1945169689Skan 1946169689Skan if (dir == TOPDOWN) 1947169689Skan { 1948169689Skan while (!sbitmap_equal (workset, zero_bitmap)) 1949169689Skan { 1950169689Skan v = find_max_hv_min_mob (g, workset); 1951169689Skan v_node = &g->nodes[v]; 1952169689Skan node_order[pos++] = v; 1953169689Skan v_node_succs = NODE_SUCCESSORS (v_node); 1954169689Skan sbitmap_a_and_b (tmp, v_node_succs, scc); 1955169689Skan 1956169689Skan /* Don't consider the already ordered successors again. */ 1957169689Skan sbitmap_difference (tmp, tmp, nodes_ordered); 1958169689Skan sbitmap_a_or_b (workset, workset, tmp); 1959169689Skan RESET_BIT (workset, v); 1960169689Skan SET_BIT (nodes_ordered, v); 1961169689Skan } 1962169689Skan dir = BOTTOMUP; 1963169689Skan sbitmap_zero (predecessors); 1964169689Skan find_predecessors (predecessors, g, nodes_ordered); 1965169689Skan sbitmap_a_and_b (workset, predecessors, scc); 1966169689Skan } 1967169689Skan else 1968169689Skan { 1969169689Skan while (!sbitmap_equal (workset, zero_bitmap)) 1970169689Skan { 1971169689Skan v = find_max_dv_min_mob (g, workset); 1972169689Skan v_node = &g->nodes[v]; 1973169689Skan node_order[pos++] = v; 1974169689Skan v_node_preds = NODE_PREDECESSORS (v_node); 1975169689Skan sbitmap_a_and_b (tmp, v_node_preds, scc); 1976169689Skan 1977169689Skan /* Don't consider the already ordered predecessors again. */ 1978169689Skan sbitmap_difference (tmp, tmp, nodes_ordered); 1979169689Skan sbitmap_a_or_b (workset, workset, tmp); 1980169689Skan RESET_BIT (workset, v); 1981169689Skan SET_BIT (nodes_ordered, v); 1982169689Skan } 1983169689Skan dir = TOPDOWN; 1984169689Skan sbitmap_zero (successors); 1985169689Skan find_successors (successors, g, nodes_ordered); 1986169689Skan sbitmap_a_and_b (workset, successors, scc); 1987169689Skan } 1988169689Skan } 1989169689Skan sbitmap_free (tmp); 1990169689Skan sbitmap_free (workset); 1991169689Skan sbitmap_free (zero_bitmap); 1992169689Skan sbitmap_free (predecessors); 1993169689Skan sbitmap_free (successors); 1994169689Skan return pos; 1995169689Skan} 1996169689Skan 1997169689Skan 1998169689Skan/* This page contains functions for manipulating partial-schedules during 1999169689Skan modulo scheduling. */ 2000169689Skan 2001169689Skan/* Create a partial schedule and allocate a memory to hold II rows. */ 2002169689Skan 2003169689Skanstatic partial_schedule_ptr 2004169689Skancreate_partial_schedule (int ii, ddg_ptr g, int history) 2005169689Skan{ 2006169689Skan partial_schedule_ptr ps = XNEW (struct partial_schedule); 2007169689Skan ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr)); 2008169689Skan ps->ii = ii; 2009169689Skan ps->history = history; 2010169689Skan ps->min_cycle = INT_MAX; 2011169689Skan ps->max_cycle = INT_MIN; 2012169689Skan ps->g = g; 2013169689Skan 2014169689Skan return ps; 2015169689Skan} 2016169689Skan 2017169689Skan/* Free the PS_INSNs in rows array of the given partial schedule. 2018169689Skan ??? Consider caching the PS_INSN's. */ 2019169689Skanstatic void 2020169689Skanfree_ps_insns (partial_schedule_ptr ps) 2021169689Skan{ 2022169689Skan int i; 2023169689Skan 2024169689Skan for (i = 0; i < ps->ii; i++) 2025169689Skan { 2026169689Skan while (ps->rows[i]) 2027169689Skan { 2028169689Skan ps_insn_ptr ps_insn = ps->rows[i]->next_in_row; 2029169689Skan 2030169689Skan free (ps->rows[i]); 2031169689Skan ps->rows[i] = ps_insn; 2032169689Skan } 2033169689Skan ps->rows[i] = NULL; 2034169689Skan } 2035169689Skan} 2036169689Skan 2037169689Skan/* Free all the memory allocated to the partial schedule. */ 2038169689Skan 2039169689Skanstatic void 2040169689Skanfree_partial_schedule (partial_schedule_ptr ps) 2041169689Skan{ 2042169689Skan if (!ps) 2043169689Skan return; 2044169689Skan free_ps_insns (ps); 2045169689Skan free (ps->rows); 2046169689Skan free (ps); 2047169689Skan} 2048169689Skan 2049169689Skan/* Clear the rows array with its PS_INSNs, and create a new one with 2050169689Skan NEW_II rows. */ 2051169689Skan 2052169689Skanstatic void 2053169689Skanreset_partial_schedule (partial_schedule_ptr ps, int new_ii) 2054169689Skan{ 2055169689Skan if (!ps) 2056169689Skan return; 2057169689Skan free_ps_insns (ps); 2058169689Skan if (new_ii == ps->ii) 2059169689Skan return; 2060169689Skan ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii 2061169689Skan * sizeof (ps_insn_ptr)); 2062169689Skan memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr)); 2063169689Skan ps->ii = new_ii; 2064169689Skan ps->min_cycle = INT_MAX; 2065169689Skan ps->max_cycle = INT_MIN; 2066169689Skan} 2067169689Skan 2068169689Skan/* Prints the partial schedule as an ii rows array, for each rows 2069169689Skan print the ids of the insns in it. */ 2070169689Skanvoid 2071169689Skanprint_partial_schedule (partial_schedule_ptr ps, FILE *dump) 2072169689Skan{ 2073169689Skan int i; 2074169689Skan 2075169689Skan for (i = 0; i < ps->ii; i++) 2076169689Skan { 2077169689Skan ps_insn_ptr ps_i = ps->rows[i]; 2078169689Skan 2079169689Skan fprintf (dump, "\n[CYCLE %d ]: ", i); 2080169689Skan while (ps_i) 2081169689Skan { 2082169689Skan fprintf (dump, "%d, ", 2083169689Skan INSN_UID (ps_i->node->insn)); 2084169689Skan ps_i = ps_i->next_in_row; 2085169689Skan } 2086169689Skan } 2087169689Skan} 2088169689Skan 2089169689Skan/* Creates an object of PS_INSN and initializes it to the given parameters. */ 2090169689Skanstatic ps_insn_ptr 2091169689Skancreate_ps_insn (ddg_node_ptr node, int rest_count, int cycle) 2092169689Skan{ 2093169689Skan ps_insn_ptr ps_i = XNEW (struct ps_insn); 2094169689Skan 2095169689Skan ps_i->node = node; 2096169689Skan ps_i->next_in_row = NULL; 2097169689Skan ps_i->prev_in_row = NULL; 2098169689Skan ps_i->row_rest_count = rest_count; 2099169689Skan ps_i->cycle = cycle; 2100169689Skan 2101169689Skan return ps_i; 2102169689Skan} 2103169689Skan 2104169689Skan 2105169689Skan/* Removes the given PS_INSN from the partial schedule. Returns false if the 2106169689Skan node is not found in the partial schedule, else returns true. */ 2107169689Skanstatic bool 2108169689Skanremove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i) 2109169689Skan{ 2110169689Skan int row; 2111169689Skan 2112169689Skan if (!ps || !ps_i) 2113169689Skan return false; 2114169689Skan 2115169689Skan row = SMODULO (ps_i->cycle, ps->ii); 2116169689Skan if (! ps_i->prev_in_row) 2117169689Skan { 2118169689Skan if (ps_i != ps->rows[row]) 2119169689Skan return false; 2120169689Skan 2121169689Skan ps->rows[row] = ps_i->next_in_row; 2122169689Skan if (ps->rows[row]) 2123169689Skan ps->rows[row]->prev_in_row = NULL; 2124169689Skan } 2125169689Skan else 2126169689Skan { 2127169689Skan ps_i->prev_in_row->next_in_row = ps_i->next_in_row; 2128169689Skan if (ps_i->next_in_row) 2129169689Skan ps_i->next_in_row->prev_in_row = ps_i->prev_in_row; 2130169689Skan } 2131169689Skan free (ps_i); 2132169689Skan return true; 2133169689Skan} 2134169689Skan 2135169689Skan/* Unlike what literature describes for modulo scheduling (which focuses 2136169689Skan on VLIW machines) the order of the instructions inside a cycle is 2137169689Skan important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know 2138169689Skan where the current instruction should go relative to the already 2139169689Skan scheduled instructions in the given cycle. Go over these 2140169689Skan instructions and find the first possible column to put it in. */ 2141169689Skanstatic bool 2142169689Skanps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i, 2143169689Skan sbitmap must_precede, sbitmap must_follow) 2144169689Skan{ 2145169689Skan ps_insn_ptr next_ps_i; 2146169689Skan ps_insn_ptr first_must_follow = NULL; 2147169689Skan ps_insn_ptr last_must_precede = NULL; 2148169689Skan int row; 2149169689Skan 2150169689Skan if (! ps_i) 2151169689Skan return false; 2152169689Skan 2153169689Skan row = SMODULO (ps_i->cycle, ps->ii); 2154169689Skan 2155169689Skan /* Find the first must follow and the last must precede 2156169689Skan and insert the node immediately after the must precede 2157169689Skan but make sure that it there is no must follow after it. */ 2158169689Skan for (next_ps_i = ps->rows[row]; 2159169689Skan next_ps_i; 2160169689Skan next_ps_i = next_ps_i->next_in_row) 2161169689Skan { 2162169689Skan if (TEST_BIT (must_follow, next_ps_i->node->cuid) 2163169689Skan && ! first_must_follow) 2164169689Skan first_must_follow = next_ps_i; 2165169689Skan if (TEST_BIT (must_precede, next_ps_i->node->cuid)) 2166169689Skan { 2167169689Skan /* If we have already met a node that must follow, then 2168169689Skan there is no possible column. */ 2169169689Skan if (first_must_follow) 2170169689Skan return false; 2171169689Skan else 2172169689Skan last_must_precede = next_ps_i; 2173169689Skan } 2174169689Skan } 2175169689Skan 2176169689Skan /* Now insert the node after INSERT_AFTER_PSI. */ 2177169689Skan 2178169689Skan if (! last_must_precede) 2179169689Skan { 2180169689Skan ps_i->next_in_row = ps->rows[row]; 2181169689Skan ps_i->prev_in_row = NULL; 2182169689Skan if (ps_i->next_in_row) 2183169689Skan ps_i->next_in_row->prev_in_row = ps_i; 2184169689Skan ps->rows[row] = ps_i; 2185169689Skan } 2186169689Skan else 2187169689Skan { 2188169689Skan ps_i->next_in_row = last_must_precede->next_in_row; 2189169689Skan last_must_precede->next_in_row = ps_i; 2190169689Skan ps_i->prev_in_row = last_must_precede; 2191169689Skan if (ps_i->next_in_row) 2192169689Skan ps_i->next_in_row->prev_in_row = ps_i; 2193169689Skan } 2194169689Skan 2195169689Skan return true; 2196169689Skan} 2197169689Skan 2198169689Skan/* Advances the PS_INSN one column in its current row; returns false 2199169689Skan in failure and true in success. Bit N is set in MUST_FOLLOW if 2200169689Skan the node with cuid N must be come after the node pointed to by 2201169689Skan PS_I when scheduled in the same cycle. */ 2202169689Skanstatic int 2203169689Skanps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i, 2204169689Skan sbitmap must_follow) 2205169689Skan{ 2206169689Skan ps_insn_ptr prev, next; 2207169689Skan int row; 2208169689Skan ddg_node_ptr next_node; 2209169689Skan 2210169689Skan if (!ps || !ps_i) 2211169689Skan return false; 2212169689Skan 2213169689Skan row = SMODULO (ps_i->cycle, ps->ii); 2214169689Skan 2215169689Skan if (! ps_i->next_in_row) 2216169689Skan return false; 2217169689Skan 2218169689Skan next_node = ps_i->next_in_row->node; 2219169689Skan 2220169689Skan /* Check if next_in_row is dependent on ps_i, both having same sched 2221169689Skan times (typically ANTI_DEP). If so, ps_i cannot skip over it. */ 2222169689Skan if (TEST_BIT (must_follow, next_node->cuid)) 2223169689Skan return false; 2224169689Skan 2225169689Skan /* Advance PS_I over its next_in_row in the doubly linked list. */ 2226169689Skan prev = ps_i->prev_in_row; 2227169689Skan next = ps_i->next_in_row; 2228169689Skan 2229169689Skan if (ps_i == ps->rows[row]) 2230169689Skan ps->rows[row] = next; 2231169689Skan 2232169689Skan ps_i->next_in_row = next->next_in_row; 2233169689Skan 2234169689Skan if (next->next_in_row) 2235169689Skan next->next_in_row->prev_in_row = ps_i; 2236169689Skan 2237169689Skan next->next_in_row = ps_i; 2238169689Skan ps_i->prev_in_row = next; 2239169689Skan 2240169689Skan next->prev_in_row = prev; 2241169689Skan if (prev) 2242169689Skan prev->next_in_row = next; 2243169689Skan 2244169689Skan return true; 2245169689Skan} 2246169689Skan 2247169689Skan/* Inserts a DDG_NODE to the given partial schedule at the given cycle. 2248169689Skan Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is 2249169689Skan set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come 2250169689Skan before/after (respectively) the node pointed to by PS_I when scheduled 2251169689Skan in the same cycle. */ 2252169689Skanstatic ps_insn_ptr 2253169689Skanadd_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle, 2254169689Skan sbitmap must_precede, sbitmap must_follow) 2255169689Skan{ 2256169689Skan ps_insn_ptr ps_i; 2257169689Skan int rest_count = 1; 2258169689Skan int row = SMODULO (cycle, ps->ii); 2259169689Skan 2260169689Skan if (ps->rows[row] 2261169689Skan && ps->rows[row]->row_rest_count >= issue_rate) 2262169689Skan return NULL; 2263169689Skan 2264169689Skan if (ps->rows[row]) 2265169689Skan rest_count += ps->rows[row]->row_rest_count; 2266169689Skan 2267169689Skan ps_i = create_ps_insn (node, rest_count, cycle); 2268169689Skan 2269169689Skan /* Finds and inserts PS_I according to MUST_FOLLOW and 2270169689Skan MUST_PRECEDE. */ 2271169689Skan if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow)) 2272169689Skan { 2273169689Skan free (ps_i); 2274169689Skan return NULL; 2275169689Skan } 2276169689Skan 2277169689Skan return ps_i; 2278169689Skan} 2279169689Skan 2280169689Skan/* Advance time one cycle. Assumes DFA is being used. */ 2281169689Skanstatic void 2282169689Skanadvance_one_cycle (void) 2283169689Skan{ 2284169689Skan if (targetm.sched.dfa_pre_cycle_insn) 2285169689Skan state_transition (curr_state, 2286169689Skan targetm.sched.dfa_pre_cycle_insn ()); 2287169689Skan 2288169689Skan state_transition (curr_state, NULL); 2289169689Skan 2290169689Skan if (targetm.sched.dfa_post_cycle_insn) 2291169689Skan state_transition (curr_state, 2292169689Skan targetm.sched.dfa_post_cycle_insn ()); 2293169689Skan} 2294169689Skan 2295169689Skan/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds 2296169689Skan the number of cycles according to DFA that the kernel fits in, 2297169689Skan we use this to check if we done well with SMS after we add 2298169689Skan register moves. In some cases register moves overhead makes 2299169689Skan it even worse than the original loop. We want SMS to be performed 2300169689Skan when it gives less cycles after register moves are added. */ 2301169689Skanstatic int 2302169689Skankernel_number_of_cycles (rtx first_insn, rtx last_insn) 2303169689Skan{ 2304169689Skan int cycles = 0; 2305169689Skan rtx insn; 2306169689Skan int can_issue_more = issue_rate; 2307169689Skan 2308169689Skan state_reset (curr_state); 2309169689Skan 2310169689Skan for (insn = first_insn; 2311169689Skan insn != NULL_RTX && insn != last_insn; 2312169689Skan insn = NEXT_INSN (insn)) 2313169689Skan { 2314169689Skan if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE) 2315169689Skan continue; 2316169689Skan 2317169689Skan /* Check if there is room for the current insn. */ 2318169689Skan if (!can_issue_more || state_dead_lock_p (curr_state)) 2319169689Skan { 2320169689Skan cycles ++; 2321169689Skan advance_one_cycle (); 2322169689Skan can_issue_more = issue_rate; 2323169689Skan } 2324169689Skan 2325169689Skan /* Update the DFA state and return with failure if the DFA found 2326169689Skan recource conflicts. */ 2327169689Skan if (state_transition (curr_state, insn) >= 0) 2328169689Skan { 2329169689Skan cycles ++; 2330169689Skan advance_one_cycle (); 2331169689Skan can_issue_more = issue_rate; 2332169689Skan } 2333169689Skan 2334169689Skan if (targetm.sched.variable_issue) 2335169689Skan can_issue_more = 2336169689Skan targetm.sched.variable_issue (sched_dump, sched_verbose, 2337169689Skan insn, can_issue_more); 2338169689Skan /* A naked CLOBBER or USE generates no instruction, so don't 2339169689Skan let them consume issue slots. */ 2340169689Skan else if (GET_CODE (PATTERN (insn)) != USE 2341169689Skan && GET_CODE (PATTERN (insn)) != CLOBBER) 2342169689Skan can_issue_more--; 2343169689Skan } 2344169689Skan return cycles; 2345169689Skan} 2346169689Skan 2347169689Skan/* Checks if PS has resource conflicts according to DFA, starting from 2348169689Skan FROM cycle to TO cycle; returns true if there are conflicts and false 2349169689Skan if there are no conflicts. Assumes DFA is being used. */ 2350169689Skanstatic int 2351169689Skanps_has_conflicts (partial_schedule_ptr ps, int from, int to) 2352169689Skan{ 2353169689Skan int cycle; 2354169689Skan 2355169689Skan state_reset (curr_state); 2356169689Skan 2357169689Skan for (cycle = from; cycle <= to; cycle++) 2358169689Skan { 2359169689Skan ps_insn_ptr crr_insn; 2360169689Skan /* Holds the remaining issue slots in the current row. */ 2361169689Skan int can_issue_more = issue_rate; 2362169689Skan 2363169689Skan /* Walk through the DFA for the current row. */ 2364169689Skan for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)]; 2365169689Skan crr_insn; 2366169689Skan crr_insn = crr_insn->next_in_row) 2367169689Skan { 2368169689Skan rtx insn = crr_insn->node->insn; 2369169689Skan 2370169689Skan if (!INSN_P (insn)) 2371169689Skan continue; 2372169689Skan 2373169689Skan /* Check if there is room for the current insn. */ 2374169689Skan if (!can_issue_more || state_dead_lock_p (curr_state)) 2375169689Skan return true; 2376169689Skan 2377169689Skan /* Update the DFA state and return with failure if the DFA found 2378169689Skan recource conflicts. */ 2379169689Skan if (state_transition (curr_state, insn) >= 0) 2380169689Skan return true; 2381169689Skan 2382169689Skan if (targetm.sched.variable_issue) 2383169689Skan can_issue_more = 2384169689Skan targetm.sched.variable_issue (sched_dump, sched_verbose, 2385169689Skan insn, can_issue_more); 2386169689Skan /* A naked CLOBBER or USE generates no instruction, so don't 2387169689Skan let them consume issue slots. */ 2388169689Skan else if (GET_CODE (PATTERN (insn)) != USE 2389169689Skan && GET_CODE (PATTERN (insn)) != CLOBBER) 2390169689Skan can_issue_more--; 2391169689Skan } 2392169689Skan 2393169689Skan /* Advance the DFA to the next cycle. */ 2394169689Skan advance_one_cycle (); 2395169689Skan } 2396169689Skan return false; 2397169689Skan} 2398169689Skan 2399169689Skan/* Checks if the given node causes resource conflicts when added to PS at 2400169689Skan cycle C. If not the node is added to PS and returned; otherwise zero 2401169689Skan is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with 2402169689Skan cuid N must be come before/after (respectively) the node pointed to by 2403169689Skan PS_I when scheduled in the same cycle. */ 2404169689Skanps_insn_ptr 2405169689Skanps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n, 2406169689Skan int c, sbitmap must_precede, 2407169689Skan sbitmap must_follow) 2408169689Skan{ 2409169689Skan int has_conflicts = 0; 2410169689Skan ps_insn_ptr ps_i; 2411169689Skan 2412169689Skan /* First add the node to the PS, if this succeeds check for 2413169689Skan conflicts, trying different issue slots in the same row. */ 2414169689Skan if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow))) 2415169689Skan return NULL; /* Failed to insert the node at the given cycle. */ 2416169689Skan 2417169689Skan has_conflicts = ps_has_conflicts (ps, c, c) 2418169689Skan || (ps->history > 0 2419169689Skan && ps_has_conflicts (ps, 2420169689Skan c - ps->history, 2421169689Skan c + ps->history)); 2422169689Skan 2423169689Skan /* Try different issue slots to find one that the given node can be 2424169689Skan scheduled in without conflicts. */ 2425169689Skan while (has_conflicts) 2426169689Skan { 2427169689Skan if (! ps_insn_advance_column (ps, ps_i, must_follow)) 2428169689Skan break; 2429169689Skan has_conflicts = ps_has_conflicts (ps, c, c) 2430169689Skan || (ps->history > 0 2431169689Skan && ps_has_conflicts (ps, 2432169689Skan c - ps->history, 2433169689Skan c + ps->history)); 2434169689Skan } 2435169689Skan 2436169689Skan if (has_conflicts) 2437169689Skan { 2438169689Skan remove_node_from_ps (ps, ps_i); 2439169689Skan return NULL; 2440169689Skan } 2441169689Skan 2442169689Skan ps->min_cycle = MIN (ps->min_cycle, c); 2443169689Skan ps->max_cycle = MAX (ps->max_cycle, c); 2444169689Skan return ps_i; 2445169689Skan} 2446169689Skan 2447169689Skan/* Rotate the rows of PS such that insns scheduled at time 2448169689Skan START_CYCLE will appear in row 0. Updates max/min_cycles. */ 2449169689Skanvoid 2450169689Skanrotate_partial_schedule (partial_schedule_ptr ps, int start_cycle) 2451169689Skan{ 2452169689Skan int i, row, backward_rotates; 2453169689Skan int last_row = ps->ii - 1; 2454169689Skan 2455169689Skan if (start_cycle == 0) 2456169689Skan return; 2457169689Skan 2458169689Skan backward_rotates = SMODULO (start_cycle, ps->ii); 2459169689Skan 2460169689Skan /* Revisit later and optimize this into a single loop. */ 2461169689Skan for (i = 0; i < backward_rotates; i++) 2462169689Skan { 2463169689Skan ps_insn_ptr first_row = ps->rows[0]; 2464169689Skan 2465169689Skan for (row = 0; row < last_row; row++) 2466169689Skan ps->rows[row] = ps->rows[row+1]; 2467169689Skan 2468169689Skan ps->rows[last_row] = first_row; 2469169689Skan } 2470169689Skan 2471169689Skan ps->max_cycle -= start_cycle; 2472169689Skan ps->min_cycle -= start_cycle; 2473169689Skan} 2474169689Skan 2475169689Skan/* Remove the node N from the partial schedule PS; because we restart the DFA 2476169689Skan each time we want to check for resource conflicts; this is equivalent to 2477169689Skan unscheduling the node N. */ 2478169689Skanstatic bool 2479169689Skanps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n) 2480169689Skan{ 2481169689Skan ps_insn_ptr ps_i; 2482169689Skan int row = SMODULO (SCHED_TIME (n), ps->ii); 2483169689Skan 2484169689Skan if (row < 0 || row > ps->ii) 2485169689Skan return false; 2486169689Skan 2487169689Skan for (ps_i = ps->rows[row]; 2488169689Skan ps_i && ps_i->node != n; 2489169689Skan ps_i = ps_i->next_in_row); 2490169689Skan if (!ps_i) 2491169689Skan return false; 2492169689Skan 2493169689Skan return remove_node_from_ps (ps, ps_i); 2494169689Skan} 2495169689Skan#endif /* INSN_SCHEDULING */ 2496169689Skan 2497169689Skanstatic bool 2498169689Skangate_handle_sms (void) 2499169689Skan{ 2500169689Skan return (optimize > 0 && flag_modulo_sched); 2501169689Skan} 2502169689Skan 2503169689Skan 2504169689Skan/* Run instruction scheduler. */ 2505169689Skan/* Perform SMS module scheduling. */ 2506169689Skanstatic unsigned int 2507169689Skanrest_of_handle_sms (void) 2508169689Skan{ 2509169689Skan#ifdef INSN_SCHEDULING 2510169689Skan basic_block bb; 2511169689Skan 2512169689Skan /* We want to be able to create new pseudos. */ 2513169689Skan no_new_pseudos = 0; 2514169689Skan /* Collect loop information to be used in SMS. */ 2515169689Skan cfg_layout_initialize (CLEANUP_UPDATE_LIFE); 2516169689Skan sms_schedule (); 2517169689Skan 2518169689Skan /* Update the life information, because we add pseudos. */ 2519169689Skan max_regno = max_reg_num (); 2520169689Skan allocate_reg_info (max_regno, FALSE, FALSE); 2521169689Skan update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES, 2522169689Skan (PROP_DEATH_NOTES 2523169689Skan | PROP_REG_INFO 2524169689Skan | PROP_KILL_DEAD_CODE 2525169689Skan | PROP_SCAN_DEAD_CODE)); 2526169689Skan 2527169689Skan no_new_pseudos = 1; 2528169689Skan 2529169689Skan /* Finalize layout changes. */ 2530169689Skan FOR_EACH_BB (bb) 2531169689Skan if (bb->next_bb != EXIT_BLOCK_PTR) 2532169689Skan bb->aux = bb->next_bb; 2533169689Skan cfg_layout_finalize (); 2534169689Skan free_dominance_info (CDI_DOMINATORS); 2535169689Skan#endif /* INSN_SCHEDULING */ 2536169689Skan return 0; 2537169689Skan} 2538169689Skan 2539169689Skanstruct tree_opt_pass pass_sms = 2540169689Skan{ 2541169689Skan "sms", /* name */ 2542169689Skan gate_handle_sms, /* gate */ 2543169689Skan rest_of_handle_sms, /* execute */ 2544169689Skan NULL, /* sub */ 2545169689Skan NULL, /* next */ 2546169689Skan 0, /* static_pass_number */ 2547169689Skan TV_SMS, /* tv_id */ 2548169689Skan 0, /* properties_required */ 2549169689Skan 0, /* properties_provided */ 2550169689Skan 0, /* properties_destroyed */ 2551169689Skan TODO_dump_func, /* todo_flags_start */ 2552169689Skan TODO_dump_func | 2553169689Skan TODO_ggc_collect, /* todo_flags_finish */ 2554169689Skan 'm' /* letter */ 2555169689Skan}; 2556169689Skan 2557