1/* Swing Modulo Scheduling implementation.
2   Copyright (C) 2004, 2005, 2006
3   Free Software Foundation, Inc.
4   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING.  If not, write to the Free
20Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
2102110-1301, USA.  */
22
23
24#include "config.h"
25#include "system.h"
26#include "coretypes.h"
27#include "tm.h"
28#include "toplev.h"
29#include "rtl.h"
30#include "tm_p.h"
31#include "hard-reg-set.h"
32#include "regs.h"
33#include "function.h"
34#include "flags.h"
35#include "insn-config.h"
36#include "insn-attr.h"
37#include "except.h"
38#include "toplev.h"
39#include "recog.h"
40#include "sched-int.h"
41#include "target.h"
42#include "cfglayout.h"
43#include "cfgloop.h"
44#include "cfghooks.h"
45#include "expr.h"
46#include "params.h"
47#include "gcov-io.h"
48#include "df.h"
49#include "ddg.h"
50#include "timevar.h"
51#include "tree-pass.h"
52
53#ifdef INSN_SCHEDULING
54
55/* This file contains the implementation of the Swing Modulo Scheduler,
56   described in the following references:
57   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58       Lifetime--sensitive modulo scheduling in a production environment.
59       IEEE Trans. on Comps., 50(3), March 2001
60   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62       PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63
64   The basic structure is:
65   1. Build a data-dependence graph (DDG) for each loop.
66   2. Use the DDG to order the insns of a loop (not in topological order
67      necessarily, but rather) trying to place each insn after all its
68      predecessors _or_ after all its successors.
69   3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70   4. Use the ordering to perform list-scheduling of the loop:
71      1. Set II = MII.  We will try to schedule the loop within II cycles.
72      2. Try to schedule the insns one by one according to the ordering.
73	 For each insn compute an interval of cycles by considering already-
74	 scheduled preds and succs (and associated latencies); try to place
75	 the insn in the cycles of this window checking for potential
76	 resource conflicts (using the DFA interface).
77	 Note: this is different from the cycle-scheduling of schedule_insns;
78	 here the insns are not scheduled monotonically top-down (nor bottom-
79	 up).
80      3. If failed in scheduling all insns - bump II++ and try again, unless
81	 II reaches an upper bound MaxII, in which case report failure.
82   5. If we succeeded in scheduling the loop within II cycles, we now
83      generate prolog and epilog, decrease the counter of the loop, and
84      perform modulo variable expansion for live ranges that span more than
85      II cycles (i.e. use register copies to prevent a def from overwriting
86      itself before reaching the use).
87*/
88
89
90/* This page defines partial-schedule structures and functions for
91   modulo scheduling.  */
92
/* Pointer shorthands for the partial-schedule structures defined below.  */
typedef struct partial_schedule *partial_schedule_ptr;
typedef struct ps_insn *ps_insn_ptr;

/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)

/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
101
/* Perform signed modulo, always returning a non-negative value.
   NOTE: both A and B may be evaluated more than once.  */
#define SMODULO(a,b) ((a) % (b) < 0 ? ((a) % (b) + (b)) : (a) % (b))
104
/* The number of different iterations the nodes in ps span, assuming
   the stage boundaries are placed efficiently.  This is the ceiling of
   (max_cycle - min_cycle + 1) / ii, computed with integer arithmetic.  */
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
			     + 1 + (ps)->ii - 1) / (ps)->ii)
109
/* A single instruction in the partial schedule.  */
struct ps_insn
{
  /* The corresponding DDG_NODE.  */
  ddg_node_ptr node;

  /* The (absolute) cycle in which the PS instruction is scheduled.
     Same as SCHED_TIME (node).  */
  int cycle;

  /* The next/prev PS_INSN in the same row.  */
  ps_insn_ptr next_in_row,
	      prev_in_row;

  /* The number of nodes in the same row that come after this node.
     NOTE(review): presumably used to bound insns per row against the
     issue rate — confirm against the row-insertion code.  */
  int row_rest_count;
};
127
/* Holds the partial schedule as an array of II rows.  Each entry of the
   array points to a linked list of PS_INSNs, which represents the
   instructions that are scheduled for that row.  */
struct partial_schedule
{
  int ii;	/* Number of rows in the partial schedule.  */
  int history;  /* Threshold for conflict checking using DFA.  */

  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
  ps_insn_ptr *rows;

  /* The earliest absolute cycle of an insn in the partial schedule.  */
  int min_cycle;

  /* The latest absolute cycle of an insn in the partial schedule.  */
  int max_cycle;

  ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
};
147
/* We use this to record all the register replacements we do in
   the kernel so we can undo SMS if it is not profitable.  */
struct undo_replace_buff_elem
{
  rtx insn;	/* The insn in which a register was replaced.  */
  rtx orig_reg;	/* The register originally used by INSN.  */
  rtx new_reg;	/* The reg-move copy that replaced ORIG_REG.  */
  struct undo_replace_buff_elem *next;	/* Next record in the list.  */
};
157
158
159
/* Forward declarations of the partial-schedule primitives implemented
   on this page.  */
static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
static void free_partial_schedule (partial_schedule_ptr);
static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
						ddg_node_ptr node, int cycle,
						sbitmap must_precede,
						sbitmap must_follow);
static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172
173
174/* This page defines constants and structures for the modulo scheduling
175   driver.  */
176
177/* As in haifa-sched.c:  */
178/* issue_rate is the number of insns that can be scheduled in the same
179   machine cycle.  It can be defined in the config/mach/mach.h file,
180   otherwise we set it to 1.  */
181
182static int issue_rate;
183
/* Forward declarations for the modulo-scheduling driver.  */
static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
				       int from_stage, int to_stage,
				       int is_prolog);

/* Accessors for the node_sched_params record that hangs off a DDG
   node's aux.info field (X is a ddg_node_ptr); see
   set_node_sched_params.  */
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
#define SCHED_NREG_MOVES(x) \
	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
202
/* The scheduling parameters held for each node.  */
typedef struct node_sched_params
{
  int asap;	/* A lower-bound on the absolute scheduling cycle.  */
  int time;	/* The absolute scheduling cycle (time >= asap).  */

  /* The following field (first_reg_move) is a pointer to the first
     register-move instruction added to handle the modulo-variable-expansion
     of the register defined by this node.  This register-move copies the
     original register defined by the node.  The remaining reg_moves are
     reached from it via PREV_INSN (see generate_reg_moves).  */
  rtx first_reg_move;

  /* The number of register-move instructions added, immediately preceding
     first_reg_move.  */
  int nreg_moves;

  int row;    /* Holds time % ii.  */
  int stage;  /* Holds time / ii.  */

  /* The column of a node inside the ps.  If nodes u, v are on the same row,
     u will precede v if column (u) < column (v).  */
  int column;
} *node_sched_params_ptr;
226
227
228/* The following three functions are copied from the current scheduler
229   code in order to use sched_analyze() for computing the dependencies.
230   They are used when initializing the sched_info structure.  */
231static const char *
232sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233{
234  static char tmp[80];
235
236  sprintf (tmp, "i%4d", INSN_UID (insn));
237  return tmp;
238}
239
/* sched_analyze callback for jump-register dependencies.  SMS schedules
   single-basic-block loop bodies (see the file overview), so there are
   no extra dependencies to record and this hook is deliberately empty.  */
static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
			       regset cond_exec ATTRIBUTE_UNUSED,
			       regset used ATTRIBUTE_UNUSED,
			       regset set ATTRIBUTE_UNUSED)
{
}
247
/* Hooks structure handed to the Haifa scheduler framework.  SMS only
   uses the framework for dependence analysis and insn printing, so
   every callback it does not need is left NULL / zero.  */
static struct sched_info sms_sched_info =
{
  NULL,
  NULL,
  NULL,
  NULL,
  NULL,
  sms_print_insn,
  NULL,
  compute_jump_reg_dependencies,
  NULL, NULL,
  NULL, NULL,
  0, 0, 0,

  NULL, NULL, NULL, NULL, NULL,
#ifdef ENABLE_CHECKING
  NULL,
#endif
  0
};
268
269
/* Return the register decremented and tested in INSN,
   or zero if it is not a decrement-and-branch insn.  */

static rtx
doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
{
#ifdef HAVE_doloop_end
  rtx pattern, reg, condition;

  /* Only the loop-closing branch can be a doloop end.  */
  if (! JUMP_P (insn))
    return NULL_RTX;

  pattern = PATTERN (insn);
  condition = doloop_condition_get (pattern);
  if (! condition)
    return NULL_RTX;

  /* The count register is either the first operand of the condition,
     or the register inside a (plus (reg) ...) first operand.
     NOTE(review): presumably doloop_condition_get only accepts these
     two shapes, making the else-branch truly unreachable — confirm.  */
  if (REG_P (XEXP (condition, 0)))
    reg = XEXP (condition, 0);
  else if (GET_CODE (XEXP (condition, 0)) == PLUS
	   && REG_P (XEXP (XEXP (condition, 0), 0)))
    reg = XEXP (XEXP (condition, 0), 0);
  else
    gcc_unreachable ();

  return reg;
#else
  /* Targets without a doloop_end pattern have no doloops.  */
  return NULL_RTX;
#endif
}
300
301/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
302   that the number of iterations is a compile-time constant.  If so,
303   return the rtx that sets COUNT_REG to a constant, and set COUNT to
304   this constant.  Otherwise return 0.  */
305static rtx
306const_iteration_count (rtx count_reg, basic_block pre_header,
307		       HOST_WIDEST_INT * count)
308{
309  rtx insn;
310  rtx head, tail;
311
312  if (! pre_header)
313    return NULL_RTX;
314
315  get_ebb_head_tail (pre_header, pre_header, &head, &tail);
316
317  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
318    if (INSN_P (insn) && single_set (insn) &&
319	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
320      {
321	rtx pat = single_set (insn);
322
323	if (GET_CODE (SET_SRC (pat)) == CONST_INT)
324	  {
325	    *count = INTVAL (SET_SRC (pat));
326	    return insn;
327	  }
328
329	return NULL_RTX;
330      }
331
332  return NULL_RTX;
333}
334
335/* A very simple resource-based lower bound on the initiation interval.
336   ??? Improve the accuracy of this bound by considering the
337   utilization of various units.  */
338static int
339res_MII (ddg_ptr g)
340{
341  return (g->num_nodes / issue_rate);
342}
343
344
345/* Points to the array that contains the sched data for each node.  */
346static node_sched_params_ptr node_sched_params;
347
348/* Allocate sched_params for each node and initialize it.  Assumes that
349   the aux field of each node contain the asap bound (computed earlier),
350   and copies it into the sched_params field.  */
351static void
352set_node_sched_params (ddg_ptr g)
353{
354  int i;
355
356  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
357  /* Initialize ASAP/ALAP/HIGHT to zero.  */
358  node_sched_params = (node_sched_params_ptr)
359		       xcalloc (g->num_nodes,
360				sizeof (struct node_sched_params));
361
362  /* Set the pointer of the general data of the node to point to the
363     appropriate sched_params structure.  */
364  for (i = 0; i < g->num_nodes; i++)
365    {
366      /* Watch out for aliasing problems?  */
367      node_sched_params[i].asap = g->nodes[i].aux.count;
368      g->nodes[i].aux.info = &node_sched_params[i];
369    }
370}
371
372static void
373print_node_sched_params (FILE * file, int num_nodes)
374{
375  int i;
376
377  if (! file)
378    return;
379  for (i = 0; i < num_nodes; i++)
380    {
381      node_sched_params_ptr nsp = &node_sched_params[i];
382      rtx reg_move = nsp->first_reg_move;
383      int j;
384
385      fprintf (file, "Node %d:\n", i);
386      fprintf (file, " asap = %d:\n", nsp->asap);
387      fprintf (file, " time = %d:\n", nsp->time);
388      fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
389      for (j = 0; j < nsp->nreg_moves; j++)
390	{
391	  fprintf (file, " reg_move = ");
392	  print_rtl_single (file, reg_move);
393	  reg_move = PREV_INSN (reg_move);
394	}
395    }
396}
397
398/* Calculate an upper bound for II.  SMS should not schedule the loop if it
399   requires more cycles than this bound.  Currently set to the sum of the
400   longest latency edge for each node.  Reset based on experiments.  */
401static int
402calculate_maxii (ddg_ptr g)
403{
404  int i;
405  int maxii = 0;
406
407  for (i = 0; i < g->num_nodes; i++)
408    {
409      ddg_node_ptr u = &g->nodes[i];
410      ddg_edge_ptr e;
411      int max_edge_latency = 0;
412
413      for (e = u->out; e; e = e->next_out)
414	max_edge_latency = MAX (max_edge_latency, e->latency);
415
416      maxii += max_edge_latency;
417    }
418  return maxii;
419}
420
421/*
422   Breaking intra-loop register anti-dependences:
423   Each intra-loop register anti-dependence implies a cross-iteration true
424   dependence of distance 1. Therefore, we can remove such false dependencies
425   and figure out if the partial schedule broke them by checking if (for a
426   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
427   if so generate a register move.   The number of such moves is equal to:
428              SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
429   nreg_moves = ----------------------------------- + 1 - {   dependence.
430                            ii                          { 1 if not.
431*/
432static struct undo_replace_buff_elem *
433generate_reg_moves (partial_schedule_ptr ps)
434{
435  ddg_ptr g = ps->g;
436  int ii = ps->ii;
437  int i;
438  struct undo_replace_buff_elem *reg_move_replaces = NULL;
439
440  for (i = 0; i < g->num_nodes; i++)
441    {
442      ddg_node_ptr u = &g->nodes[i];
443      ddg_edge_ptr e;
444      int nreg_moves = 0, i_reg_move;
445      sbitmap *uses_of_defs;
446      rtx last_reg_move;
447      rtx prev_reg, old_reg;
448
449      /* Compute the number of reg_moves needed for u, by looking at life
450	 ranges started at u (excluding self-loops).  */
451      for (e = u->out; e; e = e->next_out)
452	if (e->type == TRUE_DEP && e->dest != e->src)
453	  {
454	    int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
455
456            if (e->distance == 1)
457              nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
458
459	    /* If dest precedes src in the schedule of the kernel, then dest
460	       will read before src writes and we can save one reg_copy.  */
461	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
462		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
463	      nreg_moves4e--;
464
465	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
466	  }
467
468      if (nreg_moves == 0)
469	continue;
470
471      /* Every use of the register defined by node may require a different
472	 copy of this register, depending on the time the use is scheduled.
473	 Set a bitmap vector, telling which nodes use each copy of this
474	 register.  */
475      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
476      sbitmap_vector_zero (uses_of_defs, nreg_moves);
477      for (e = u->out; e; e = e->next_out)
478	if (e->type == TRUE_DEP && e->dest != e->src)
479	  {
480	    int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
481
482	    if (e->distance == 1)
483	      dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
484
485	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
486		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
487	      dest_copy--;
488
489	    if (dest_copy)
490	      SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
491	  }
492
493      /* Now generate the reg_moves, attaching relevant uses to them.  */
494      SCHED_NREG_MOVES (u) = nreg_moves;
495      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
496      last_reg_move = u->insn;
497
498      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
499	{
500	  unsigned int i_use = 0;
501	  rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
502	  rtx reg_move = gen_move_insn (new_reg, prev_reg);
503	  sbitmap_iterator sbi;
504
505	  add_insn_before (reg_move, last_reg_move);
506	  last_reg_move = reg_move;
507
508	  if (!SCHED_FIRST_REG_MOVE (u))
509	    SCHED_FIRST_REG_MOVE (u) = reg_move;
510
511	  EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
512	    {
513	      struct undo_replace_buff_elem *rep;
514
515	      rep = (struct undo_replace_buff_elem *)
516		    xcalloc (1, sizeof (struct undo_replace_buff_elem));
517	      rep->insn = g->nodes[i_use].insn;
518	      rep->orig_reg = old_reg;
519	      rep->new_reg = new_reg;
520
521	      if (! reg_move_replaces)
522		reg_move_replaces = rep;
523	      else
524		{
525		  rep->next = reg_move_replaces;
526		  reg_move_replaces = rep;
527		}
528
529	      replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
530	    }
531
532	  prev_reg = new_reg;
533	}
534      sbitmap_vector_free (uses_of_defs);
535    }
536  return reg_move_replaces;
537}
538
/* We call this when we want to undo the SMS schedule for a given loop.
   One of the things that we do is to delete the register moves generated
   for the sake of SMS; this function deletes the register move instructions
   recorded in the undo buffer.  */
static void
undo_generate_reg_moves (partial_schedule_ptr ps,
			 struct undo_replace_buff_elem *reg_move_replaces)
{
  int i,j;

  /* Delete every node's reg_moves; they are chained backwards from
     SCHED_FIRST_REG_MOVE via PREV_INSN (see generate_reg_moves).  */
  for (i = 0; i < ps->g->num_nodes; i++)
    {
      ddg_node_ptr u = &ps->g->nodes[i];
      rtx prev;
      rtx crr = SCHED_FIRST_REG_MOVE (u);

      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
	{
	  /* Record the predecessor before CRR is deleted.  */
	  prev = PREV_INSN (crr);
	  delete_insn (crr);
	  crr = prev;
	}
      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
    }

  /* Restore the original register in every use that had been redirected
     to a reg_move copy.  */
  while (reg_move_replaces)
    {
      struct undo_replace_buff_elem *rep = reg_move_replaces;

      reg_move_replaces = reg_move_replaces->next;
      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
    }
}
572
573/* Free memory allocated for the undo buffer.  */
574static void
575free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
576{
577
578  while (reg_move_replaces)
579    {
580      struct undo_replace_buff_elem *rep = reg_move_replaces;
581
582      reg_move_replaces = reg_move_replaces->next;
583      free (rep);
584    }
585}
586
/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
   of SCHED_ROW and SCHED_STAGE.  */
static void
normalize_sched_times (partial_schedule_ptr ps)
{
  int i;
  ddg_ptr g = ps->g;
  /* AMOUNT is the earliest cycle used; subtracting it shifts the whole
     schedule so the first insn starts at cycle 0.  */
  int amount = PS_MIN_CYCLE (ps);
  int ii = ps->ii;

  /* Don't include the closing branch assuming that it is the last node.  */
  for (i = 0; i < g->num_nodes - 1; i++)
    {
      ddg_node_ptr u = &g->nodes[i];
      int normalized_time = SCHED_TIME (u) - amount;

      gcc_assert (normalized_time >= 0);

      SCHED_TIME (u) = normalized_time;
      SCHED_ROW (u) = normalized_time % ii;
      SCHED_STAGE (u) = normalized_time / ii;
    }
}
610
611/* Set SCHED_COLUMN of each node according to its position in PS.  */
612static void
613set_columns_for_ps (partial_schedule_ptr ps)
614{
615  int row;
616
617  for (row = 0; row < ps->ii; row++)
618    {
619      ps_insn_ptr cur_insn = ps->rows[row];
620      int column = 0;
621
622      for (; cur_insn; cur_insn = cur_insn->next_in_row)
623	SCHED_COLUMN (cur_insn->node) = column++;
624    }
625}
626
/* Permute the insns according to their order in PS, from row 0 to
   row ii-1, and position them right before LAST.  This schedules
   the insns of the loop kernel.  */
static void
permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int ii = ps->ii;
  int row;
  ps_insn_ptr ps_ij;

  /* Move each insn (together with its preceding notes) to just before
     LAST, unless it is already there.  */
  for (row = 0; row < ii ; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      if (PREV_INSN (last) != ps_ij->node->insn)
      	reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
			    PREV_INSN (last));
}
643
/* As part of undoing SMS we return to the original ordering of the
   instructions inside the loop kernel.  Given the partial schedule PS, this
   function returns the ordering of the instruction according to their CUID
   in the DDG (PS->G), which is the original order of the instruction before
   performing SMS.  */
static void
undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
{
  int i;

  /* Nodes are indexed by CUID, i.e. in their original order.  Moving
     each one in turn to just before LAST restores that order; stop once
     LAST itself (or its note) is reached.  */
  for (i = 0 ; i < ps->g->num_nodes; i++)
    if (last == ps->g->nodes[i].insn
	|| last == ps->g->nodes[i].first_note)
      break;
    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
			  PREV_INSN (last));
}
662
/* Used to generate the prologue & epilogue.  Duplicate the subset of
   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
   of both), together with a prefix/suffix of their reg_moves.
   FOR_PROLOG is nonzero when generating the prolog (it is declared as
   IS_PROLOG in the forward declaration).  Emits into the sequence
   opened by the caller.  */
static void
duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
			   int to_stage, int for_prolog)
{
  int row;
  ps_insn_ptr ps_ij;

  for (row = 0; row < ps->ii; row++)
    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
      {
	ddg_node_ptr u_node = ps_ij->node;
	int j, i_reg_moves;
	rtx reg_move = NULL_RTX;

	if (for_prolog)
	  {
	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
	       number of reg_moves starting with the second occurrence of
	       u_node, which is generated if its SCHED_STAGE <= to_stage.  */
	    i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *first* reg_move backwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < i_reg_moves; j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }
	else /* It's for the epilog.  */
	  {
	    /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
	       starting to decrease one stage after u_node no longer occurs;
	       that is, generate all reg_moves until
	       SCHED_STAGE (u_node) == from_stage - 1.  */
	    i_reg_moves = SCHED_NREG_MOVES (u_node)
	    	       - (from_stage - SCHED_STAGE (u_node) - 1);
	    i_reg_moves = MAX (i_reg_moves, 0);
	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));

	    /* The reg_moves start from the *last* reg_move forwards.  */
	    if (i_reg_moves)
	      {
		reg_move = SCHED_FIRST_REG_MOVE (u_node);
		for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
		  reg_move = PREV_INSN (reg_move);
	      }
	  }

	/* Emit copies of the selected reg_moves, then the node itself if
	   its stage falls within [FROM_STAGE, TO_STAGE].  */
	for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
	  emit_insn (copy_rtx (PATTERN (reg_move)));
	if (SCHED_STAGE (u_node) >= from_stage
	    && SCHED_STAGE (u_node) <= to_stage)
	  duplicate_insn_chain (u_node->first_note, u_node->insn);
      }
}
724
725
/* Generate the instructions (including reg_moves) for prolog & epilog.  */
static void
generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
{
  int i;
  int last_stage = PS_STAGE_COUNT (ps) - 1;
  edge e;

  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
  start_sequence ();

  if (count_reg)
   /* Generate a subtract instruction at the beginning of the prolog to
      adjust the loop count by STAGE_COUNT.  */
   emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));

  /* Stage I of the prolog duplicates stages 0..I of the kernel.  */
  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, 0, i, 1);

  /* Put the prolog on the one and only entry edge.  */
  e = loop_preheader_edge (loop);
  loop_split_edge_with(e , get_insns());

  end_sequence ();

  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
  start_sequence ();

  for (i = 0; i < last_stage; i++)
    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);

  /* Put the epilog on the one and only exit edge.  */
  gcc_assert (loop->single_exit);
  e = loop->single_exit;
  loop_split_edge_with(e , get_insns());
  end_sequence ();
}
763
764/* Return the line note insn preceding INSN, for debugging.  Taken from
765   emit-rtl.c.  */
766static rtx
767find_line_note (rtx insn)
768{
769  for (; insn; insn = PREV_INSN (insn))
770    if (NOTE_P (insn)
771	&& NOTE_LINE_NUMBER (insn) >= 0)
772      break;
773
774  return insn;
775}
776
777/* Return true if all the BBs of the loop are empty except the
778   loop header.  */
779static bool
780loop_single_full_bb_p (struct loop *loop)
781{
782  unsigned i;
783  basic_block *bbs = get_loop_body (loop);
784
785  for (i = 0; i < loop->num_nodes ; i++)
786    {
787      rtx head, tail;
788      bool empty_bb = true;
789
790      if (bbs[i] == loop->header)
791        continue;
792
793      /* Make sure that basic blocks other than the header
794         have only notes labels or jumps.  */
795      get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
796      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
797        {
798          if (NOTE_P (head) || LABEL_P (head)
799 	      || (INSN_P (head) && JUMP_P (head)))
800 	    continue;
801 	  empty_bb = false;
802 	  break;
803        }
804
805      if (! empty_bb)
806        {
807          free (bbs);
808          return false;
809        }
810    }
811  free (bbs);
812  return true;
813}
814
/* A simple loop from SMS point of view; it is a loop that is composed of
   either a single basic block or two BBs - a header and a latch, where
   the latch is a simple forwarder (one predecessor, one successor).  */
#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
				  && (EDGE_COUNT (loop->latch->preds) == 1) \
                                  && (EDGE_COUNT (loop->latch->succs) == 1))
820
821/* Return true if the loop is in its canonical form and false if not.
822   i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
823static bool
824loop_canon_p (struct loop *loop)
825{
826
827  if (loop->inner || ! loop->outer)
828    return false;
829
830  if (!loop->single_exit)
831    {
832      if (dump_file)
833	{
834	  rtx line_note = find_line_note (BB_END (loop->header));
835
836	  fprintf (dump_file, "SMS loop many exits ");
837	  if (line_note)
838	    {
839	      expanded_location xloc;
840	      NOTE_EXPANDED_LOCATION (xloc, line_note);
841	      fprintf (dump_file, " %s %d (file, line)\n",
842		       xloc.file, xloc.line);
843	    }
844	}
845      return false;
846    }
847
848  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
849    {
850      if (dump_file)
851	{
852	  rtx line_note = find_line_note (BB_END (loop->header));
853
854	  fprintf (dump_file, "SMS loop many BBs. ");
855	  if (line_note)
856	    {
857	      expanded_location xloc;
858  	      NOTE_EXPANDED_LOCATION (xloc, line_note);
859	      fprintf (dump_file, " %s %d (file, line)\n",
860		       xloc.file, xloc.line);
861	    }
862	}
863      return false;
864    }
865
866    return true;
867}
868
/* Canonicalize LOOP and the surrounding CFG for SMS:
   - split any fallthrough edge into the exit block whose source has
     multiple successors, to avoid annoying special cases later;
   - if the latch is the header itself, or the latch has multiple
     successors, split the latch->header edge so the latch becomes a
     dedicated forwarder block.  */
static void
canon_loop (struct loop *loop)
{
  edge e;
  edge_iterator i;

  /* Avoid annoying special cases of edges going to exit
     block.  */
  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
      loop_split_edge_with (e, NULL_RTX);

  if (loop->latch == loop->header
      || EDGE_COUNT (loop->latch->succs) > 1)
    {
      /* Find the back edge from the latch into the header and split it.  */
      FOR_EACH_EDGE (e, i, loop->header->preds)
        if (e->src == loop->latch)
          break;
      loop_split_edge_with (e, NULL_RTX);
    }
}
893
894/* Main entry point, perform SMS scheduling on the loops of the function
895   that consist of single basic blocks.  */
896static void
897sms_schedule (void)
898{
899  static int passes = 0;
900  rtx insn;
901  ddg_ptr *g_arr, g;
902  int * node_order;
903  int maxii;
904  unsigned i,num_loops;
905  partial_schedule_ptr ps;
906  struct df *df;
907  struct loops *loops;
908  basic_block bb = NULL;
909  /* vars to the versioning only if needed*/
910  struct loop * nloop;
911  basic_block condition_bb = NULL;
912  edge latch_edge;
913  gcov_type trip_count = 0;
914
915  loops = loop_optimizer_init (LOOPS_HAVE_PREHEADERS
916			       | LOOPS_HAVE_MARKED_SINGLE_EXITS);
917  if (!loops)
918    return;  /* There is no loops to schedule.  */
919
920  /* Initialize issue_rate.  */
921  if (targetm.sched.issue_rate)
922    {
923      int temp = reload_completed;
924
925      reload_completed = 1;
926      issue_rate = targetm.sched.issue_rate ();
927      reload_completed = temp;
928    }
929  else
930    issue_rate = 1;
931
932  /* Initialize the scheduler.  */
933  current_sched_info = &sms_sched_info;
934  sched_init ();
935
936  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
937  df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
938  df_rd_add_problem (df, 0);
939  df_ru_add_problem (df, 0);
940  df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
941  df_analyze (df);
942
943  if (dump_file)
944    df_dump (df, dump_file);
945
946  /* Allocate memory to hold the DDG array one entry for each loop.
947     We use loop->num as index into this array.  */
948  g_arr = XCNEWVEC (ddg_ptr, loops->num);
949
950
951  /* Build DDGs for all the relevant loops and hold them in G_ARR
952     indexed by the loop index.  */
953  for (i = 0; i < loops->num; i++)
954    {
955      rtx head, tail;
956      rtx count_reg;
957      struct loop *loop = loops->parray[i];
958
959      /* For debugging.  */
960      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
961        {
962          if (dump_file)
963            fprintf (dump_file, "SMS reached MAX_PASSES... \n");
964
965          break;
966        }
967
968      if (! loop_canon_p (loop))
969        continue;
970
971      if (! loop_single_full_bb_p (loop))
972	continue;
973
974      bb = loop->header;
975
976      get_ebb_head_tail (bb, bb, &head, &tail);
977      latch_edge = loop_latch_edge (loop);
978      gcc_assert (loop->single_exit);
979      if (loop->single_exit->count)
980	trip_count = latch_edge->count / loop->single_exit->count;
981
      /* Perform SMS only on loops whose average count is above the threshold.  */
983
984      if ( latch_edge->count
985          && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
986	{
987	  if (dump_file)
988	    {
989	      rtx line_note = find_line_note (tail);
990
991	      if (line_note)
992		{
993		  expanded_location xloc;
994		  NOTE_EXPANDED_LOCATION (xloc, line_note);
995		  fprintf (dump_file, "SMS bb %s %d (file, line)\n",
996			   xloc.file, xloc.line);
997		}
998	      fprintf (dump_file, "SMS single-bb-loop\n");
999	      if (profile_info && flag_branch_probabilities)
1000	    	{
1001	      	  fprintf (dump_file, "SMS loop-count ");
1002	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1003	             	   (HOST_WIDEST_INT) bb->count);
1004	      	  fprintf (dump_file, "\n");
1005                  fprintf (dump_file, "SMS trip-count ");
1006                  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1007                           (HOST_WIDEST_INT) trip_count);
1008                  fprintf (dump_file, "\n");
1009	      	  fprintf (dump_file, "SMS profile-sum-max ");
1010	      	  fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1011	          	   (HOST_WIDEST_INT) profile_info->sum_max);
1012	      	  fprintf (dump_file, "\n");
1013	    	}
1014	    }
1015          continue;
1016        }
1017
1018      /* Make sure this is a doloop.  */
1019      if ( !(count_reg = doloop_register_get (tail)))
1020	continue;
1021
1022      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
1023      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1024	if (CALL_P (insn)
1025	    || BARRIER_P (insn)
1026	    || (INSN_P (insn) && !JUMP_P (insn)
1027		&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1028	  break;
1029
1030      if (insn != NEXT_INSN (tail))
1031	{
1032	  if (dump_file)
1033	    {
1034	      if (CALL_P (insn))
1035		fprintf (dump_file, "SMS loop-with-call\n");
1036	      else if (BARRIER_P (insn))
1037		fprintf (dump_file, "SMS loop-with-barrier\n");
1038	      else
1039		fprintf (dump_file, "SMS loop-with-not-single-set\n");
1040	      print_rtl_single (dump_file, insn);
1041	    }
1042
1043	  continue;
1044	}
1045
1046      if (! (g = create_ddg (bb, df, 0)))
1047        {
1048          if (dump_file)
1049	    fprintf (dump_file, "SMS doloop\n");
1050	  continue;
1051        }
1052
1053      g_arr[i] = g;
1054    }
1055
1056  /* Release Data Flow analysis data structures.  */
1057  df_finish (df);
1058  df = NULL;
1059
1060  /* We don't want to perform SMS on new loops - created by versioning.  */
1061  num_loops = loops->num;
  /* Go over the built DDGs and perform SMS for each one of them.  */
1063  for (i = 0; i < num_loops; i++)
1064    {
1065      rtx head, tail;
1066      rtx count_reg, count_init;
1067      int mii, rec_mii;
1068      unsigned stage_count = 0;
1069      HOST_WIDEST_INT loop_count = 0;
1070      struct loop *loop = loops->parray[i];
1071
1072      if (! (g = g_arr[i]))
1073        continue;
1074
1075      if (dump_file)
1076	print_ddg (dump_file, g);
1077
1078      get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1079
1080      latch_edge = loop_latch_edge (loop);
1081      gcc_assert (loop->single_exit);
1082      if (loop->single_exit->count)
1083	trip_count = latch_edge->count / loop->single_exit->count;
1084
1085      if (dump_file)
1086	{
1087	  rtx line_note = find_line_note (tail);
1088
1089	  if (line_note)
1090	    {
1091	      expanded_location xloc;
1092	      NOTE_EXPANDED_LOCATION (xloc, line_note);
1093	      fprintf (dump_file, "SMS bb %s %d (file, line)\n",
1094		       xloc.file, xloc.line);
1095	    }
1096	  fprintf (dump_file, "SMS single-bb-loop\n");
1097	  if (profile_info && flag_branch_probabilities)
1098	    {
1099	      fprintf (dump_file, "SMS loop-count ");
1100	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1101	               (HOST_WIDEST_INT) bb->count);
1102	      fprintf (dump_file, "\n");
1103	      fprintf (dump_file, "SMS profile-sum-max ");
1104	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1105	               (HOST_WIDEST_INT) profile_info->sum_max);
1106	      fprintf (dump_file, "\n");
1107	    }
1108	  fprintf (dump_file, "SMS doloop\n");
1109	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1110          fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1111          fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1112	}
1113
1114
      /* If the loop has a doloop register, it gets special
	 handling.  */
1117      count_init = NULL_RTX;
1118      if ((count_reg = doloop_register_get (tail)))
1119	{
1120	  basic_block pre_header;
1121
1122	  pre_header = loop_preheader_edge (loop)->src;
1123	  count_init = const_iteration_count (count_reg, pre_header,
1124					      &loop_count);
1125	}
1126      gcc_assert (count_reg);
1127
1128      if (dump_file && count_init)
1129        {
1130          fprintf (dump_file, "SMS const-doloop ");
1131          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1132		     loop_count);
1133          fprintf (dump_file, "\n");
1134        }
1135
1136      node_order = XNEWVEC (int, g->num_nodes);
1137
1138      mii = 1; /* Need to pass some estimate of mii.  */
1139      rec_mii = sms_order_nodes (g, mii, node_order);
1140      mii = MAX (res_MII (g), rec_mii);
1141      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1142
1143      if (dump_file)
1144	fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1145		 rec_mii, mii, maxii);
1146
1147      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1148	 ASAP.  */
1149      set_node_sched_params (g);
1150
1151      ps = sms_schedule_by_order (g, mii, maxii, node_order);
1152
1153      if (ps)
1154	stage_count = PS_STAGE_COUNT (ps);
1155
1156      /* Stage count of 1 means that there is no interleaving between
1157         iterations, let the scheduling passes do the job.  */
1158      if (stage_count < 1
1159	  || (count_init && (loop_count <= stage_count))
1160	  || (flag_branch_probabilities && (trip_count <= stage_count)))
1161	{
1162	  if (dump_file)
1163	    {
1164	      fprintf (dump_file, "SMS failed... \n");
1165	      fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1166	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1167	      fprintf (dump_file, ", trip-count=");
1168	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1169	      fprintf (dump_file, ")\n");
1170	    }
1171	  continue;
1172	}
1173      else
1174	{
1175	  int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1176	  int new_cycles;
1177	  struct undo_replace_buff_elem *reg_move_replaces;
1178
1179	  if (dump_file)
1180	    {
1181	      fprintf (dump_file,
1182		       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1183		       stage_count);
1184	      print_partial_schedule (ps, dump_file);
1185	      fprintf (dump_file,
1186		       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1187		       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1188	    }
1189
1190	  /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
1191	     the closing_branch was scheduled and should appear in the last (ii-1)
1192	     row.  Otherwise, we are free to schedule the branch, and we let nodes
1193	     that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1194	     row; this should reduce stage_count to minimum.  */
1195	  normalize_sched_times (ps);
1196	  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1197	  set_columns_for_ps (ps);
1198
1199	  /* Generate the kernel just to be able to measure its cycles.  */
1200	  permute_partial_schedule (ps, g->closing_branch->first_note);
1201	  reg_move_replaces = generate_reg_moves (ps);
1202
	  /* Get the number of cycles the new kernel expects to execute in.  */
1204	  new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1205
1206	  /* Get back to the original loop so we can do loop versioning.  */
1207	  undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1208	  if (reg_move_replaces)
1209	    undo_generate_reg_moves (ps, reg_move_replaces);
1210
1211	  if ( new_cycles >= orig_cycles)
1212	    {
1213	      /* SMS is not profitable so undo the permutation and reg move generation
1214	         and return the kernel to its original state.  */
1215	      if (dump_file)
1216		fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1217
1218	    }
1219	  else
1220	    {
1221	      canon_loop (loop);
1222
              /* If the BCT count is not known, do loop versioning.  */
1224	      if (count_reg && ! count_init)
1225		{
1226		  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1227						 GEN_INT(stage_count));
1228
1229		  nloop = loop_version (loops, loop, comp_rtx, &condition_bb,
1230					true);
1231		}
1232
1233	      /* Set new iteration count of loop kernel.  */
1234              if (count_reg && count_init)
1235		SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1236							     - stage_count + 1);
1237
1238	      /* Now apply the scheduled kernel to the RTL of the loop.  */
1239	      permute_partial_schedule (ps, g->closing_branch->first_note);
1240
              /* Mark this loop as software pipelined so the later
	      scheduling passes don't touch it.  */
1243	      if (! flag_resched_modulo_sched)
1244		g->bb->flags |= BB_DISABLE_SCHEDULE;
1245	      /* The life-info is not valid any more.  */
1246	      g->bb->flags |= BB_DIRTY;
1247
1248	      reg_move_replaces = generate_reg_moves (ps);
1249	      if (dump_file)
1250		print_node_sched_params (dump_file, g->num_nodes);
1251	      /* Generate prolog and epilog.  */
1252	      if (count_reg && !count_init)
1253		generate_prolog_epilog (ps, loop, count_reg);
1254	      else
1255	 	generate_prolog_epilog (ps, loop, NULL_RTX);
1256	    }
1257	  free_undo_replace_buff (reg_move_replaces);
1258	}
1259
1260      free_partial_schedule (ps);
1261      free (node_sched_params);
1262      free (node_order);
1263      free_ddg (g);
1264    }
1265
1266  free (g_arr);
1267
1268  /* Release scheduler data, needed until now because of DFA.  */
1269  sched_finish ();
1270  loop_optimizer_finalize (loops);
1271}
1272
1273/* The SMS scheduling algorithm itself
1274   -----------------------------------
1275   Input: 'O' an ordered list of insns of a loop.
1276   Output: A scheduling of the loop - kernel, prolog, and epilogue.
1277
1278   'Q' is the empty Set
1279   'PS' is the partial schedule; it holds the currently scheduled nodes with
1280	their cycle/slot.
1281   'PSP' previously scheduled predecessors.
1282   'PSS' previously scheduled successors.
1283   't(u)' the cycle where u is scheduled.
1284   'l(u)' is the latency of u.
1285   'd(v,u)' is the dependence distance from v to u.
1286   'ASAP(u)' the earliest time at which u could be scheduled as computed in
1287	     the node ordering phase.
1288   'check_hardware_resources_conflicts(u, PS, c)'
1289			     run a trace around cycle/slot through DFA model
1290			     to check resource conflicts involving instruction u
1291			     at cycle c given the partial schedule PS.
1292   'add_to_partial_schedule_at_time(u, PS, c)'
1293			     Add the node/instruction u to the partial schedule
1294			     PS at time c.
1295   'calculate_register_pressure(PS)'
1296			     Given a schedule of instructions, calculate the register
1297			     pressure it implies.  One implementation could be the
1298			     maximum number of overlapping live ranges.
1299   'maxRP' The maximum allowed register pressure, it is usually derived from the number
1300	   registers available in the hardware.
1301
1302   1. II = MII.
1303   2. PS = empty list
1304   3. for each node u in O in pre-computed order
1305   4.   if (PSP(u) != Q && PSS(u) == Q) then
   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1307   6.     start = Early_start; end = Early_start + II - 1; step = 1
1308   11.  else if (PSP(u) == Q && PSS(u) != Q) then
   12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1310   13.     start = Late_start; end = Late_start - II + 1; step = -1
1311   14.  else if (PSP(u) != Q && PSS(u) != Q) then
   15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
   16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1314   17.     start = Early_start;
1315   18.     end = min(Early_start + II - 1 , Late_start);
1316   19.     step = 1
1317   20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1318   21.	  start = ASAP(u); end = start + II - 1; step = 1
1319   22.  endif
1320
1321   23.  success = false
1322   24.  for (c = start ; c != end ; c += step)
1323   25.     if check_hardware_resources_conflicts(u, PS, c) then
1324   26.       add_to_partial_schedule_at_time(u, PS, c)
1325   27.       success = true
1326   28.       break
1327   29.     endif
1328   30.  endfor
1329   31.  if (success == false) then
1330   32.    II = II + 1
1331   33.    if (II > maxII) then
1332   34.       finish - failed to schedule
1333   35.	 endif
1334   36.    goto 2.
1335   37.  endif
1336   38. endfor
1337   39. if (calculate_register_pressure(PS) > maxRP) then
1338   40.    goto 32.
1339   41. endif
1340   42. compute epilogue & prologue
1341   43. finish - succeeded to schedule
1342*/
1343
/* A limit on the number of cycles that resource conflicts can span.  ??? Should
   be provided by DFA, and be dependent on the type of insn scheduled.  Currently
   set to 0 to save compile time.  */
#define DFA_HISTORY SMS_DFA_HISTORY

/* Given the partial schedule PS, this function calculates and returns the
   cycles in which we can schedule the node with the given index I.
   NOTE: Here we do the backtracking in SMS, in some special cases.  We have
   noticed that there are several cases in which we fail to SMS the loop
   because the sched window of a node is empty due to tight data-deps.  In
   such cases we want to unschedule some of the predecessors/successors
   until we get a non-empty scheduling window.  It returns -1 if the
   scheduling window is empty and zero otherwise.

   The window is returned through *START_P, *END_P (exclusive bound) and
   *STEP_P (+1 for a forward scan, -1 for a backward scan), following the
   four cases of the algorithm described above the code (lines 4-22).  */

static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
		  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  /* PSP/PSS: the already-scheduled predecessors/successors of U.  */
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      /* Only predecessors are scheduled: scan forward from the earliest
	 cycle permitted by every scheduled incoming dependence.  */
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      int node_st = SCHED_TIME (v_node)
	      		    + e->latency - (e->distance * ii);

	      early_start = MAX (early_start, node_st);

	      /* A memory dep caps the window: stay within one II of the
		 scheduled source to avoid inter-iteration conflicts.  */
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      /* Only successors are scheduled: scan backward from the latest
	 cycle permitted by every scheduled outgoing dependence.  */
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MAX (end, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      /* Both sides have scheduled nodes: scan forward between the
	 earliest start (from predecessors) and the latest start (from
	 successors).  */
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      early_start = MAX (early_start,
				 SCHED_TIME (v_node) + e->latency
				 - (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		start = MAX (start, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      /* No scheduled neighbors constrain U: start at its precomputed
	 ASAP and allow a full II of candidate cycles.  */
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* An empty window (start meets or passes end in the scan direction)
     signals the caller to backtrack.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}
1480
/* This function implements the scheduling algorithm for SMS according to the
   above algorithm.  G is the data dependence graph of the loop, MII the
   lower bound on the initiation interval, MAXII the upper bound to try,
   and NODES_ORDER the node ordering computed by sms_order_nodes.  Returns
   the partial schedule on success, or NULL if no schedule was found with
   II < MAXII.  The caller owns (and must free) the returned schedule.  */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  /* Outer loop: retry the whole ordering with larger II values until
     everything that needs scheduling is scheduled or II reaches MAXII.  */
  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
	 || try_again_with_larger_ii ) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
	fprintf(dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
	{
	  /* II was bumped: discard the previous partial schedule.  */
	  try_again_with_larger_ii = false;
	  sbitmap_zero (sched_nodes);
	}

      for (i = 0; i < num_nodes; i++)
	{
	  int u = nodes_order[i];
  	  ddg_node_ptr u_node = &ps->g->nodes[u];
	  rtx insn = u_node->insn;

	  /* Notes and the like need no cycle assignment.  */
	  if (!INSN_P (insn))
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (JUMP_P (insn)) /* Closing branch handled later.  */
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (TEST_BIT (sched_nodes, u))
	    continue;

	  /* Try to get non-empty scheduling window.  While the window is
	     empty, unschedule previously placed neighbors of U (walking
	     the ordering backwards from position I) to open it up.  */
	  j = i;
	  while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
		 && j > 0)
	    {
	      unscheduled_nodes = true;
	      if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
		  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
		{
		  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
		  RESET_BIT (sched_nodes, nodes_order [j - 1]);
		}
	      j--;
	    }
	  if (j < 0)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* 2. Try scheduling u in window.  */
	  if (dump_file)
	    fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
		    u, start, end, step);

          /* use must_follow & must_precede bitmaps to determine order
	     of nodes within the cycle.  A scheduled neighbor whose
	     dependence leaves zero slack at the window boundary
	     constrains U's slot within its row.  */
          sbitmap_zero (must_precede);
          sbitmap_zero (must_follow);
      	  for (e = u_node->in; e != 0; e = e->next_in)
            if (TEST_BIT (sched_nodes, e->src->cuid)
	        && e->latency == (ii * e->distance)
		&& start == SCHED_TIME (e->src))
             SET_BIT (must_precede, e->src->cuid);

	  for (e = u_node->out; e != 0; e = e->next_out)
            if (TEST_BIT (sched_nodes, e->dest->cuid)
	        && e->latency == (ii * e->distance)
		&& end == SCHED_TIME (e->dest))
             SET_BIT (must_follow, e->dest->cuid);

	  /* Scan the window cycle by cycle for a resource-conflict-free
	     slot.  */
	  success = 0;
	  if ((step > 0 && start < end) ||  (step < 0 && start > end))
	    for (c = start; c != end; c += step)
	      {
		ps_insn_ptr psi;

		psi = ps_add_node_check_conflicts (ps, u_node, c,
						   must_precede,
						   must_follow);

  		if (psi)
		  {
		    SCHED_TIME (u_node) = c;
		    SET_BIT (sched_nodes, u);
		    success = 1;
		    if (dump_file)
		      fprintf(dump_file, "Schedule in %d\n", c);
		    break;
		  }
	      }
	  if (!success)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* If nodes were unscheduled above, restart the inner pass so
	     they get rescheduled (the outer while re-enters with the
	     same II).  */
	  if (unscheduled_nodes)
	    break;

	  /* ??? If (success), check register pressure estimates.  */
	} /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  /* Reaching MAXII means scheduling failed; return NULL.  */
  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}
1627
1628
/* This page implements the algorithm for ordering the nodes of a DDG
   for modulo scheduling, activated through the
   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */

/* Per-node ordering parameters, reached through each node's aux.info
   pointer (set up by calculate_order_params).  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: the scheduling slack between ALAP and ASAP.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa  calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which the worklist of order_nodes_in_scc is traversed.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

/* ASAP/ALAP/height of a node in the dependence DAG obtained by ignoring
   inter-iteration (distance > 0) edges; see calculate_order_params.  */
struct node_order_params
{
  int asap;	/* Earliest start, from latencies of incoming edges.  */
  int alap;	/* Latest start, from latencies of outgoing edges.  */
  int height;	/* Longest latency path to the end of the DAG.  */
};
1657
1658/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
1659static void
1660check_nodes_order (int *node_order, int num_nodes)
1661{
1662  int i;
1663  sbitmap tmp = sbitmap_alloc (num_nodes);
1664
1665  sbitmap_zero (tmp);
1666
1667  for (i = 0; i < num_nodes; i++)
1668    {
1669      int u = node_order[i];
1670
1671      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1672
1673      SET_BIT (tmp, u);
1674    }
1675
1676  sbitmap_free (tmp);
1677}
1678
1679/* Order the nodes of G for scheduling and pass the result in
1680   NODE_ORDER.  Also set aux.count of each node to ASAP.
1681   Return the recMII for the given DDG.  */
1682static int
1683sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1684{
1685  int i;
1686  int rec_mii = 0;
1687  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1688
1689  nopa nops = calculate_order_params (g, mii);
1690
1691  order_nodes_of_sccs (sccs, node_order);
1692
1693  if (sccs->num_sccs > 0)
1694    /* First SCC has the largest recurrence_length.  */
1695    rec_mii = sccs->sccs[0]->recurrence_length;
1696
1697  /* Save ASAP before destroying node_order_params.  */
1698  for (i = 0; i < g->num_nodes; i++)
1699    {
1700      ddg_node_ptr v = &g->nodes[i];
1701      v->aux.count = ASAP (v);
1702    }
1703
1704  free (nops);
1705  free_ddg_all_sccs (sccs);
1706  check_nodes_order (node_order, g->num_nodes);
1707
1708  return rec_mii;
1709}
1710
1711static void
1712order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1713{
1714  int i, pos = 0;
1715  ddg_ptr g = all_sccs->ddg;
1716  int num_nodes = g->num_nodes;
1717  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1718  sbitmap on_path = sbitmap_alloc (num_nodes);
1719  sbitmap tmp = sbitmap_alloc (num_nodes);
1720  sbitmap ones = sbitmap_alloc (num_nodes);
1721
1722  sbitmap_zero (prev_sccs);
1723  sbitmap_ones (ones);
1724
1725  /* Perfrom the node ordering starting from the SCC with the highest recMII.
1726     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
1727  for (i = 0; i < all_sccs->num_sccs; i++)
1728    {
1729      ddg_scc_ptr scc = all_sccs->sccs[i];
1730
1731      /* Add nodes on paths from previous SCCs to the current SCC.  */
1732      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1733      sbitmap_a_or_b (tmp, scc->nodes, on_path);
1734
1735      /* Add nodes on paths from the current SCC to previous SCCs.  */
1736      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1737      sbitmap_a_or_b (tmp, tmp, on_path);
1738
1739      /* Remove nodes of previous SCCs from current extended SCC.  */
1740      sbitmap_difference (tmp, tmp, prev_sccs);
1741
1742      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1743      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
1744    }
1745
1746  /* Handle the remaining nodes that do not belong to any scc.  Each call
1747     to order_nodes_in_scc handles a single connected component.  */
1748  while (pos < g->num_nodes)
1749    {
1750      sbitmap_difference (tmp, ones, prev_sccs);
1751      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1752    }
1753  sbitmap_free (prev_sccs);
1754  sbitmap_free (on_path);
1755  sbitmap_free (tmp);
1756  sbitmap_free (ones);
1757}
1758
1759/* MII is needed if we consider backarcs (that do not close recursive cycles).  */
1760static struct node_order_params *
1761calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1762{
1763  int u;
1764  int max_asap;
1765  int num_nodes = g->num_nodes;
1766  ddg_edge_ptr e;
1767  /* Allocate a place to hold ordering params for each node in the DDG.  */
1768  nopa node_order_params_arr;
1769
1770  /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
1771  node_order_params_arr = (nopa) xcalloc (num_nodes,
1772					  sizeof (struct node_order_params));
1773
1774  /* Set the aux pointer of each node to point to its order_params structure.  */
1775  for (u = 0; u < num_nodes; u++)
1776    g->nodes[u].aux.info = &node_order_params_arr[u];
1777
1778  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1779     calculate ASAP, ALAP, mobility, distance, and height for each node
1780     in the dependence (direct acyclic) graph.  */
1781
1782  /* We assume that the nodes in the array are in topological order.  */
1783
1784  max_asap = 0;
1785  for (u = 0; u < num_nodes; u++)
1786    {
1787      ddg_node_ptr u_node = &g->nodes[u];
1788
1789      ASAP (u_node) = 0;
1790      for (e = u_node->in; e; e = e->next_in)
1791	if (e->distance == 0)
1792	  ASAP (u_node) = MAX (ASAP (u_node),
1793			       ASAP (e->src) + e->latency);
1794      max_asap = MAX (max_asap, ASAP (u_node));
1795    }
1796
1797  for (u = num_nodes - 1; u > -1; u--)
1798    {
1799      ddg_node_ptr u_node = &g->nodes[u];
1800
1801      ALAP (u_node) = max_asap;
1802      HEIGHT (u_node) = 0;
1803      for (e = u_node->out; e; e = e->next_out)
1804	if (e->distance == 0)
1805	  {
1806	    ALAP (u_node) = MIN (ALAP (u_node),
1807				 ALAP (e->dest) - e->latency);
1808	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
1809				   HEIGHT (e->dest) + e->latency);
1810	  }
1811    }
1812
1813  return node_order_params_arr;
1814}
1815
1816static int
1817find_max_asap (ddg_ptr g, sbitmap nodes)
1818{
1819  unsigned int u = 0;
1820  int max_asap = -1;
1821  int result = -1;
1822  sbitmap_iterator sbi;
1823
1824  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1825    {
1826      ddg_node_ptr u_node = &g->nodes[u];
1827
1828      if (max_asap < ASAP (u_node))
1829	{
1830	  max_asap = ASAP (u_node);
1831	  result = u;
1832	}
1833    }
1834  return result;
1835}
1836
1837static int
1838find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1839{
1840  unsigned int u = 0;
1841  int max_hv = -1;
1842  int min_mob = INT_MAX;
1843  int result = -1;
1844  sbitmap_iterator sbi;
1845
1846  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1847    {
1848      ddg_node_ptr u_node = &g->nodes[u];
1849
1850      if (max_hv < HEIGHT (u_node))
1851	{
1852	  max_hv = HEIGHT (u_node);
1853	  min_mob = MOB (u_node);
1854	  result = u;
1855	}
1856      else if ((max_hv == HEIGHT (u_node))
1857	       && (min_mob > MOB (u_node)))
1858	{
1859	  min_mob = MOB (u_node);
1860	  result = u;
1861	}
1862    }
1863  return result;
1864}
1865
1866static int
1867find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1868{
1869  unsigned int u = 0;
1870  int max_dv = -1;
1871  int min_mob = INT_MAX;
1872  int result = -1;
1873  sbitmap_iterator sbi;
1874
1875  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1876    {
1877      ddg_node_ptr u_node = &g->nodes[u];
1878
1879      if (max_dv < DEPTH (u_node))
1880	{
1881	  max_dv = DEPTH (u_node);
1882	  min_mob = MOB (u_node);
1883	  result = u;
1884	}
1885      else if ((max_dv == DEPTH (u_node))
1886	       && (min_mob > MOB (u_node)))
1887	{
1888	  min_mob = MOB (u_node);
1889	  result = u;
1890	}
1891    }
1892  return result;
1893}
1894
/* Places the nodes of SCC into the NODE_ORDER array starting
   at position POS, according to the SMS ordering algorithm.
   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
   the NODE_ORDER array, starting from position zero.  Returns the
   updated position in NODE_ORDER (i.e. POS plus the number of nodes
   placed by this call).  */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
		    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  /* Compute the predecessors and successors of the nodes that were
     already ordered; they seed the initial workset below.  */
  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  /* Pick the initial traversal direction: if some already-ordered node
     has a predecessor inside SCC, start bottom-up from those
     predecessors; else if one has a successor inside SCC, start
     top-down from those successors; otherwise seed the workset with the
     max-ASAP node of SCC and go bottom-up.  The _cg variants also tell
     us whether the intersection is non-empty.  */
  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
	SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  /* Alternate between top-down and bottom-up sweeps, draining the
     workset each time, until no unordered nodes of SCC are reachable.  */
  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
	{
	  /* Top-down: repeatedly pick the max-height (min-mobility)
	     node, append it to the order, and add its in-SCC
	     successors to the workset.  */
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      v = find_max_hv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_succs = NODE_SUCCESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_succs, scc);

	      /* Don't consider the already ordered successors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  /* Switch direction; reseed the workset with the in-SCC
	     predecessors of everything ordered so far.  */
	  dir = BOTTOMUP;
	  sbitmap_zero (predecessors);
	  find_predecessors (predecessors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, predecessors, scc);
	}
      else
	{
	  /* Bottom-up: mirror image of the top-down sweep, selecting by
	     max depth and following predecessor edges.  */
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      v = find_max_dv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_preds = NODE_PREDECESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_preds, scc);

	      /* Don't consider the already ordered predecessors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  dir = TOPDOWN;
	  sbitmap_zero (successors);
	  find_successors (successors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, successors, scc);
	}
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
1996
1997
1998/* This page contains functions for manipulating partial-schedules during
1999   modulo scheduling.  */
2000
2001/* Create a partial schedule and allocate a memory to hold II rows.  */
2002
2003static partial_schedule_ptr
2004create_partial_schedule (int ii, ddg_ptr g, int history)
2005{
2006  partial_schedule_ptr ps = XNEW (struct partial_schedule);
2007  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2008  ps->ii = ii;
2009  ps->history = history;
2010  ps->min_cycle = INT_MAX;
2011  ps->max_cycle = INT_MIN;
2012  ps->g = g;
2013
2014  return ps;
2015}
2016
2017/* Free the PS_INSNs in rows array of the given partial schedule.
2018   ??? Consider caching the PS_INSN's.  */
2019static void
2020free_ps_insns (partial_schedule_ptr ps)
2021{
2022  int i;
2023
2024  for (i = 0; i < ps->ii; i++)
2025    {
2026      while (ps->rows[i])
2027	{
2028	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2029
2030	  free (ps->rows[i]);
2031	  ps->rows[i] = ps_insn;
2032	}
2033      ps->rows[i] = NULL;
2034    }
2035}
2036
2037/* Free all the memory allocated to the partial schedule.  */
2038
2039static void
2040free_partial_schedule (partial_schedule_ptr ps)
2041{
2042  if (!ps)
2043    return;
2044  free_ps_insns (ps);
2045  free (ps->rows);
2046  free (ps);
2047}
2048
2049/* Clear the rows array with its PS_INSNs, and create a new one with
2050   NEW_II rows.  */
2051
2052static void
2053reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2054{
2055  if (!ps)
2056    return;
2057  free_ps_insns (ps);
2058  if (new_ii == ps->ii)
2059    return;
2060  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2061						 * sizeof (ps_insn_ptr));
2062  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2063  ps->ii = new_ii;
2064  ps->min_cycle = INT_MAX;
2065  ps->max_cycle = INT_MIN;
2066}
2067
2068/* Prints the partial schedule as an ii rows array, for each rows
2069   print the ids of the insns in it.  */
2070void
2071print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2072{
2073  int i;
2074
2075  for (i = 0; i < ps->ii; i++)
2076    {
2077      ps_insn_ptr ps_i = ps->rows[i];
2078
2079      fprintf (dump, "\n[CYCLE %d ]: ", i);
2080      while (ps_i)
2081	{
2082	  fprintf (dump, "%d, ",
2083		   INSN_UID (ps_i->node->insn));
2084	  ps_i = ps_i->next_in_row;
2085	}
2086    }
2087}
2088
2089/* Creates an object of PS_INSN and initializes it to the given parameters.  */
2090static ps_insn_ptr
2091create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2092{
2093  ps_insn_ptr ps_i = XNEW (struct ps_insn);
2094
2095  ps_i->node = node;
2096  ps_i->next_in_row = NULL;
2097  ps_i->prev_in_row = NULL;
2098  ps_i->row_rest_count = rest_count;
2099  ps_i->cycle = cycle;
2100
2101  return ps_i;
2102}
2103
2104
2105/* Removes the given PS_INSN from the partial schedule.  Returns false if the
2106   node is not found in the partial schedule, else returns true.  */
2107static bool
2108remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2109{
2110  int row;
2111
2112  if (!ps || !ps_i)
2113    return false;
2114
2115  row = SMODULO (ps_i->cycle, ps->ii);
2116  if (! ps_i->prev_in_row)
2117    {
2118      if (ps_i != ps->rows[row])
2119	return false;
2120
2121      ps->rows[row] = ps_i->next_in_row;
2122      if (ps->rows[row])
2123	ps->rows[row]->prev_in_row = NULL;
2124    }
2125  else
2126    {
2127      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2128      if (ps_i->next_in_row)
2129	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2130    }
2131  free (ps_i);
2132  return true;
2133}
2134
2135/* Unlike what literature describes for modulo scheduling (which focuses
2136   on VLIW machines) the order of the instructions inside a cycle is
2137   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2138   where the current instruction should go relative to the already
2139   scheduled instructions in the given cycle.  Go over these
2140   instructions and find the first possible column to put it in.  */
2141static bool
2142ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2143		     sbitmap must_precede, sbitmap must_follow)
2144{
2145  ps_insn_ptr next_ps_i;
2146  ps_insn_ptr first_must_follow = NULL;
2147  ps_insn_ptr last_must_precede = NULL;
2148  int row;
2149
2150  if (! ps_i)
2151    return false;
2152
2153  row = SMODULO (ps_i->cycle, ps->ii);
2154
2155  /* Find the first must follow and the last must precede
2156     and insert the node immediately after the must precede
2157     but make sure that it there is no must follow after it.  */
2158  for (next_ps_i = ps->rows[row];
2159       next_ps_i;
2160       next_ps_i = next_ps_i->next_in_row)
2161    {
2162      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2163	  && ! first_must_follow)
2164        first_must_follow = next_ps_i;
2165      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2166        {
2167          /* If we have already met a node that must follow, then
2168	     there is no possible column.  */
2169  	  if (first_must_follow)
2170            return false;
2171	  else
2172            last_must_precede = next_ps_i;
2173        }
2174    }
2175
2176  /* Now insert the node after INSERT_AFTER_PSI.  */
2177
2178  if (! last_must_precede)
2179    {
2180      ps_i->next_in_row = ps->rows[row];
2181      ps_i->prev_in_row = NULL;
2182      if (ps_i->next_in_row)
2183    	ps_i->next_in_row->prev_in_row = ps_i;
2184      ps->rows[row] = ps_i;
2185    }
2186  else
2187    {
2188      ps_i->next_in_row = last_must_precede->next_in_row;
2189      last_must_precede->next_in_row = ps_i;
2190      ps_i->prev_in_row = last_must_precede;
2191      if (ps_i->next_in_row)
2192        ps_i->next_in_row->prev_in_row = ps_i;
2193    }
2194
2195  return true;
2196}
2197
/* Advances the PS_INSN one column in its current row; returns false
   in failure and true in success.  Bit N is set in MUST_FOLLOW if
   the node with cuid N must be come after the node pointed to by
   PS_I when scheduled in the same cycle.  */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
			sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Already at the last column of its row: nowhere to advance to.  */
  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.
     The swap is performed by relinking four pointers: PREV -> NEXT ->
     PS_I -> (old next->next).  The order of the updates below matters;
     do not reorder them.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  /* If PS_I headed the row, NEXT becomes the new head.  */
  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}
2246
2247/* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2248   Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
2249   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2250   before/after (respectively) the node pointed to by PS_I when scheduled
2251   in the same cycle.  */
2252static ps_insn_ptr
2253add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2254		sbitmap must_precede, sbitmap must_follow)
2255{
2256  ps_insn_ptr ps_i;
2257  int rest_count = 1;
2258  int row = SMODULO (cycle, ps->ii);
2259
2260  if (ps->rows[row]
2261      && ps->rows[row]->row_rest_count >= issue_rate)
2262    return NULL;
2263
2264  if (ps->rows[row])
2265    rest_count += ps->rows[row]->row_rest_count;
2266
2267  ps_i = create_ps_insn (node, rest_count, cycle);
2268
2269  /* Finds and inserts PS_I according to MUST_FOLLOW and
2270     MUST_PRECEDE.  */
2271  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2272    {
2273      free (ps_i);
2274      return NULL;
2275    }
2276
2277  return ps_i;
2278}
2279
2280/* Advance time one cycle.  Assumes DFA is being used.  */
2281static void
2282advance_one_cycle (void)
2283{
2284  if (targetm.sched.dfa_pre_cycle_insn)
2285    state_transition (curr_state,
2286		      targetm.sched.dfa_pre_cycle_insn ());
2287
2288  state_transition (curr_state, NULL);
2289
2290  if (targetm.sched.dfa_post_cycle_insn)
2291    state_transition (curr_state,
2292		      targetm.sched.dfa_post_cycle_insn ());
2293}
2294
/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
   the number of cycles according to DFA that the kernel fits in,
   we use this to check if we done well with SMS after we add
   register moves.  In some cases register moves overhead makes
   it even worse than the original loop.  We want SMS to be performed
   when it gives less cycles after register moves are added.  */
static int
kernel_number_of_cycles (rtx first_insn, rtx last_insn)
{
  int cycles = 0;
  rtx insn;
  int can_issue_more = issue_rate;

  state_reset (curr_state);

  for (insn = first_insn;
       insn != NULL_RTX && insn != last_insn;
       insn = NEXT_INSN (insn))
    {
      /* Skip notes, labels and USEs; they consume no issue slots.  */
      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
	continue;

      /* Check if there is room for the current insn.  */
      if (!can_issue_more || state_dead_lock_p (curr_state))
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

      /* Update the DFA state; if the DFA found a resource conflict,
	 start a new cycle and retry the insn there.  */
      if (state_transition (curr_state, insn) >= 0)
	{
	  cycles ++;
	  advance_one_cycle ();
	  can_issue_more = issue_rate;
	}

      if (targetm.sched.variable_issue)
	can_issue_more =
	  targetm.sched.variable_issue (sched_dump, sched_verbose,
					insn, can_issue_more);
      /* A naked CLOBBER or USE generates no instruction, so don't
	 let them consume issue slots.  */
      else if (GET_CODE (PATTERN (insn)) != USE
	       && GET_CODE (PATTERN (insn)) != CLOBBER)
	can_issue_more--;
    }
  return cycles;
}
2346
/* Checks if PS has resource conflicts according to DFA, starting from
   FROM cycle to TO cycle; returns true if there are conflicts and false
   if there are no conflicts.  Assumes DFA is being used.  */
static int
ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
{
  int cycle;

  state_reset (curr_state);

  for (cycle = from; cycle <= to; cycle++)
    {
      ps_insn_ptr crr_insn;
      /* Holds the remaining issue slots in the current row.  */
      int can_issue_more = issue_rate;

      /* Walk through the DFA for the current row.  */
      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
	   crr_insn;
	   crr_insn = crr_insn->next_in_row)
	{
	  rtx insn = crr_insn->node->insn;

	  if (!INSN_P (insn))
	    continue;

	  /* Check if there is room for the current insn.  */
	  if (!can_issue_more || state_dead_lock_p (curr_state))
	    return true;

	  /* Update the DFA state and return with failure if the DFA found
	     resource conflicts.  */
	  if (state_transition (curr_state, insn) >= 0)
	    return true;

	  if (targetm.sched.variable_issue)
	    can_issue_more =
	      targetm.sched.variable_issue (sched_dump, sched_verbose,
					    insn, can_issue_more);
	  /* A naked CLOBBER or USE generates no instruction, so don't
	     let them consume issue slots.  */
	  else if (GET_CODE (PATTERN (insn)) != USE
		   && GET_CODE (PATTERN (insn)) != CLOBBER)
	    can_issue_more--;
	}

      /* Advance the DFA to the next cycle.  */
      advance_one_cycle ();
    }
  return false;
}
2398
2399/* Checks if the given node causes resource conflicts when added to PS at
2400   cycle C.  If not the node is added to PS and returned; otherwise zero
2401   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2402   cuid N must be come before/after (respectively) the node pointed to by
2403   PS_I when scheduled in the same cycle.  */
2404ps_insn_ptr
2405ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2406   			     int c, sbitmap must_precede,
2407			     sbitmap must_follow)
2408{
2409  int has_conflicts = 0;
2410  ps_insn_ptr ps_i;
2411
2412  /* First add the node to the PS, if this succeeds check for
2413     conflicts, trying different issue slots in the same row.  */
2414  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2415    return NULL; /* Failed to insert the node at the given cycle.  */
2416
2417  has_conflicts = ps_has_conflicts (ps, c, c)
2418		  || (ps->history > 0
2419		      && ps_has_conflicts (ps,
2420					   c - ps->history,
2421					   c + ps->history));
2422
2423  /* Try different issue slots to find one that the given node can be
2424     scheduled in without conflicts.  */
2425  while (has_conflicts)
2426    {
2427      if (! ps_insn_advance_column (ps, ps_i, must_follow))
2428	break;
2429      has_conflicts = ps_has_conflicts (ps, c, c)
2430		      || (ps->history > 0
2431			  && ps_has_conflicts (ps,
2432					       c - ps->history,
2433					       c + ps->history));
2434    }
2435
2436  if (has_conflicts)
2437    {
2438      remove_node_from_ps (ps, ps_i);
2439      return NULL;
2440    }
2441
2442  ps->min_cycle = MIN (ps->min_cycle, c);
2443  ps->max_cycle = MAX (ps->max_cycle, c);
2444  return ps_i;
2445}
2446
2447/* Rotate the rows of PS such that insns scheduled at time
2448   START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
2449void
2450rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2451{
2452  int i, row, backward_rotates;
2453  int last_row = ps->ii - 1;
2454
2455  if (start_cycle == 0)
2456    return;
2457
2458  backward_rotates = SMODULO (start_cycle, ps->ii);
2459
2460  /* Revisit later and optimize this into a single loop.  */
2461  for (i = 0; i < backward_rotates; i++)
2462    {
2463      ps_insn_ptr first_row = ps->rows[0];
2464
2465      for (row = 0; row < last_row; row++)
2466	ps->rows[row] = ps->rows[row+1];
2467
2468      ps->rows[last_row] = first_row;
2469    }
2470
2471  ps->max_cycle -= start_cycle;
2472  ps->min_cycle -= start_cycle;
2473}
2474
2475/* Remove the node N from the partial schedule PS; because we restart the DFA
2476   each time we want to check for resource conflicts; this is equivalent to
2477   unscheduling the node N.  */
2478static bool
2479ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2480{
2481  ps_insn_ptr ps_i;
2482  int row = SMODULO (SCHED_TIME (n), ps->ii);
2483
2484  if (row < 0 || row > ps->ii)
2485    return false;
2486
2487  for (ps_i = ps->rows[row];
2488       ps_i &&  ps_i->node != n;
2489       ps_i = ps_i->next_in_row);
2490  if (!ps_i)
2491    return false;
2492
2493  return remove_node_from_ps (ps, ps_i);
2494}
2495#endif /* INSN_SCHEDULING */
2496
2497static bool
2498gate_handle_sms (void)
2499{
2500  return (optimize > 0 && flag_modulo_sched);
2501}
2502
2503
/* Run instruction scheduler.  */
/* Perform SMS module scheduling.  Entry point of the pass; returns 0
   (no additional TODO flags).  When the insn scheduler is not built in,
   this is a no-op.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* We want to be able to create new pseudos.  */
  no_new_pseudos = 0;
  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();
  allocate_reg_info (max_regno, FALSE, FALSE);
  update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
                    (PROP_DEATH_NOTES
                     | PROP_REG_INFO
                     | PROP_KILL_DEAD_CODE
                     | PROP_SCAN_DEAD_CODE));

  /* Re-freeze pseudo creation now that scheduling is done.  */
  no_new_pseudos = 1;

  /* Finalize layout changes: chain the blocks in their current order
     via bb->aux before leaving cfglayout mode.  */
  FOR_EACH_BB (bb)
    if (bb->next_bb != EXIT_BLOCK_PTR)
      bb->aux = bb->next_bb;
  cfg_layout_finalize ();
  /* Dominance info was computed for loop discovery; drop it now.  */
  free_dominance_info (CDI_DOMINATORS);
#endif /* INSN_SCHEDULING */
  return 0;
}
2538
/* Pass descriptor for swing modulo scheduling; registered with the
   pass manager and gated by gate_handle_sms.  */
struct tree_opt_pass pass_sms =
{
  "sms",                                /* name */
  gate_handle_sms,                      /* gate */
  rest_of_handle_sms,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_SMS,                               /* tv_id */
  0,                                    /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  TODO_dump_func,                       /* todo_flags_start */
  TODO_dump_func |
  TODO_ggc_collect,                     /* todo_flags_finish */
  'm'                                   /* letter */
};
2556
2557