1/* Global, SSA-based optimizations using mathematical identities.
2   Copyright (C) 2005-2015 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3.  If not see
18<http://www.gnu.org/licenses/>.  */
19
20/* Currently, the only mini-pass in this file tries to CSE reciprocal
21   operations.  These are common in sequences such as this one:
22
23	modulus = sqrt(x*x + y*y + z*z);
24	x = x / modulus;
25	y = y / modulus;
26	z = z / modulus;
27
28   that can be optimized to
29
30	modulus = sqrt(x*x + y*y + z*z);
31        rmodulus = 1.0 / modulus;
32	x = x * rmodulus;
33	y = y * rmodulus;
34	z = z * rmodulus;
35
36   We do this for loop invariant divisors, and with this pass whenever
37   we notice that a division has the same divisor multiple times.
38
39   Of course, like in PRE, we don't insert a division if a dominator
40   already has one.  However, this cannot be done as an extension of
41   PRE for several reasons.
42
43   First of all, with some experiments it was found out that the
44   transformation is not always useful if there are only two divisions
45   hy the same divisor.  This is probably because modern processors
46   can pipeline the divisions; on older, in-order processors it should
47   still be effective to optimize two divisions by the same number.
48   We make this a param, and it shall be called N in the remainder of
49   this comment.
50
51   Second, if trapping math is active, we have less freedom on where
52   to insert divisions: we can only do so in basic blocks that already
53   contain one.  (If divisions don't trap, instead, we can insert
54   divisions elsewhere, which will be in blocks that are common dominators
55   of those that have the division).
56
57   We really don't want to compute the reciprocal unless a division will
58   be found.  To do this, we won't insert the division in a basic block
59   that has less than N divisions *post-dominating* it.
60
61   The algorithm constructs a subset of the dominator tree, holding the
62   blocks containing the divisions and the common dominators to them,
63   and walk it twice.  The first walk is in post-order, and it annotates
64   each block with the number of divisions that post-dominate it: this
65   gives information on where divisions can be inserted profitably.
66   The second walk is in pre-order, and it inserts divisions as explained
67   above, and replaces divisions by multiplications.
68
69   In the best case, the cost of the pass is O(n_statements).  In the
70   worst-case, the cost is due to creating the dominator tree subset,
71   with a cost of O(n_basic_blocks ^ 2); however this can only happen
72   for n_statements / n_basic_blocks statements.  So, the amortized cost
73   of creating the dominator tree subset is O(n_basic_blocks) and the
74   worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76   More practically, the cost will be small because there are few
77   divisions, and they tend to be in the same basic block, so insert_bb
78   is called very few times.
79
80   If we did this using domwalk.c, an efficient implementation would have
81   to work on all the variables in a single pass, because we could not
82   work on just a subset of the dominator tree, as we do now, and the
83   cost would also be something like O(n_statements * n_basic_blocks).
84   The data structures would be more complex in order to work on all the
85   variables in a single pass.  */
86
87#include "config.h"
88#include "system.h"
89#include "coretypes.h"
90#include "tm.h"
91#include "flags.h"
92#include "hash-set.h"
93#include "machmode.h"
94#include "vec.h"
95#include "double-int.h"
96#include "input.h"
97#include "alias.h"
98#include "symtab.h"
99#include "wide-int.h"
100#include "inchash.h"
101#include "tree.h"
102#include "fold-const.h"
103#include "predict.h"
104#include "hard-reg-set.h"
105#include "function.h"
106#include "dominance.h"
107#include "cfg.h"
108#include "basic-block.h"
109#include "tree-ssa-alias.h"
110#include "internal-fn.h"
111#include "gimple-fold.h"
112#include "gimple-expr.h"
113#include "is-a.h"
114#include "gimple.h"
115#include "gimple-iterator.h"
116#include "gimplify.h"
117#include "gimplify-me.h"
118#include "stor-layout.h"
119#include "gimple-ssa.h"
120#include "tree-cfg.h"
121#include "tree-phinodes.h"
122#include "ssa-iterators.h"
123#include "stringpool.h"
124#include "tree-ssanames.h"
125#include "hashtab.h"
126#include "rtl.h"
127#include "statistics.h"
128#include "real.h"
129#include "fixed-value.h"
130#include "insn-config.h"
131#include "expmed.h"
132#include "dojump.h"
133#include "explow.h"
134#include "calls.h"
135#include "emit-rtl.h"
136#include "varasm.h"
137#include "stmt.h"
138#include "expr.h"
139#include "tree-dfa.h"
140#include "tree-ssa.h"
141#include "tree-pass.h"
142#include "alloc-pool.h"
143#include "target.h"
144#include "gimple-pretty-print.h"
145#include "builtins.h"
146
147/* FIXME: RTL headers have to be included here for optabs.  */
148#include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
149#include "expr.h"		/* Because optabs.h wants sepops.  */
150#include "insn-codes.h"
151#include "optabs.h"
152
153/* This structure represents one basic block that either computes a
154   division, or is a common dominator for basic block that compute a
155   division.  */
156struct occurrence {
157  /* The basic block represented by this structure.  */
158  basic_block bb;
159
160  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
161     inserted in BB.  */
162  tree recip_def;
163
164  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
165     was inserted in BB.  */
166  gimple recip_def_stmt;
167
168  /* Pointer to a list of "struct occurrence"s for blocks dominated
169     by BB.  */
170  struct occurrence *children;
171
172  /* Pointer to the next "struct occurrence"s in the list of blocks
173     sharing a common dominator.  */
174  struct occurrence *next;
175
176  /* The number of divisions that are in BB before compute_merit.  The
177     number of divisions that are in BB or post-dominate it after
178     compute_merit.  */
179  int num_divisions;
180
181  /* True if the basic block has a division, false if it is a common
182     dominator for basic blocks that do.  If it is false and trapping
183     math is active, BB is not a candidate for inserting a reciprocal.  */
184  bool bb_has_division;
185};
186
187static struct
188{
189  /* Number of 1.0/X ops inserted.  */
190  int rdivs_inserted;
191
192  /* Number of 1.0/FUNC ops inserted.  */
193  int rfuncs_inserted;
194} reciprocal_stats;
195
196static struct
197{
198  /* Number of cexpi calls inserted.  */
199  int inserted;
200} sincos_stats;
201
202static struct
203{
204  /* Number of hand-written 16-bit nop / bswaps found.  */
205  int found_16bit;
206
207  /* Number of hand-written 32-bit nop / bswaps found.  */
208  int found_32bit;
209
210  /* Number of hand-written 64-bit nop / bswaps found.  */
211  int found_64bit;
212} nop_stats, bswap_stats;
213
214static struct
215{
216  /* Number of widening multiplication ops inserted.  */
217  int widen_mults_inserted;
218
219  /* Number of integer multiply-and-accumulate ops inserted.  */
220  int maccs_inserted;
221
222  /* Number of fp fused multiply-add ops inserted.  */
223  int fmas_inserted;
224} widen_mul_stats;
225
226/* The instance of "struct occurrence" representing the highest
227   interesting block in the dominator tree.  */
228static struct occurrence *occ_head;
229
230/* Allocation pool for getting instances of "struct occurrence".  */
231static alloc_pool occ_pool;
232
233
234
235/* Allocate and return a new struct occurrence for basic block BB, and
236   whose children list is headed by CHILDREN.  */
237static struct occurrence *
238occ_new (basic_block bb, struct occurrence *children)
239{
240  struct occurrence *occ;
241
242  bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
243  memset (occ, 0, sizeof (struct occurrence));
244
245  occ->bb = bb;
246  occ->children = children;
247  return occ;
248}
249
250
251/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
252   list of "struct occurrence"s, one per basic block, having IDOM as
253   their common dominator.
254
255   We try to insert NEW_OCC as deep as possible in the tree, and we also
256   insert any other block that is a common dominator for BB and one
257   block already in the tree.  */
258
259static void
260insert_bb (struct occurrence *new_occ, basic_block idom,
261	   struct occurrence **p_head)
262{
263  struct occurrence *occ, **p_occ;
264
265  for (p_occ = p_head; (occ = *p_occ) != NULL; )
266    {
267      basic_block bb = new_occ->bb, occ_bb = occ->bb;
268      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
269      if (dom == bb)
270	{
271	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
272	     from its list.  */
273	  *p_occ = occ->next;
274	  occ->next = new_occ->children;
275	  new_occ->children = occ;
276
277	  /* Try the next block (it may as well be dominated by BB).  */
278	}
279
280      else if (dom == occ_bb)
281	{
282	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
283	  insert_bb (new_occ, dom, &occ->children);
284	  return;
285	}
286
287      else if (dom != idom)
288	{
289	  gcc_assert (!dom->aux);
290
291	  /* There is a dominator between IDOM and BB, add it and make
292	     two children out of NEW_OCC and OCC.  First, remove OCC from
293	     its list.	*/
294	  *p_occ = occ->next;
295	  new_occ->next = occ;
296	  occ->next = NULL;
297
298	  /* None of the previous blocks has DOM as a dominator: if we tail
299	     recursed, we would reexamine them uselessly. Just switch BB with
300	     DOM, and go on looking for blocks dominated by DOM.  */
301          new_occ = occ_new (dom, new_occ);
302	}
303
304      else
305	{
306	  /* Nothing special, go on with the next element.  */
307	  p_occ = &occ->next;
308	}
309    }
310
311  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
312  new_occ->next = *p_head;
313  *p_head = new_occ;
314}
315
316/* Register that we found a division in BB.  */
317
318static inline void
319register_division_in (basic_block bb)
320{
321  struct occurrence *occ;
322
323  occ = (struct occurrence *) bb->aux;
324  if (!occ)
325    {
326      occ = occ_new (bb, NULL);
327      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
328    }
329
330  occ->bb_has_division = true;
331  occ->num_divisions++;
332}
333
334
335/* Compute the number of divisions that postdominate each block in OCC and
336   its children.  */
337
338static void
339compute_merit (struct occurrence *occ)
340{
341  struct occurrence *occ_child;
342  basic_block dom = occ->bb;
343
344  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
345    {
346      basic_block bb;
347      if (occ_child->children)
348        compute_merit (occ_child);
349
350      if (flag_exceptions)
351	bb = single_noncomplex_succ (dom);
352      else
353	bb = dom;
354
355      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
356        occ->num_divisions += occ_child->num_divisions;
357    }
358}
359
360
361/* Return whether USE_STMT is a floating-point division by DEF.  */
362static inline bool
363is_division_by (gimple use_stmt, tree def)
364{
365  return is_gimple_assign (use_stmt)
366	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
367	 && gimple_assign_rhs2 (use_stmt) == def
368	 /* Do not recognize x / x as valid division, as we are getting
369	    confused later by replacing all immediate uses x in such
370	    a stmt.  */
371	 && gimple_assign_rhs1 (use_stmt) != def;
372}
373
374/* Walk the subset of the dominator tree rooted at OCC, setting the
375   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
376   the given basic block.  The field may be left NULL, of course,
377   if it is not possible or profitable to do the optimization.
378
379   DEF_BSI is an iterator pointing at the statement defining DEF.
380   If RECIP_DEF is set, a dominator already has a computation that can
381   be used.  */
382
383static void
384insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
385		    tree def, tree recip_def, int threshold)
386{
387  tree type;
388  gassign *new_stmt;
389  gimple_stmt_iterator gsi;
390  struct occurrence *occ_child;
391
392  if (!recip_def
393      && (occ->bb_has_division || !flag_trapping_math)
394      && occ->num_divisions >= threshold)
395    {
396      /* Make a variable with the replacement and substitute it.  */
397      type = TREE_TYPE (def);
398      recip_def = create_tmp_reg (type, "reciptmp");
399      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
400				      build_one_cst (type), def);
401
402      if (occ->bb_has_division)
403        {
404          /* Case 1: insert before an existing division.  */
405          gsi = gsi_after_labels (occ->bb);
406          while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
407	    gsi_next (&gsi);
408
409          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
410        }
411      else if (def_gsi && occ->bb == def_gsi->bb)
412        {
413          /* Case 2: insert right after the definition.  Note that this will
414	     never happen if the definition statement can throw, because in
415	     that case the sole successor of the statement's basic block will
416	     dominate all the uses as well.  */
417          gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
418        }
419      else
420        {
421          /* Case 3: insert in a basic block not containing defs/uses.  */
422          gsi = gsi_after_labels (occ->bb);
423          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
424        }
425
426      reciprocal_stats.rdivs_inserted++;
427
428      occ->recip_def_stmt = new_stmt;
429    }
430
431  occ->recip_def = recip_def;
432  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
433    insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
434}
435
436
437/* Replace the division at USE_P with a multiplication by the reciprocal, if
438   possible.  */
439
440static inline void
441replace_reciprocal (use_operand_p use_p)
442{
443  gimple use_stmt = USE_STMT (use_p);
444  basic_block bb = gimple_bb (use_stmt);
445  struct occurrence *occ = (struct occurrence *) bb->aux;
446
447  if (optimize_bb_for_speed_p (bb)
448      && occ->recip_def && use_stmt != occ->recip_def_stmt)
449    {
450      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
451      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
452      SET_USE (use_p, occ->recip_def);
453      fold_stmt_inplace (&gsi);
454      update_stmt (use_stmt);
455    }
456}
457
458
459/* Free OCC and return one more "struct occurrence" to be freed.  */
460
461static struct occurrence *
462free_bb (struct occurrence *occ)
463{
464  struct occurrence *child, *next;
465
466  /* First get the two pointers hanging off OCC.  */
467  next = occ->next;
468  child = occ->children;
469  occ->bb->aux = NULL;
470  pool_free (occ_pool, occ);
471
472  /* Now ensure that we don't recurse unless it is necessary.  */
473  if (!child)
474    return next;
475  else
476    {
477      while (next)
478	next = free_bb (next);
479
480      return child;
481    }
482}
483
484
485/* Look for floating-point divisions among DEF's uses, and try to
486   replace them by multiplications with the reciprocal.  Add
487   as many statements computing the reciprocal as needed.
488
489   DEF must be a GIMPLE register of a floating-point type.  */
490
491static void
492execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
493{
494  use_operand_p use_p;
495  imm_use_iterator use_iter;
496  struct occurrence *occ;
497  int count = 0, threshold;
498
499  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
500
501  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
502    {
503      gimple use_stmt = USE_STMT (use_p);
504      if (is_division_by (use_stmt, def))
505	{
506	  register_division_in (gimple_bb (use_stmt));
507	  count++;
508	}
509    }
510
511  /* Do the expensive part only if we can hope to optimize something.  */
512  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
513  if (count >= threshold)
514    {
515      gimple use_stmt;
516      for (occ = occ_head; occ; occ = occ->next)
517	{
518	  compute_merit (occ);
519	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
520	}
521
522      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
523	{
524	  if (is_division_by (use_stmt, def))
525	    {
526	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
527		replace_reciprocal (use_p);
528	    }
529	}
530    }
531
532  for (occ = occ_head; occ; )
533    occ = free_bb (occ);
534
535  occ_head = NULL;
536}
537
538/* Go through all the floating-point SSA_NAMEs, and call
539   execute_cse_reciprocals_1 on each of them.  */
540namespace {
541
542const pass_data pass_data_cse_reciprocals =
543{
544  GIMPLE_PASS, /* type */
545  "recip", /* name */
546  OPTGROUP_NONE, /* optinfo_flags */
547  TV_NONE, /* tv_id */
548  PROP_ssa, /* properties_required */
549  0, /* properties_provided */
550  0, /* properties_destroyed */
551  0, /* todo_flags_start */
552  TODO_update_ssa, /* todo_flags_finish */
553};
554
555class pass_cse_reciprocals : public gimple_opt_pass
556{
557public:
558  pass_cse_reciprocals (gcc::context *ctxt)
559    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
560  {}
561
562  /* opt_pass methods: */
563  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
564  virtual unsigned int execute (function *);
565
566}; // class pass_cse_reciprocals
567
568unsigned int
569pass_cse_reciprocals::execute (function *fun)
570{
571  basic_block bb;
572  tree arg;
573
574  occ_pool = create_alloc_pool ("dominators for recip",
575				sizeof (struct occurrence),
576				n_basic_blocks_for_fn (fun) / 3 + 1);
577
578  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
579  calculate_dominance_info (CDI_DOMINATORS);
580  calculate_dominance_info (CDI_POST_DOMINATORS);
581
582#ifdef ENABLE_CHECKING
583  FOR_EACH_BB_FN (bb, fun)
584    gcc_assert (!bb->aux);
585#endif
586
587  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
588    if (FLOAT_TYPE_P (TREE_TYPE (arg))
589	&& is_gimple_reg (arg))
590      {
591	tree name = ssa_default_def (fun, arg);
592	if (name)
593	  execute_cse_reciprocals_1 (NULL, name);
594      }
595
596  FOR_EACH_BB_FN (bb, fun)
597    {
598      tree def;
599
600      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
601	   gsi_next (&gsi))
602	{
603	  gphi *phi = gsi.phi ();
604	  def = PHI_RESULT (phi);
605	  if (! virtual_operand_p (def)
606	      && FLOAT_TYPE_P (TREE_TYPE (def)))
607	    execute_cse_reciprocals_1 (NULL, def);
608	}
609
610      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
611	   gsi_next (&gsi))
612        {
613	  gimple stmt = gsi_stmt (gsi);
614
615	  if (gimple_has_lhs (stmt)
616	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
617	      && FLOAT_TYPE_P (TREE_TYPE (def))
618	      && TREE_CODE (def) == SSA_NAME)
619	    execute_cse_reciprocals_1 (&gsi, def);
620	}
621
622      if (optimize_bb_for_size_p (bb))
623        continue;
624
625      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
626      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
627	   gsi_next (&gsi))
628        {
629	  gimple stmt = gsi_stmt (gsi);
630	  tree fndecl;
631
632	  if (is_gimple_assign (stmt)
633	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
634	    {
635	      tree arg1 = gimple_assign_rhs2 (stmt);
636	      gimple stmt1;
637
638	      if (TREE_CODE (arg1) != SSA_NAME)
639		continue;
640
641	      stmt1 = SSA_NAME_DEF_STMT (arg1);
642
643	      if (is_gimple_call (stmt1)
644		  && gimple_call_lhs (stmt1)
645		  && (fndecl = gimple_call_fndecl (stmt1))
646		  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
647		      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
648		{
649		  enum built_in_function code;
650		  bool md_code, fail;
651		  imm_use_iterator ui;
652		  use_operand_p use_p;
653
654		  code = DECL_FUNCTION_CODE (fndecl);
655		  md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
656
657		  fndecl = targetm.builtin_reciprocal (code, md_code, false);
658		  if (!fndecl)
659		    continue;
660
661		  /* Check that all uses of the SSA name are divisions,
662		     otherwise replacing the defining statement will do
663		     the wrong thing.  */
664		  fail = false;
665		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
666		    {
667		      gimple stmt2 = USE_STMT (use_p);
668		      if (is_gimple_debug (stmt2))
669			continue;
670		      if (!is_gimple_assign (stmt2)
671			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
672			  || gimple_assign_rhs1 (stmt2) == arg1
673			  || gimple_assign_rhs2 (stmt2) != arg1)
674			{
675			  fail = true;
676			  break;
677			}
678		    }
679		  if (fail)
680		    continue;
681
682		  gimple_replace_ssa_lhs (stmt1, arg1);
683		  gimple_call_set_fndecl (stmt1, fndecl);
684		  update_stmt (stmt1);
685		  reciprocal_stats.rfuncs_inserted++;
686
687		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
688		    {
689		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
690		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
691		      fold_stmt_inplace (&gsi);
692		      update_stmt (stmt);
693		    }
694		}
695	    }
696	}
697    }
698
699  statistics_counter_event (fun, "reciprocal divs inserted",
700			    reciprocal_stats.rdivs_inserted);
701  statistics_counter_event (fun, "reciprocal functions inserted",
702			    reciprocal_stats.rfuncs_inserted);
703
704  free_dominance_info (CDI_DOMINATORS);
705  free_dominance_info (CDI_POST_DOMINATORS);
706  free_alloc_pool (occ_pool);
707  return 0;
708}
709
710} // anon namespace
711
712gimple_opt_pass *
713make_pass_cse_reciprocals (gcc::context *ctxt)
714{
715  return new pass_cse_reciprocals (ctxt);
716}
717
718/* Records an occurrence at statement USE_STMT in the vector of trees
719   STMTS if it is dominated by *TOP_BB or dominates it or this basic block
720   is not yet initialized.  Returns true if the occurrence was pushed on
721   the vector.  Adjusts *TOP_BB to be the basic block dominating all
722   statements in the vector.  */
723
724static bool
725maybe_record_sincos (vec<gimple> *stmts,
726		     basic_block *top_bb, gimple use_stmt)
727{
728  basic_block use_bb = gimple_bb (use_stmt);
729  if (*top_bb
730      && (*top_bb == use_bb
731	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
732    stmts->safe_push (use_stmt);
733  else if (!*top_bb
734	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
735    {
736      stmts->safe_push (use_stmt);
737      *top_bb = use_bb;
738    }
739  else
740    return false;
741
742  return true;
743}
744
745/* Look for sin, cos and cexpi calls with the same argument NAME and
746   create a single call to cexpi CSEing the result in this case.
747   We first walk over all immediate uses of the argument collecting
748   statements that we can CSE in a vector and in a second pass replace
749   the statement rhs with a REALPART or IMAGPART expression on the
750   result of the cexpi call we insert before the use statement that
751   dominates all other candidates.  */
752
753static bool
754execute_cse_sincos_1 (tree name)
755{
756  gimple_stmt_iterator gsi;
757  imm_use_iterator use_iter;
758  tree fndecl, res, type;
759  gimple def_stmt, use_stmt, stmt;
760  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
761  auto_vec<gimple> stmts;
762  basic_block top_bb = NULL;
763  int i;
764  bool cfg_changed = false;
765
766  type = TREE_TYPE (name);
767  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
768    {
769      if (gimple_code (use_stmt) != GIMPLE_CALL
770	  || !gimple_call_lhs (use_stmt)
771	  || !(fndecl = gimple_call_fndecl (use_stmt))
772	  || !gimple_call_builtin_p (use_stmt, BUILT_IN_NORMAL))
773	continue;
774
775      switch (DECL_FUNCTION_CODE (fndecl))
776	{
777	CASE_FLT_FN (BUILT_IN_COS):
778	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
779	  break;
780
781	CASE_FLT_FN (BUILT_IN_SIN):
782	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
783	  break;
784
785	CASE_FLT_FN (BUILT_IN_CEXPI):
786	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
787	  break;
788
789	default:;
790	}
791    }
792
793  if (seen_cos + seen_sin + seen_cexpi <= 1)
794    return false;
795
796  /* Simply insert cexpi at the beginning of top_bb but not earlier than
797     the name def statement.  */
798  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
799  if (!fndecl)
800    return false;
801  stmt = gimple_build_call (fndecl, 1, name);
802  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
803  gimple_call_set_lhs (stmt, res);
804
805  def_stmt = SSA_NAME_DEF_STMT (name);
806  if (!SSA_NAME_IS_DEFAULT_DEF (name)
807      && gimple_code (def_stmt) != GIMPLE_PHI
808      && gimple_bb (def_stmt) == top_bb)
809    {
810      gsi = gsi_for_stmt (def_stmt);
811      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
812    }
813  else
814    {
815      gsi = gsi_after_labels (top_bb);
816      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
817    }
818  sincos_stats.inserted++;
819
820  /* And adjust the recorded old call sites.  */
821  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
822    {
823      tree rhs = NULL;
824      fndecl = gimple_call_fndecl (use_stmt);
825
826      switch (DECL_FUNCTION_CODE (fndecl))
827	{
828	CASE_FLT_FN (BUILT_IN_COS):
829	  rhs = fold_build1 (REALPART_EXPR, type, res);
830	  break;
831
832	CASE_FLT_FN (BUILT_IN_SIN):
833	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
834	  break;
835
836	CASE_FLT_FN (BUILT_IN_CEXPI):
837	  rhs = res;
838	  break;
839
840	default:;
841	  gcc_unreachable ();
842	}
843
844	/* Replace call with a copy.  */
845	stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
846
847	gsi = gsi_for_stmt (use_stmt);
848	gsi_replace (&gsi, stmt, true);
849	if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
850	  cfg_changed = true;
851    }
852
853  return cfg_changed;
854}
855
856/* To evaluate powi(x,n), the floating point value x raised to the
857   constant integer exponent n, we use a hybrid algorithm that
858   combines the "window method" with look-up tables.  For an
859   introduction to exponentiation algorithms and "addition chains",
860   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
861   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
862   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
863   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
864
865/* Provide a default value for POWI_MAX_MULTS, the maximum number of
866   multiplications to inline before calling the system library's pow
867   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
868   so this default never requires calling pow, powf or powl.  */
869
870#ifndef POWI_MAX_MULTS
871#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
872#endif
873
874/* The size of the "optimal power tree" lookup table.  All
875   exponents less than this value are simply looked up in the
876   powi_table below.  This threshold is also used to size the
877   cache of pseudo registers that hold intermediate results.  */
878#define POWI_TABLE_SIZE 256
879
880/* The size, in bits of the window, used in the "window method"
881   exponentiation algorithm.  This is equivalent to a radix of
882   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
883#define POWI_WINDOW_SIZE 3
884
885/* The following table is an efficient representation of an
886   "optimal power tree".  For each value, i, the corresponding
887   value, j, in the table states than an optimal evaluation
888   sequence for calculating pow(x,i) can be found by evaluating
889   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
890   100 integers is given in Knuth's "Seminumerical algorithms".  */
891
892static const unsigned char powi_table[POWI_TABLE_SIZE] =
893  {
894      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
895      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
896      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
897     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
898     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
899     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
900     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
901     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
902     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
903     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
904     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
905     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
906     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
907     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
908     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
909     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
910     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
911     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
912     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
913     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
914     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
915     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
916     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
917     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
918     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
919    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
920    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
921    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
922    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
923    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
924    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
925    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
926  };
927
928
929/* Return the number of multiplications required to calculate
930   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
931   subroutine of powi_cost.  CACHE is an array indicating
932   which exponents have already been calculated.  */
933
934static int
935powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
936{
937  /* If we've already calculated this exponent, then this evaluation
938     doesn't require any additional multiplications.  */
939  if (cache[n])
940    return 0;
941
942  cache[n] = true;
943  return powi_lookup_cost (n - powi_table[n], cache)
944	 + powi_lookup_cost (powi_table[n], cache) + 1;
945}
946
947/* Return the number of multiplications required to calculate
948   powi(x,n) for an arbitrary x, given the exponent N.  This
949   function needs to be kept in sync with powi_as_mults below.  */
950
951static int
952powi_cost (HOST_WIDE_INT n)
953{
954  bool cache[POWI_TABLE_SIZE];
955  unsigned HOST_WIDE_INT digit;
956  unsigned HOST_WIDE_INT val;
957  int result;
958
959  if (n == 0)
960    return 0;
961
962  /* Ignore the reciprocal when calculating the cost.  */
963  val = (n < 0) ? -n : n;
964
965  /* Initialize the exponent cache.  */
966  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
967  cache[1] = true;
968
969  result = 0;
970
971  while (val >= POWI_TABLE_SIZE)
972    {
973      if (val & 1)
974	{
975	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
976	  result += powi_lookup_cost (digit, cache)
977		    + POWI_WINDOW_SIZE + 1;
978	  val >>= POWI_WINDOW_SIZE;
979	}
980      else
981	{
982	  val >>= 1;
983	  result++;
984	}
985    }
986
987  return result + powi_lookup_cost (val, cache);
988}
989
990/* Recursive subroutine of powi_as_mults.  This function takes the
991   array, CACHE, of already calculated exponents and an exponent N and
992   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
993
994static tree
995powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
996		 HOST_WIDE_INT n, tree *cache)
997{
998  tree op0, op1, ssa_target;
999  unsigned HOST_WIDE_INT digit;
1000  gassign *mult_stmt;
1001
1002  if (n < POWI_TABLE_SIZE && cache[n])
1003    return cache[n];
1004
1005  ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1006
1007  if (n < POWI_TABLE_SIZE)
1008    {
1009      cache[n] = ssa_target;
1010      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1011      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1012    }
1013  else if (n & 1)
1014    {
1015      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1016      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1017      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1018    }
1019  else
1020    {
1021      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1022      op1 = op0;
1023    }
1024
1025  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1026  gimple_set_location (mult_stmt, loc);
1027  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1028
1029  return ssa_target;
1030}
1031
1032/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1033   This function needs to be kept in sync with powi_cost above.  */
1034
1035static tree
1036powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1037	       tree arg0, HOST_WIDE_INT n)
1038{
1039  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1040  gassign *div_stmt;
1041  tree target;
1042
1043  if (n == 0)
1044    return build_real (type, dconst1);
1045
1046  memset (cache, 0,  sizeof (cache));
1047  cache[1] = arg0;
1048
1049  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1050  if (n >= 0)
1051    return result;
1052
1053  /* If the original exponent was negative, reciprocate the result.  */
1054  target = make_temp_ssa_name (type, NULL, "powmult");
1055  div_stmt = gimple_build_assign (target, RDIV_EXPR,
1056				  build_real (type, dconst1), result);
1057  gimple_set_location (div_stmt, loc);
1058  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1059
1060  return target;
1061}
1062
1063/* ARG0 and N are the two arguments to a powi builtin in GSI with
1064   location info LOC.  If the arguments are appropriate, create an
1065   equivalent sequence of statements prior to GSI using an optimal
1066   number of multiplications, and return an expession holding the
1067   result.  */
1068
1069static tree
1070gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1071			    tree arg0, HOST_WIDE_INT n)
1072{
1073  /* Avoid largest negative number.  */
1074  if (n != -n
1075      && ((n >= -1 && n <= 2)
1076	  || (optimize_function_for_speed_p (cfun)
1077	      && powi_cost (n) <= POWI_MAX_MULTS)))
1078    return powi_as_mults (gsi, loc, arg0, n);
1079
1080  return NULL_TREE;
1081}
1082
1083/* Build a gimple call statement that calls FN with argument ARG.
1084   Set the lhs of the call statement to a fresh SSA name.  Insert the
1085   statement prior to GSI's current position, and return the fresh
1086   SSA name.  */
1087
1088static tree
1089build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1090		       tree fn, tree arg)
1091{
1092  gcall *call_stmt;
1093  tree ssa_target;
1094
1095  call_stmt = gimple_build_call (fn, 1, arg);
1096  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1097  gimple_set_lhs (call_stmt, ssa_target);
1098  gimple_set_location (call_stmt, loc);
1099  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1100
1101  return ssa_target;
1102}
1103
1104/* Build a gimple binary operation with the given CODE and arguments
1105   ARG0, ARG1, assigning the result to a new SSA name for variable
1106   TARGET.  Insert the statement prior to GSI's current position, and
1107   return the fresh SSA name.*/
1108
1109static tree
1110build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1111			const char *name, enum tree_code code,
1112			tree arg0, tree arg1)
1113{
1114  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1115  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1116  gimple_set_location (stmt, loc);
1117  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1118  return result;
1119}
1120
1121/* Build a gimple reference operation with the given CODE and argument
1122   ARG, assigning the result to a new SSA name of TYPE with NAME.
1123   Insert the statement prior to GSI's current position, and return
1124   the fresh SSA name.  */
1125
1126static inline tree
1127build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1128		      const char *name, enum tree_code code, tree arg0)
1129{
1130  tree result = make_temp_ssa_name (type, NULL, name);
1131  gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1132  gimple_set_location (stmt, loc);
1133  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1134  return result;
1135}
1136
1137/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1138   prior to GSI's current position, and return the fresh SSA name.  */
1139
1140static tree
1141build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1142		       tree type, tree val)
1143{
1144  tree result = make_ssa_name (type);
1145  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1146  gimple_set_location (stmt, loc);
1147  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1148  return result;
1149}
1150
1151/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1152   with location info LOC.  If possible, create an equivalent and
1153   less expensive sequence of statements prior to GSI, and return an
1154   expession holding the result.  */
1155
1156static tree
1157gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1158			   tree arg0, tree arg1)
1159{
1160  REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1161  REAL_VALUE_TYPE c2, dconst3;
1162  HOST_WIDE_INT n;
1163  tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1164  machine_mode mode;
1165  bool hw_sqrt_exists, c_is_int, c2_is_int;
1166
1167  /* If the exponent isn't a constant, there's nothing of interest
1168     to be done.  */
1169  if (TREE_CODE (arg1) != REAL_CST)
1170    return NULL_TREE;
1171
1172  /* If the exponent is equivalent to an integer, expand to an optimal
1173     multiplication sequence when profitable.  */
1174  c = TREE_REAL_CST (arg1);
1175  n = real_to_integer (&c);
1176  real_from_integer (&cint, VOIDmode, n, SIGNED);
1177  c_is_int = real_identical (&c, &cint);
1178
1179  if (c_is_int
1180      && ((n >= -1 && n <= 2)
1181	  || (flag_unsafe_math_optimizations
1182	      && optimize_bb_for_speed_p (gsi_bb (*gsi))
1183	      && powi_cost (n) <= POWI_MAX_MULTS)))
1184    return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1185
1186  /* Attempt various optimizations using sqrt and cbrt.  */
1187  type = TREE_TYPE (arg0);
1188  mode = TYPE_MODE (type);
1189  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1190
1191  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1192     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1193     sqrt(-0) = -0.  */
1194  if (sqrtfn
1195      && REAL_VALUES_EQUAL (c, dconsthalf)
1196      && !HONOR_SIGNED_ZEROS (mode))
1197    return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1198
1199  /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1200     a builtin sqrt instruction is smaller than a call to pow with 0.25,
1201     so do this optimization even if -Os.  Don't do this optimization
1202     if we don't have a hardware sqrt insn.  */
1203  dconst1_4 = dconst1;
1204  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1205  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1206
1207  if (flag_unsafe_math_optimizations
1208      && sqrtfn
1209      && REAL_VALUES_EQUAL (c, dconst1_4)
1210      && hw_sqrt_exists)
1211    {
1212      /* sqrt(x)  */
1213      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1214
1215      /* sqrt(sqrt(x))  */
1216      return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1217    }
1218
1219  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1220     optimizing for space.  Don't do this optimization if we don't have
1221     a hardware sqrt insn.  */
1222  real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1223  SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1224
1225  if (flag_unsafe_math_optimizations
1226      && sqrtfn
1227      && optimize_function_for_speed_p (cfun)
1228      && REAL_VALUES_EQUAL (c, dconst3_4)
1229      && hw_sqrt_exists)
1230    {
1231      /* sqrt(x)  */
1232      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1233
1234      /* sqrt(sqrt(x))  */
1235      sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1236
1237      /* sqrt(x) * sqrt(sqrt(x))  */
1238      return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1239				     sqrt_arg0, sqrt_sqrt);
1240    }
1241
1242  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1243     optimizations since 1./3. is not exactly representable.  If x
1244     is negative and finite, the correct value of pow(x,1./3.) is
1245     a NaN with the "invalid" exception raised, because the value
1246     of 1./3. actually has an even denominator.  The correct value
1247     of cbrt(x) is a negative real value.  */
1248  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1249  dconst1_3 = real_value_truncate (mode, dconst_third ());
1250
1251  if (flag_unsafe_math_optimizations
1252      && cbrtfn
1253      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254      && REAL_VALUES_EQUAL (c, dconst1_3))
1255    return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1256
1257  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1258     if we don't have a hardware sqrt insn.  */
1259  dconst1_6 = dconst1_3;
1260  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1261
1262  if (flag_unsafe_math_optimizations
1263      && sqrtfn
1264      && cbrtfn
1265      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1266      && optimize_function_for_speed_p (cfun)
1267      && hw_sqrt_exists
1268      && REAL_VALUES_EQUAL (c, dconst1_6))
1269    {
1270      /* sqrt(x)  */
1271      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1272
1273      /* cbrt(sqrt(x))  */
1274      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1275    }
1276
1277  /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1278     and c not an integer, into
1279
1280       sqrt(x) * powi(x, n/2),                n > 0;
1281       1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1282
1283     Do not calculate the powi factor when n/2 = 0.  */
1284  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1285  n = real_to_integer (&c2);
1286  real_from_integer (&cint, VOIDmode, n, SIGNED);
1287  c2_is_int = real_identical (&c2, &cint);
1288
1289  if (flag_unsafe_math_optimizations
1290      && sqrtfn
1291      && c2_is_int
1292      && !c_is_int
1293      && optimize_function_for_speed_p (cfun))
1294    {
1295      tree powi_x_ndiv2 = NULL_TREE;
1296
1297      /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1298         possible or profitable, give up.  Skip the degenerate case when
1299         n is 1 or -1, where the result is always 1.  */
1300      if (absu_hwi (n) != 1)
1301	{
1302	  powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1303						     abs_hwi (n / 2));
1304	  if (!powi_x_ndiv2)
1305	    return NULL_TREE;
1306	}
1307
1308      /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1309	 result of the optimal multiply sequence just calculated.  */
1310      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1311
1312      if (absu_hwi (n) == 1)
1313	result = sqrt_arg0;
1314      else
1315	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1316					 sqrt_arg0, powi_x_ndiv2);
1317
1318      /* If n is negative, reciprocate the result.  */
1319      if (n < 0)
1320	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1321					 build_real (type, dconst1), result);
1322      return result;
1323    }
1324
1325  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1326
1327     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1328     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1329
1330     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1331     different from pow(x, 1./3.) due to rounding and behavior with
1332     negative x, we need to constrain this transformation to unsafe
1333     math and positive x or finite math.  */
1334  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1335  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1336  real_round (&c2, mode, &c2);
1337  n = real_to_integer (&c2);
1338  real_from_integer (&cint, VOIDmode, n, SIGNED);
1339  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1340  real_convert (&c2, mode, &c2);
1341
1342  if (flag_unsafe_math_optimizations
1343      && cbrtfn
1344      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1345      && real_identical (&c2, &c)
1346      && !c2_is_int
1347      && optimize_function_for_speed_p (cfun)
1348      && powi_cost (n / 3) <= POWI_MAX_MULTS)
1349    {
1350      tree powi_x_ndiv3 = NULL_TREE;
1351
1352      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1353         possible or profitable, give up.  Skip the degenerate case when
1354         abs(n) < 3, where the result is always 1.  */
1355      if (absu_hwi (n) >= 3)
1356	{
1357	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1358						     abs_hwi (n / 3));
1359	  if (!powi_x_ndiv3)
1360	    return NULL_TREE;
1361	}
1362
1363      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1364         as that creates an unnecessary variable.  Instead, just produce
1365         either cbrt(x) or cbrt(x) * cbrt(x).  */
1366      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1367
1368      if (absu_hwi (n) % 3 == 1)
1369	powi_cbrt_x = cbrt_x;
1370      else
1371	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1372					      cbrt_x, cbrt_x);
1373
1374      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1375      if (absu_hwi (n) < 3)
1376	result = powi_cbrt_x;
1377      else
1378	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1379					 powi_x_ndiv3, powi_cbrt_x);
1380
1381      /* If n is negative, reciprocate the result.  */
1382      if (n < 0)
1383	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1384					 build_real (type, dconst1), result);
1385
1386      return result;
1387    }
1388
1389  /* No optimizations succeeded.  */
1390  return NULL_TREE;
1391}
1392
1393/* ARG is the argument to a cabs builtin call in GSI with location info
1394   LOC.  Create a sequence of statements prior to GSI that calculates
1395   sqrt(R*R + I*I), where R and I are the real and imaginary components
1396   of ARG, respectively.  Return an expression holding the result.  */
1397
1398static tree
1399gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1400{
1401  tree real_part, imag_part, addend1, addend2, sum, result;
1402  tree type = TREE_TYPE (TREE_TYPE (arg));
1403  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1404  machine_mode mode = TYPE_MODE (type);
1405
1406  if (!flag_unsafe_math_optimizations
1407      || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1408      || !sqrtfn
1409      || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1410    return NULL_TREE;
1411
1412  real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1413				    REALPART_EXPR, arg);
1414  addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1415				    real_part, real_part);
1416  imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1417				    IMAGPART_EXPR, arg);
1418  addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1419				    imag_part, imag_part);
1420  sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1421  result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1422
1423  return result;
1424}
1425
1426/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1427   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1428   an optimal number of multiplies, when n is a constant.  */
1429
1430namespace {
1431
1432const pass_data pass_data_cse_sincos =
1433{
1434  GIMPLE_PASS, /* type */
1435  "sincos", /* name */
1436  OPTGROUP_NONE, /* optinfo_flags */
1437  TV_NONE, /* tv_id */
1438  PROP_ssa, /* properties_required */
1439  0, /* properties_provided */
1440  0, /* properties_destroyed */
1441  0, /* todo_flags_start */
1442  TODO_update_ssa, /* todo_flags_finish */
1443};
1444
1445class pass_cse_sincos : public gimple_opt_pass
1446{
1447public:
1448  pass_cse_sincos (gcc::context *ctxt)
1449    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1450  {}
1451
1452  /* opt_pass methods: */
1453  virtual bool gate (function *)
1454    {
1455      /* We no longer require either sincos or cexp, since powi expansion
1456	 piggybacks on this pass.  */
1457      return optimize;
1458    }
1459
1460  virtual unsigned int execute (function *);
1461
1462}; // class pass_cse_sincos
1463
1464unsigned int
1465pass_cse_sincos::execute (function *fun)
1466{
1467  basic_block bb;
1468  bool cfg_changed = false;
1469
1470  calculate_dominance_info (CDI_DOMINATORS);
1471  memset (&sincos_stats, 0, sizeof (sincos_stats));
1472
1473  FOR_EACH_BB_FN (bb, fun)
1474    {
1475      gimple_stmt_iterator gsi;
1476      bool cleanup_eh = false;
1477
1478      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479        {
1480	  gimple stmt = gsi_stmt (gsi);
1481	  tree fndecl;
1482
1483	  /* Only the last stmt in a bb could throw, no need to call
1484	     gimple_purge_dead_eh_edges if we change something in the middle
1485	     of a basic block.  */
1486	  cleanup_eh = false;
1487
1488	  if (is_gimple_call (stmt)
1489	      && gimple_call_lhs (stmt)
1490	      && (fndecl = gimple_call_fndecl (stmt))
1491	      && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
1492	    {
1493	      tree arg, arg0, arg1, result;
1494	      HOST_WIDE_INT n;
1495	      location_t loc;
1496
1497	      switch (DECL_FUNCTION_CODE (fndecl))
1498		{
1499		CASE_FLT_FN (BUILT_IN_COS):
1500		CASE_FLT_FN (BUILT_IN_SIN):
1501		CASE_FLT_FN (BUILT_IN_CEXPI):
1502		  /* Make sure we have either sincos or cexp.  */
1503		  if (!targetm.libc_has_function (function_c99_math_complex)
1504		      && !targetm.libc_has_function (function_sincos))
1505		    break;
1506
1507		  arg = gimple_call_arg (stmt, 0);
1508		  if (TREE_CODE (arg) == SSA_NAME)
1509		    cfg_changed |= execute_cse_sincos_1 (arg);
1510		  break;
1511
1512		CASE_FLT_FN (BUILT_IN_POW):
1513		  arg0 = gimple_call_arg (stmt, 0);
1514		  arg1 = gimple_call_arg (stmt, 1);
1515
1516		  loc = gimple_location (stmt);
1517		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1518
1519		  if (result)
1520		    {
1521		      tree lhs = gimple_get_lhs (stmt);
1522		      gassign *new_stmt = gimple_build_assign (lhs, result);
1523		      gimple_set_location (new_stmt, loc);
1524		      unlink_stmt_vdef (stmt);
1525		      gsi_replace (&gsi, new_stmt, true);
1526		      cleanup_eh = true;
1527		      if (gimple_vdef (stmt))
1528			release_ssa_name (gimple_vdef (stmt));
1529		    }
1530		  break;
1531
1532		CASE_FLT_FN (BUILT_IN_POWI):
1533		  arg0 = gimple_call_arg (stmt, 0);
1534		  arg1 = gimple_call_arg (stmt, 1);
1535		  loc = gimple_location (stmt);
1536
1537		  if (real_minus_onep (arg0))
1538		    {
1539                      tree t0, t1, cond, one, minus_one;
1540		      gassign *stmt;
1541
1542		      t0 = TREE_TYPE (arg0);
1543		      t1 = TREE_TYPE (arg1);
1544		      one = build_real (t0, dconst1);
1545		      minus_one = build_real (t0, dconstm1);
1546
1547		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1548		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1549						  arg1, build_int_cst (t1, 1));
1550		      gimple_set_location (stmt, loc);
1551		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1552
1553		      result = make_temp_ssa_name (t0, NULL, "powi");
1554		      stmt = gimple_build_assign (result, COND_EXPR, cond,
1555						  minus_one, one);
1556		      gimple_set_location (stmt, loc);
1557		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1558		    }
1559		  else
1560		    {
1561		      if (!tree_fits_shwi_p (arg1))
1562			break;
1563
1564		      n = tree_to_shwi (arg1);
1565		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1566		    }
1567
1568		  if (result)
1569		    {
1570		      tree lhs = gimple_get_lhs (stmt);
1571		      gassign *new_stmt = gimple_build_assign (lhs, result);
1572		      gimple_set_location (new_stmt, loc);
1573		      unlink_stmt_vdef (stmt);
1574		      gsi_replace (&gsi, new_stmt, true);
1575		      cleanup_eh = true;
1576		      if (gimple_vdef (stmt))
1577			release_ssa_name (gimple_vdef (stmt));
1578		    }
1579		  break;
1580
1581		CASE_FLT_FN (BUILT_IN_CABS):
1582		  arg0 = gimple_call_arg (stmt, 0);
1583		  loc = gimple_location (stmt);
1584		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1585
1586		  if (result)
1587		    {
1588		      tree lhs = gimple_get_lhs (stmt);
1589		      gassign *new_stmt = gimple_build_assign (lhs, result);
1590		      gimple_set_location (new_stmt, loc);
1591		      unlink_stmt_vdef (stmt);
1592		      gsi_replace (&gsi, new_stmt, true);
1593		      cleanup_eh = true;
1594		      if (gimple_vdef (stmt))
1595			release_ssa_name (gimple_vdef (stmt));
1596		    }
1597		  break;
1598
1599		default:;
1600		}
1601	    }
1602	}
1603      if (cleanup_eh)
1604	cfg_changed |= gimple_purge_dead_eh_edges (bb);
1605    }
1606
1607  statistics_counter_event (fun, "sincos statements inserted",
1608			    sincos_stats.inserted);
1609
1610  free_dominance_info (CDI_DOMINATORS);
1611  return cfg_changed ? TODO_cleanup_cfg : 0;
1612}
1613
1614} // anon namespace
1615
1616gimple_opt_pass *
1617make_pass_cse_sincos (gcc::context *ctxt)
1618{
1619  return new pass_cse_sincos (ctxt);
1620}
1621
1622/* A symbolic number is used to detect byte permutation and selection
1623   patterns.  Therefore the field N contains an artificial number
1624   consisting of octet sized markers:
1625
1626   0    - target byte has the value 0
1627   FF   - target byte has an unknown value (eg. due to sign extension)
1628   1..size - marker value is the target byte index minus one.
1629
1630   To detect permutations on memory sources (arrays and structures), a symbolic
1631   number is also associated a base address (the array or structure the load is
1632   made from), an offset from the base address and a range which gives the
1633   difference between the highest and lowest accessed memory location to make
1634   such a symbolic number. The range is thus different from size which reflects
1635   the size of the type of current expression. Note that for non memory source,
1636   range holds the same value as size.
1637
1638   For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1639   a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1640   still have a size of 2 but this time a range of 1.  */
1641
1642struct symbolic_number {
1643  uint64_t n;
1644  tree type;
1645  tree base_addr;
1646  tree offset;
1647  HOST_WIDE_INT bytepos;
1648  tree alias_set;
1649  tree vuse;
1650  unsigned HOST_WIDE_INT range;
1651};
1652
1653#define BITS_PER_MARKER 8
1654#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1655#define MARKER_BYTE_UNKNOWN MARKER_MASK
1656#define HEAD_MARKER(n, size) \
1657  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1658
1659/* The number which the find_bswap_or_nop_1 result should match in
1660   order to have a nop.  The number is masked according to the size of
1661   the symbolic number before using it.  */
1662#define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1663  (uint64_t)0x08070605 << 32 | 0x04030201)
1664
1665/* The number which the find_bswap_or_nop_1 result should match in
1666   order to have a byte swap.  The number is masked according to the
1667   size of the symbolic number before using it.  */
1668#define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1669  (uint64_t)0x01020304 << 32 | 0x05060708)
1670
1671/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1672   number N.  Return false if the requested operation is not permitted
1673   on a symbolic number.  */
1674
1675static inline bool
1676do_shift_rotate (enum tree_code code,
1677		 struct symbolic_number *n,
1678		 int count)
1679{
1680  int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1681  unsigned head_marker;
1682
1683  if (count % BITS_PER_UNIT != 0)
1684    return false;
1685  count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1686
1687  /* Zero out the extra bits of N in order to avoid them being shifted
1688     into the significant bits.  */
1689  if (size < 64 / BITS_PER_MARKER)
1690    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1691
1692  switch (code)
1693    {
1694    case LSHIFT_EXPR:
1695      n->n <<= count;
1696      break;
1697    case RSHIFT_EXPR:
1698      head_marker = HEAD_MARKER (n->n, size);
1699      n->n >>= count;
1700      /* Arithmetic shift of signed type: result is dependent on the value.  */
1701      if (!TYPE_UNSIGNED (n->type) && head_marker)
1702	for (i = 0; i < count / BITS_PER_MARKER; i++)
1703	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1704		  << ((size - 1 - i) * BITS_PER_MARKER);
1705      break;
1706    case LROTATE_EXPR:
1707      n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1708      break;
1709    case RROTATE_EXPR:
1710      n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1711      break;
1712    default:
1713      return false;
1714    }
1715  /* Zero unused bits for size.  */
1716  if (size < 64 / BITS_PER_MARKER)
1717    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1718  return true;
1719}
1720
1721/* Perform sanity checking for the symbolic number N and the gimple
1722   statement STMT.  */
1723
1724static inline bool
1725verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1726{
1727  tree lhs_type;
1728
1729  lhs_type = gimple_expr_type (stmt);
1730
1731  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1732    return false;
1733
1734  if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1735    return false;
1736
1737  return true;
1738}
1739
1740/* Initialize the symbolic number N for the bswap pass from the base element
1741   SRC manipulated by the bitwise OR expression.  */
1742
1743static bool
1744init_symbolic_number (struct symbolic_number *n, tree src)
1745{
1746  int size;
1747
1748  n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1749
1750  /* Set up the symbolic number N by setting each byte to a value between 1 and
1751     the byte size of rhs1.  The highest order byte is set to n->size and the
1752     lowest order byte to 1.  */
1753  n->type = TREE_TYPE (src);
1754  size = TYPE_PRECISION (n->type);
1755  if (size % BITS_PER_UNIT != 0)
1756    return false;
1757  size /= BITS_PER_UNIT;
1758  if (size > 64 / BITS_PER_MARKER)
1759    return false;
1760  n->range = size;
1761  n->n = CMPNOP;
1762
1763  if (size < 64 / BITS_PER_MARKER)
1764    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1765
1766  return true;
1767}
1768
1769/* Check if STMT might be a byte swap or a nop from a memory source and returns
1770   the answer. If so, REF is that memory source and the base of the memory area
1771   accessed and the offset of the access from that base are recorded in N.  */
1772
1773bool
1774find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1775{
1776  /* Leaf node is an array or component ref. Memorize its base and
1777     offset from base to compare to other such leaf node.  */
1778  HOST_WIDE_INT bitsize, bitpos;
1779  machine_mode mode;
1780  int unsignedp, volatilep;
1781  tree offset, base_addr;
1782
1783  /* Not prepared to handle PDP endian.  */
1784  if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
1785    return false;
1786
1787  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1788    return false;
1789
1790  base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1791				   &unsignedp, &volatilep, false);
1792
1793  if (TREE_CODE (base_addr) == MEM_REF)
1794    {
1795      offset_int bit_offset = 0;
1796      tree off = TREE_OPERAND (base_addr, 1);
1797
1798      if (!integer_zerop (off))
1799	{
1800	  offset_int boff, coff = mem_ref_offset (base_addr);
1801	  boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1802	  bit_offset += boff;
1803	}
1804
1805      base_addr = TREE_OPERAND (base_addr, 0);
1806
1807      /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1808      if (wi::neg_p (bit_offset))
1809	{
1810	  offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1811	  offset_int tem = bit_offset.and_not (mask);
1812	  /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1813	     Subtract it to BIT_OFFSET and add it (scaled) to OFFSET.  */
1814	  bit_offset -= tem;
1815	  tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1816	  if (offset)
1817	    offset = size_binop (PLUS_EXPR, offset,
1818				    wide_int_to_tree (sizetype, tem));
1819	  else
1820	    offset = wide_int_to_tree (sizetype, tem);
1821	}
1822
1823      bitpos += bit_offset.to_shwi ();
1824    }
1825
1826  if (bitpos % BITS_PER_UNIT)
1827    return false;
1828  if (bitsize % BITS_PER_UNIT)
1829    return false;
1830
1831  if (!init_symbolic_number (n, ref))
1832    return false;
1833  n->base_addr = base_addr;
1834  n->offset = offset;
1835  n->bytepos = bitpos / BITS_PER_UNIT;
1836  n->alias_set = reference_alias_ptr_type (ref);
1837  n->vuse = gimple_vuse (stmt);
1838  return true;
1839}
1840
1841/* Compute the symbolic number N representing the result of a bitwise OR on 2
1842   symbolic number N1 and N2 whose source statements are respectively
1843   SOURCE_STMT1 and SOURCE_STMT2.  */
1844
1845static gimple
1846perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1847			gimple source_stmt2, struct symbolic_number *n2,
1848			struct symbolic_number *n)
1849{
1850  int i, size;
1851  uint64_t mask;
1852  gimple source_stmt;
1853  struct symbolic_number *n_start;
1854
1855  /* Sources are different, cancel bswap if they are not memory location with
1856     the same base (array, structure, ...).  */
1857  if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1858    {
1859      uint64_t inc;
1860      HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1861      struct symbolic_number *toinc_n_ptr, *n_end;
1862
1863      if (!n1->base_addr || !n2->base_addr
1864	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1865	return NULL;
1866
1867      if (!n1->offset != !n2->offset
1868	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1869	return NULL;
1870
1871      if (n1->bytepos < n2->bytepos)
1872	{
1873	  n_start = n1;
1874	  start_sub = n2->bytepos - n1->bytepos;
1875	  source_stmt = source_stmt1;
1876	}
1877      else
1878	{
1879	  n_start = n2;
1880	  start_sub = n1->bytepos - n2->bytepos;
1881	  source_stmt = source_stmt2;
1882	}
1883
1884      /* Find the highest address at which a load is performed and
1885	 compute related info.  */
1886      end1 = n1->bytepos + (n1->range - 1);
1887      end2 = n2->bytepos + (n2->range - 1);
1888      if (end1 < end2)
1889	{
1890	  end = end2;
1891	  end_sub = end2 - end1;
1892	}
1893      else
1894	{
1895	  end = end1;
1896	  end_sub = end1 - end2;
1897	}
1898      n_end = (end2 > end1) ? n2 : n1;
1899
1900      /* Find symbolic number whose lsb is the most significant.  */
1901      if (BYTES_BIG_ENDIAN)
1902	toinc_n_ptr = (n_end == n1) ? n2 : n1;
1903      else
1904	toinc_n_ptr = (n_start == n1) ? n2 : n1;
1905
1906      n->range = end - n_start->bytepos + 1;
1907
1908      /* Check that the range of memory covered can be represented by
1909	 a symbolic number.  */
1910      if (n->range > 64 / BITS_PER_MARKER)
1911	return NULL;
1912
1913      /* Reinterpret byte marks in symbolic number holding the value of
1914	 bigger weight according to target endianness.  */
1915      inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1916      size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1917      for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1918	{
1919	  unsigned marker
1920	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1921	  if (marker && marker != MARKER_BYTE_UNKNOWN)
1922	    toinc_n_ptr->n += inc;
1923	}
1924    }
1925  else
1926    {
1927      n->range = n1->range;
1928      n_start = n1;
1929      source_stmt = source_stmt1;
1930    }
1931
1932  if (!n1->alias_set
1933      || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1934    n->alias_set = n1->alias_set;
1935  else
1936    n->alias_set = ptr_type_node;
1937  n->vuse = n_start->vuse;
1938  n->base_addr = n_start->base_addr;
1939  n->offset = n_start->offset;
1940  n->bytepos = n_start->bytepos;
1941  n->type = n_start->type;
1942  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1943
1944  for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1945    {
1946      uint64_t masked1, masked2;
1947
1948      masked1 = n1->n & mask;
1949      masked2 = n2->n & mask;
1950      if (masked1 && masked2 && masked1 != masked2)
1951	return NULL;
1952    }
1953  n->n = n1->n | n2->n;
1954
1955  return source_stmt;
1956}
1957
1958/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1959   the operation given by the rhs of STMT on the result.  If the operation
1960   could successfully be executed the function returns a gimple stmt whose
1961   rhs's first tree is the expression of the source operand and NULL
1962   otherwise.  */
1963
1964static gimple
1965find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1966{
1967  enum tree_code code;
1968  tree rhs1, rhs2 = NULL;
1969  gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1970  enum gimple_rhs_class rhs_class;
1971
1972  if (!limit || !is_gimple_assign (stmt))
1973    return NULL;
1974
1975  rhs1 = gimple_assign_rhs1 (stmt);
1976
1977  if (find_bswap_or_nop_load (stmt, rhs1, n))
1978    return stmt;
1979
1980  if (TREE_CODE (rhs1) != SSA_NAME)
1981    return NULL;
1982
1983  code = gimple_assign_rhs_code (stmt);
1984  rhs_class = gimple_assign_rhs_class (stmt);
1985  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1986
1987  if (rhs_class == GIMPLE_BINARY_RHS)
1988    rhs2 = gimple_assign_rhs2 (stmt);
1989
1990  /* Handle unary rhs and binary rhs with integer constants as second
1991     operand.  */
1992
1993  if (rhs_class == GIMPLE_UNARY_RHS
1994      || (rhs_class == GIMPLE_BINARY_RHS
1995	  && TREE_CODE (rhs2) == INTEGER_CST))
1996    {
1997      if (code != BIT_AND_EXPR
1998	  && code != LSHIFT_EXPR
1999	  && code != RSHIFT_EXPR
2000	  && code != LROTATE_EXPR
2001	  && code != RROTATE_EXPR
2002	  && !CONVERT_EXPR_CODE_P (code))
2003	return NULL;
2004
2005      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2006
2007      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2008	 we have to initialize the symbolic number.  */
2009      if (!source_stmt1)
2010	{
2011	  if (gimple_assign_load_p (stmt)
2012	      || !init_symbolic_number (n, rhs1))
2013	    return NULL;
2014	  source_stmt1 = stmt;
2015	}
2016
2017      switch (code)
2018	{
2019	case BIT_AND_EXPR:
2020	  {
2021	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2022	    uint64_t val = int_cst_value (rhs2), mask = 0;
2023	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2024
2025	    /* Only constants masking full bytes are allowed.  */
2026	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2027	      if ((val & tmp) != 0 && (val & tmp) != tmp)
2028		return NULL;
2029	      else if (val & tmp)
2030		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2031
2032	    n->n &= mask;
2033	  }
2034	  break;
2035	case LSHIFT_EXPR:
2036	case RSHIFT_EXPR:
2037	case LROTATE_EXPR:
2038	case RROTATE_EXPR:
2039	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2040	    return NULL;
2041	  break;
2042	CASE_CONVERT:
2043	  {
2044	    int i, type_size, old_type_size;
2045	    tree type;
2046
2047	    type = gimple_expr_type (stmt);
2048	    type_size = TYPE_PRECISION (type);
2049	    if (type_size % BITS_PER_UNIT != 0)
2050	      return NULL;
2051	    type_size /= BITS_PER_UNIT;
2052	    if (type_size > 64 / BITS_PER_MARKER)
2053	      return NULL;
2054
2055	    /* Sign extension: result is dependent on the value.  */
2056	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2057	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2058		&& HEAD_MARKER (n->n, old_type_size))
2059	      for (i = 0; i < type_size - old_type_size; i++)
2060		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2061			<< ((type_size - 1 - i) * BITS_PER_MARKER);
2062
2063	    if (type_size < 64 / BITS_PER_MARKER)
2064	      {
2065		/* If STMT casts to a smaller type mask out the bits not
2066		   belonging to the target type.  */
2067		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2068	      }
2069	    n->type = type;
2070	    if (!n->base_addr)
2071	      n->range = type_size;
2072	  }
2073	  break;
2074	default:
2075	  return NULL;
2076	};
2077      return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2078    }
2079
2080  /* Handle binary rhs.  */
2081
2082  if (rhs_class == GIMPLE_BINARY_RHS)
2083    {
2084      struct symbolic_number n1, n2;
2085      gimple source_stmt, source_stmt2;
2086
2087      if (code != BIT_IOR_EXPR)
2088	return NULL;
2089
2090      if (TREE_CODE (rhs2) != SSA_NAME)
2091	return NULL;
2092
2093      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2094
2095      switch (code)
2096	{
2097	case BIT_IOR_EXPR:
2098	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2099
2100	  if (!source_stmt1)
2101	    return NULL;
2102
2103	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2104
2105	  if (!source_stmt2)
2106	    return NULL;
2107
2108	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2109	    return NULL;
2110
2111	  if (!n1.vuse != !n2.vuse
2112	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2113	    return NULL;
2114
2115	  source_stmt
2116	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2117
2118	  if (!source_stmt)
2119	    return NULL;
2120
2121	  if (!verify_symbolic_number_p (n, stmt))
2122	    return NULL;
2123
2124	  break;
2125	default:
2126	  return NULL;
2127	}
2128      return source_stmt;
2129    }
2130  return NULL;
2131}
2132
2133/* Check if STMT completes a bswap implementation or a read in a given
2134   endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2135   accordingly.  It also sets N to represent the kind of operations
2136   performed: size of the resulting expression and whether it works on
2137   a memory source, and if so alias-set and vuse.  At last, the
2138   function returns a stmt whose rhs's first tree is the source
2139   expression.  */
2140
2141static gimple
2142find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2143{
2144  /* The number which the find_bswap_or_nop_1 result should match in order
2145     to have a full byte swap.  The number is shifted to the right
2146     according to the size of the symbolic number before using it.  */
2147  uint64_t cmpxchg = CMPXCHG;
2148  uint64_t cmpnop = CMPNOP;
2149
2150  gimple source_stmt;
2151  int limit;
2152
2153  /* The last parameter determines the depth search limit.  It usually
2154     correlates directly to the number n of bytes to be touched.  We
2155     increase that number by log2(n) + 1 here in order to also
2156     cover signed -> unsigned conversions of the src operand as can be seen
2157     in libgcc, and for initial shift/and operation of the src operand.  */
2158  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2159  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2160  source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2161
2162  if (!source_stmt)
2163    return NULL;
2164
2165  /* Find real size of result (highest non-zero byte).  */
2166  if (n->base_addr)
2167    {
2168      unsigned HOST_WIDE_INT rsize;
2169      uint64_t tmpn;
2170
2171      for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2172      if (BYTES_BIG_ENDIAN && n->range != rsize)
2173	/* This implies an offset, which is currently not handled by
2174	   bswap_replace.  */
2175	return NULL;
2176      n->range = rsize;
2177    }
2178
2179  /* Zero out the extra bits of N and CMP*.  */
2180  if (n->range < (int) sizeof (int64_t))
2181    {
2182      uint64_t mask;
2183
2184      mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2185      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2186      cmpnop &= mask;
2187    }
2188
2189  /* A complete byte swap should make the symbolic number to start with
2190     the largest digit in the highest order byte. Unchanged symbolic
2191     number indicates a read with same endianness as target architecture.  */
2192  if (n->n == cmpnop)
2193    *bswap = false;
2194  else if (n->n == cmpxchg)
2195    *bswap = true;
2196  else
2197    return NULL;
2198
2199  /* Useless bit manipulation performed by code.  */
2200  if (!n->base_addr && n->n == cmpnop)
2201    return NULL;
2202
2203  n->range *= BITS_PER_UNIT;
2204  return source_stmt;
2205}
2206
2207namespace {
2208
2209const pass_data pass_data_optimize_bswap =
2210{
2211  GIMPLE_PASS, /* type */
2212  "bswap", /* name */
2213  OPTGROUP_NONE, /* optinfo_flags */
2214  TV_NONE, /* tv_id */
2215  PROP_ssa, /* properties_required */
2216  0, /* properties_provided */
2217  0, /* properties_destroyed */
2218  0, /* todo_flags_start */
2219  0, /* todo_flags_finish */
2220};
2221
2222class pass_optimize_bswap : public gimple_opt_pass
2223{
2224public:
2225  pass_optimize_bswap (gcc::context *ctxt)
2226    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2227  {}
2228
2229  /* opt_pass methods: */
2230  virtual bool gate (function *)
2231    {
2232      return flag_expensive_optimizations && optimize;
2233    }
2234
2235  virtual unsigned int execute (function *);
2236
2237}; // class pass_optimize_bswap
2238
2239/* Perform the bswap optimization: replace the expression computed in the rhs
2240   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2241   Which of these alternatives replace the rhs is given by N->base_addr (non
2242   null if a load is needed) and BSWAP.  The type, VUSE and set-alias of the
2243   load to perform are also given in N while the builtin bswap invoke is given
2244   in FNDEL.  Finally, if a load is involved, SRC_STMT refers to one of the
2245   load statements involved to construct the rhs in CUR_STMT and N->range gives
2246   the size of the rhs expression for maintaining some statistics.
2247
2248   Note that if the replacement involve a load, CUR_STMT is moved just after
2249   SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT
2250   changing of basic block.  */
2251
2252static bool
2253bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2254	       tree load_type, struct symbolic_number *n, bool bswap)
2255{
2256  gimple_stmt_iterator gsi;
2257  tree src, tmp, tgt;
2258  gimple bswap_stmt;
2259
2260  gsi = gsi_for_stmt (cur_stmt);
2261  src = gimple_assign_rhs1 (src_stmt);
2262  tgt = gimple_assign_lhs (cur_stmt);
2263
2264  /* Need to load the value from memory first.  */
2265  if (n->base_addr)
2266    {
2267      gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2268      tree addr_expr, addr_tmp, val_expr, val_tmp;
2269      tree load_offset_ptr, aligned_load_type;
2270      gimple addr_stmt, load_stmt;
2271      unsigned align;
2272      HOST_WIDE_INT load_offset = 0;
2273
2274      align = get_object_alignment (src);
2275      /* If the new access is smaller than the original one, we need
2276	 to perform big endian adjustment.  */
2277      if (BYTES_BIG_ENDIAN)
2278	{
2279	  HOST_WIDE_INT bitsize, bitpos;
2280	  machine_mode mode;
2281	  int unsignedp, volatilep;
2282	  tree offset;
2283
2284	  get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2285			       &unsignedp, &volatilep, false);
2286	  if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2287	    {
2288	      load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2289	      unsigned HOST_WIDE_INT l
2290		= (load_offset * BITS_PER_UNIT) & (align - 1);
2291	      if (l)
2292		align = l & -l;
2293	    }
2294	}
2295
2296      if (bswap
2297	  && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2298	  && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2299	return false;
2300
2301      /* Move cur_stmt just before  one of the load of the original
2302	 to ensure it has the same VUSE.  See PR61517 for what could
2303	 go wrong.  */
2304      if (gimple_bb (cur_stmt) != gimple_bb (src_stmt))
2305	reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2306      gsi_move_before (&gsi, &gsi_ins);
2307      gsi = gsi_for_stmt (cur_stmt);
2308
2309      /* Compute address to load from and cast according to the size
2310	 of the load.  */
2311      addr_expr = build_fold_addr_expr (unshare_expr (src));
2312      if (is_gimple_mem_ref_addr (addr_expr))
2313	addr_tmp = addr_expr;
2314      else
2315	{
2316	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2317					 "load_src");
2318	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2319	  gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2320	}
2321
2322      /* Perform the load.  */
2323      aligned_load_type = load_type;
2324      if (align < TYPE_ALIGN (load_type))
2325	aligned_load_type = build_aligned_type (load_type, align);
2326      load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2327      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2328			      load_offset_ptr);
2329
2330      if (!bswap)
2331	{
2332	  if (n->range == 16)
2333	    nop_stats.found_16bit++;
2334	  else if (n->range == 32)
2335	    nop_stats.found_32bit++;
2336	  else
2337	    {
2338	      gcc_assert (n->range == 64);
2339	      nop_stats.found_64bit++;
2340	    }
2341
2342	  /* Convert the result of load if necessary.  */
2343	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2344	    {
2345	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2346					    "load_dst");
2347	      load_stmt = gimple_build_assign (val_tmp, val_expr);
2348	      gimple_set_vuse (load_stmt, n->vuse);
2349	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2350	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2351	    }
2352	  else
2353	    {
2354	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2355	      gimple_set_vuse (cur_stmt, n->vuse);
2356	    }
2357	  update_stmt (cur_stmt);
2358
2359	  if (dump_file)
2360	    {
2361	      fprintf (dump_file,
2362		       "%d bit load in target endianness found at: ",
2363		       (int) n->range);
2364	      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2365	    }
2366	  return true;
2367	}
2368      else
2369	{
2370	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2371	  load_stmt = gimple_build_assign (val_tmp, val_expr);
2372	  gimple_set_vuse (load_stmt, n->vuse);
2373	  gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2374	}
2375      src = val_tmp;
2376    }
2377
2378  if (n->range == 16)
2379    bswap_stats.found_16bit++;
2380  else if (n->range == 32)
2381    bswap_stats.found_32bit++;
2382  else
2383    {
2384      gcc_assert (n->range == 64);
2385      bswap_stats.found_64bit++;
2386    }
2387
2388  tmp = src;
2389
2390  /* Convert the src expression if necessary.  */
2391  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2392    {
2393      gimple convert_stmt;
2394
2395      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2396      convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2397      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2398    }
2399
2400  /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
2401     are considered as rotation of 2N bit values by N bits is generally not
2402     equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
2403     gives 0x03040102 while a bswap for that value is 0x04030201.  */
2404  if (bswap && n->range == 16)
2405    {
2406      tree count = build_int_cst (NULL, BITS_PER_UNIT);
2407      src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2408      bswap_stmt = gimple_build_assign (NULL, src);
2409    }
2410  else
2411    bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2412
2413  tmp = tgt;
2414
2415  /* Convert the result if necessary.  */
2416  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2417    {
2418      gimple convert_stmt;
2419
2420      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2421      convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2422      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2423    }
2424
2425  gimple_set_lhs (bswap_stmt, tmp);
2426
2427  if (dump_file)
2428    {
2429      fprintf (dump_file, "%d bit bswap implementation found at: ",
2430	       (int) n->range);
2431      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2432    }
2433
2434  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2435  gsi_remove (&gsi, true);
2436  return true;
2437}
2438
2439/* Find manual byte swap implementations as well as load in a given
2440   endianness. Byte swaps are turned into a bswap builtin invokation
2441   while endian loads are converted to bswap builtin invokation or
2442   simple load according to the target endianness.  */
2443
2444unsigned int
2445pass_optimize_bswap::execute (function *fun)
2446{
2447  basic_block bb;
2448  bool bswap32_p, bswap64_p;
2449  bool changed = false;
2450  tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2451
2452  if (BITS_PER_UNIT != 8)
2453    return 0;
2454
2455  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2456	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2457  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2458	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2459		   || (bswap32_p && word_mode == SImode)));
2460
2461  /* Determine the argument type of the builtins.  The code later on
2462     assumes that the return and argument type are the same.  */
2463  if (bswap32_p)
2464    {
2465      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2466      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2467    }
2468
2469  if (bswap64_p)
2470    {
2471      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2472      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2473    }
2474
2475  memset (&nop_stats, 0, sizeof (nop_stats));
2476  memset (&bswap_stats, 0, sizeof (bswap_stats));
2477
2478  FOR_EACH_BB_FN (bb, fun)
2479    {
2480      gimple_stmt_iterator gsi;
2481
2482      /* We do a reverse scan for bswap patterns to make sure we get the
2483	 widest match. As bswap pattern matching doesn't handle previously
2484	 inserted smaller bswap replacements as sub-patterns, the wider
2485	 variant wouldn't be detected.  */
2486      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2487        {
2488	  gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2489	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2490	  enum tree_code code;
2491	  struct symbolic_number n;
2492	  bool bswap;
2493
2494	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2495	     might be moved to a different basic block by bswap_replace and gsi
2496	     must not points to it if that's the case.  Moving the gsi_prev
2497	     there make sure that gsi points to the statement previous to
2498	     cur_stmt while still making sure that all statements are
2499	     considered in this basic block.  */
2500	  gsi_prev (&gsi);
2501
2502	  if (!is_gimple_assign (cur_stmt))
2503	    continue;
2504
2505	  code = gimple_assign_rhs_code (cur_stmt);
2506	  switch (code)
2507	    {
2508	    case LROTATE_EXPR:
2509	    case RROTATE_EXPR:
2510	      if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2511		  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2512		     % BITS_PER_UNIT)
2513		continue;
2514	      /* Fall through.  */
2515	    case BIT_IOR_EXPR:
2516	      break;
2517	    default:
2518	      continue;
2519	    }
2520
2521	  src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2522
2523	  if (!src_stmt)
2524	    continue;
2525
2526	  switch (n.range)
2527	    {
2528	    case 16:
2529	      /* Already in canonical form, nothing to do.  */
2530	      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2531		continue;
2532	      load_type = bswap_type = uint16_type_node;
2533	      break;
2534	    case 32:
2535	      load_type = uint32_type_node;
2536	      if (bswap32_p)
2537		{
2538		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2539		  bswap_type = bswap32_type;
2540		}
2541	      break;
2542	    case 64:
2543	      load_type = uint64_type_node;
2544	      if (bswap64_p)
2545		{
2546		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2547		  bswap_type = bswap64_type;
2548		}
2549	      break;
2550	    default:
2551	      continue;
2552	    }
2553
2554	  if (bswap && !fndecl && n.range != 16)
2555	    continue;
2556
2557	  if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2558			     &n, bswap))
2559	    changed = true;
2560	}
2561    }
2562
2563  statistics_counter_event (fun, "16-bit nop implementations found",
2564			    nop_stats.found_16bit);
2565  statistics_counter_event (fun, "32-bit nop implementations found",
2566			    nop_stats.found_32bit);
2567  statistics_counter_event (fun, "64-bit nop implementations found",
2568			    nop_stats.found_64bit);
2569  statistics_counter_event (fun, "16-bit bswap implementations found",
2570			    bswap_stats.found_16bit);
2571  statistics_counter_event (fun, "32-bit bswap implementations found",
2572			    bswap_stats.found_32bit);
2573  statistics_counter_event (fun, "64-bit bswap implementations found",
2574			    bswap_stats.found_64bit);
2575
2576  return (changed ? TODO_update_ssa : 0);
2577}
2578
2579} // anon namespace
2580
2581gimple_opt_pass *
2582make_pass_optimize_bswap (gcc::context *ctxt)
2583{
2584  return new pass_optimize_bswap (ctxt);
2585}
2586
2587/* Return true if stmt is a type conversion operation that can be stripped
2588   when used in a widening multiply operation.  */
2589static bool
2590widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2591{
2592  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2593
2594  if (TREE_CODE (result_type) == INTEGER_TYPE)
2595    {
2596      tree op_type;
2597      tree inner_op_type;
2598
2599      if (!CONVERT_EXPR_CODE_P (rhs_code))
2600	return false;
2601
2602      op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2603
2604      /* If the type of OP has the same precision as the result, then
2605	 we can strip this conversion.  The multiply operation will be
2606	 selected to create the correct extension as a by-product.  */
2607      if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2608	return true;
2609
2610      /* We can also strip a conversion if it preserves the signed-ness of
2611	 the operation and doesn't narrow the range.  */
2612      inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2613
2614      /* If the inner-most type is unsigned, then we can strip any
2615	 intermediate widening operation.  If it's signed, then the
2616	 intermediate widening operation must also be signed.  */
2617      if ((TYPE_UNSIGNED (inner_op_type)
2618	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2619	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2620	return true;
2621
2622      return false;
2623    }
2624
2625  return rhs_code == FIXED_CONVERT_EXPR;
2626}
2627
2628/* Return true if RHS is a suitable operand for a widening multiplication,
2629   assuming a target type of TYPE.
2630   There are two cases:
2631
2632     - RHS makes some value at least twice as wide.  Store that value
2633       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2634
2635     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2636       but leave *TYPE_OUT untouched.  */
2637
2638static bool
2639is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2640			tree *new_rhs_out)
2641{
2642  gimple stmt;
2643  tree type1, rhs1;
2644
2645  if (TREE_CODE (rhs) == SSA_NAME)
2646    {
2647      stmt = SSA_NAME_DEF_STMT (rhs);
2648      if (is_gimple_assign (stmt))
2649	{
2650	  if (! widening_mult_conversion_strippable_p (type, stmt))
2651	    rhs1 = rhs;
2652	  else
2653	    {
2654	      rhs1 = gimple_assign_rhs1 (stmt);
2655
2656	      if (TREE_CODE (rhs1) == INTEGER_CST)
2657		{
2658		  *new_rhs_out = rhs1;
2659		  *type_out = NULL;
2660		  return true;
2661		}
2662	    }
2663	}
2664      else
2665	rhs1 = rhs;
2666
2667      type1 = TREE_TYPE (rhs1);
2668
2669      if (TREE_CODE (type1) != TREE_CODE (type)
2670	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2671	return false;
2672
2673      *new_rhs_out = rhs1;
2674      *type_out = type1;
2675      return true;
2676    }
2677
2678  if (TREE_CODE (rhs) == INTEGER_CST)
2679    {
2680      *new_rhs_out = rhs;
2681      *type_out = NULL;
2682      return true;
2683    }
2684
2685  return false;
2686}
2687
2688/* Return true if STMT performs a widening multiplication, assuming the
2689   output type is TYPE.  If so, store the unwidened types of the operands
2690   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2691   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2692   and *TYPE2_OUT would give the operands of the multiplication.  */
2693
2694static bool
2695is_widening_mult_p (gimple stmt,
2696		    tree *type1_out, tree *rhs1_out,
2697		    tree *type2_out, tree *rhs2_out)
2698{
2699  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2700
2701  if (TREE_CODE (type) != INTEGER_TYPE
2702      && TREE_CODE (type) != FIXED_POINT_TYPE)
2703    return false;
2704
2705  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2706			       rhs1_out))
2707    return false;
2708
2709  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2710			       rhs2_out))
2711    return false;
2712
2713  if (*type1_out == NULL)
2714    {
2715      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2716	return false;
2717      *type1_out = *type2_out;
2718    }
2719
2720  if (*type2_out == NULL)
2721    {
2722      if (!int_fits_type_p (*rhs2_out, *type1_out))
2723	return false;
2724      *type2_out = *type1_out;
2725    }
2726
2727  /* Ensure that the larger of the two operands comes first. */
2728  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2729    {
2730      tree tmp;
2731      tmp = *type1_out;
2732      *type1_out = *type2_out;
2733      *type2_out = tmp;
2734      tmp = *rhs1_out;
2735      *rhs1_out = *rhs2_out;
2736      *rhs2_out = tmp;
2737    }
2738
2739  return true;
2740}
2741
2742/* Process a single gimple statement STMT, which has a MULT_EXPR as
2743   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2744   value is true iff we converted the statement.  */
2745
2746static bool
2747convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2748{
2749  tree lhs, rhs1, rhs2, type, type1, type2;
2750  enum insn_code handler;
2751  machine_mode to_mode, from_mode, actual_mode;
2752  optab op;
2753  int actual_precision;
2754  location_t loc = gimple_location (stmt);
2755  bool from_unsigned1, from_unsigned2;
2756
2757  lhs = gimple_assign_lhs (stmt);
2758  type = TREE_TYPE (lhs);
2759  if (TREE_CODE (type) != INTEGER_TYPE)
2760    return false;
2761
2762  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2763    return false;
2764
2765  to_mode = TYPE_MODE (type);
2766  from_mode = TYPE_MODE (type1);
2767  from_unsigned1 = TYPE_UNSIGNED (type1);
2768  from_unsigned2 = TYPE_UNSIGNED (type2);
2769
2770  if (from_unsigned1 && from_unsigned2)
2771    op = umul_widen_optab;
2772  else if (!from_unsigned1 && !from_unsigned2)
2773    op = smul_widen_optab;
2774  else
2775    op = usmul_widen_optab;
2776
2777  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2778						  0, &actual_mode);
2779
2780  if (handler == CODE_FOR_nothing)
2781    {
2782      if (op != smul_widen_optab)
2783	{
2784	  /* We can use a signed multiply with unsigned types as long as
2785	     there is a wider mode to use, or it is the smaller of the two
2786	     types that is unsigned.  Note that type1 >= type2, always.  */
2787	  if ((TYPE_UNSIGNED (type1)
2788	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2789	      || (TYPE_UNSIGNED (type2)
2790		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2791	    {
2792	      from_mode = GET_MODE_WIDER_MODE (from_mode);
2793	      if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2794		return false;
2795	    }
2796
2797	  op = smul_widen_optab;
2798	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2799							  from_mode, 0,
2800							  &actual_mode);
2801
2802	  if (handler == CODE_FOR_nothing)
2803	    return false;
2804
2805	  from_unsigned1 = from_unsigned2 = false;
2806	}
2807      else
2808	return false;
2809    }
2810
2811  /* Ensure that the inputs to the handler are in the correct precison
2812     for the opcode.  This will be the full mode size.  */
2813  actual_precision = GET_MODE_PRECISION (actual_mode);
2814  if (2 * actual_precision > TYPE_PRECISION (type))
2815    return false;
2816  if (actual_precision != TYPE_PRECISION (type1)
2817      || from_unsigned1 != TYPE_UNSIGNED (type1))
2818    rhs1 = build_and_insert_cast (gsi, loc,
2819				  build_nonstandard_integer_type
2820				    (actual_precision, from_unsigned1), rhs1);
2821  if (actual_precision != TYPE_PRECISION (type2)
2822      || from_unsigned2 != TYPE_UNSIGNED (type2))
2823    rhs2 = build_and_insert_cast (gsi, loc,
2824				  build_nonstandard_integer_type
2825				    (actual_precision, from_unsigned2), rhs2);
2826
2827  /* Handle constants.  */
2828  if (TREE_CODE (rhs1) == INTEGER_CST)
2829    rhs1 = fold_convert (type1, rhs1);
2830  if (TREE_CODE (rhs2) == INTEGER_CST)
2831    rhs2 = fold_convert (type2, rhs2);
2832
2833  gimple_assign_set_rhs1 (stmt, rhs1);
2834  gimple_assign_set_rhs2 (stmt, rhs2);
2835  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2836  update_stmt (stmt);
2837  widen_mul_stats.widen_mults_inserted++;
2838  return true;
2839}
2840
2841/* Process a single gimple statement STMT, which is found at the
2842   iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2843   rhs (given by CODE), and try to convert it into a
2844   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2845   is true iff we converted the statement.  */
2846
2847static bool
2848convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2849			    enum tree_code code)
2850{
2851  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2852  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2853  tree type, type1, type2, optype;
2854  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2855  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2856  optab this_optab;
2857  enum tree_code wmult_code;
2858  enum insn_code handler;
2859  machine_mode to_mode, from_mode, actual_mode;
2860  location_t loc = gimple_location (stmt);
2861  int actual_precision;
2862  bool from_unsigned1, from_unsigned2;
2863
2864  lhs = gimple_assign_lhs (stmt);
2865  type = TREE_TYPE (lhs);
2866  if (TREE_CODE (type) != INTEGER_TYPE
2867      && TREE_CODE (type) != FIXED_POINT_TYPE)
2868    return false;
2869
2870  if (code == MINUS_EXPR)
2871    wmult_code = WIDEN_MULT_MINUS_EXPR;
2872  else
2873    wmult_code = WIDEN_MULT_PLUS_EXPR;
2874
2875  rhs1 = gimple_assign_rhs1 (stmt);
2876  rhs2 = gimple_assign_rhs2 (stmt);
2877
2878  if (TREE_CODE (rhs1) == SSA_NAME)
2879    {
2880      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2881      if (is_gimple_assign (rhs1_stmt))
2882	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2883    }
2884
2885  if (TREE_CODE (rhs2) == SSA_NAME)
2886    {
2887      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2888      if (is_gimple_assign (rhs2_stmt))
2889	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2890    }
2891
2892  /* Allow for one conversion statement between the multiply
2893     and addition/subtraction statement.  If there are more than
2894     one conversions then we assume they would invalidate this
2895     transformation.  If that's not the case then they should have
2896     been folded before now.  */
2897  if (CONVERT_EXPR_CODE_P (rhs1_code))
2898    {
2899      conv1_stmt = rhs1_stmt;
2900      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2901      if (TREE_CODE (rhs1) == SSA_NAME)
2902	{
2903	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2904	  if (is_gimple_assign (rhs1_stmt))
2905	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2906	}
2907      else
2908	return false;
2909    }
2910  if (CONVERT_EXPR_CODE_P (rhs2_code))
2911    {
2912      conv2_stmt = rhs2_stmt;
2913      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2914      if (TREE_CODE (rhs2) == SSA_NAME)
2915	{
2916	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2917	  if (is_gimple_assign (rhs2_stmt))
2918	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2919	}
2920      else
2921	return false;
2922    }
2923
2924  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2925     is_widening_mult_p, but we still need the rhs returns.
2926
2927     It might also appear that it would be sufficient to use the existing
2928     operands of the widening multiply, but that would limit the choice of
2929     multiply-and-accumulate instructions.
2930
2931     If the widened-multiplication result has more than one uses, it is
2932     probably wiser not to do the conversion.  */
2933  if (code == PLUS_EXPR
2934      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2935    {
2936      if (!has_single_use (rhs1)
2937	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2938				  &type2, &mult_rhs2))
2939	return false;
2940      add_rhs = rhs2;
2941      conv_stmt = conv1_stmt;
2942    }
2943  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2944    {
2945      if (!has_single_use (rhs2)
2946	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2947				  &type2, &mult_rhs2))
2948	return false;
2949      add_rhs = rhs1;
2950      conv_stmt = conv2_stmt;
2951    }
2952  else
2953    return false;
2954
2955  to_mode = TYPE_MODE (type);
2956  from_mode = TYPE_MODE (type1);
2957  from_unsigned1 = TYPE_UNSIGNED (type1);
2958  from_unsigned2 = TYPE_UNSIGNED (type2);
2959  optype = type1;
2960
2961  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2962  if (from_unsigned1 != from_unsigned2)
2963    {
2964      if (!INTEGRAL_TYPE_P (type))
2965	return false;
2966      /* We can use a signed multiply with unsigned types as long as
2967	 there is a wider mode to use, or it is the smaller of the two
2968	 types that is unsigned.  Note that type1 >= type2, always.  */
2969      if ((from_unsigned1
2970	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2971	  || (from_unsigned2
2972	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2973	{
2974	  from_mode = GET_MODE_WIDER_MODE (from_mode);
2975	  if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2976	    return false;
2977	}
2978
2979      from_unsigned1 = from_unsigned2 = false;
2980      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2981					       false);
2982    }
2983
2984  /* If there was a conversion between the multiply and addition
2985     then we need to make sure it fits a multiply-and-accumulate.
2986     The should be a single mode change which does not change the
2987     value.  */
2988  if (conv_stmt)
2989    {
2990      /* We use the original, unmodified data types for this.  */
2991      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2992      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2993      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2994      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2995
2996      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2997	{
2998	  /* Conversion is a truncate.  */
2999	  if (TYPE_PRECISION (to_type) < data_size)
3000	    return false;
3001	}
3002      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3003	{
3004	  /* Conversion is an extend.  Check it's the right sort.  */
3005	  if (TYPE_UNSIGNED (from_type) != is_unsigned
3006	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3007	    return false;
3008	}
3009      /* else convert is a no-op for our purposes.  */
3010    }
3011
3012  /* Verify that the machine can perform a widening multiply
3013     accumulate in this mode/signedness combination, otherwise
3014     this transformation is likely to pessimize code.  */
3015  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3016  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3017						  from_mode, 0, &actual_mode);
3018
3019  if (handler == CODE_FOR_nothing)
3020    return false;
3021
3022  /* Ensure that the inputs to the handler are in the correct precison
3023     for the opcode.  This will be the full mode size.  */
3024  actual_precision = GET_MODE_PRECISION (actual_mode);
3025  if (actual_precision != TYPE_PRECISION (type1)
3026      || from_unsigned1 != TYPE_UNSIGNED (type1))
3027    mult_rhs1 = build_and_insert_cast (gsi, loc,
3028				       build_nonstandard_integer_type
3029				         (actual_precision, from_unsigned1),
3030				       mult_rhs1);
3031  if (actual_precision != TYPE_PRECISION (type2)
3032      || from_unsigned2 != TYPE_UNSIGNED (type2))
3033    mult_rhs2 = build_and_insert_cast (gsi, loc,
3034				       build_nonstandard_integer_type
3035					 (actual_precision, from_unsigned2),
3036				       mult_rhs2);
3037
3038  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3039    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3040
3041  /* Handle constants.  */
3042  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3043    mult_rhs1 = fold_convert (type1, mult_rhs1);
3044  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3045    mult_rhs2 = fold_convert (type2, mult_rhs2);
3046
3047  gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3048				  add_rhs);
3049  update_stmt (gsi_stmt (*gsi));
3050  widen_mul_stats.maccs_inserted++;
3051  return true;
3052}
3053
3054/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3055   with uses in additions and subtractions to form fused multiply-add
3056   operations.  Returns true if successful and MUL_STMT should be removed.  */
3057
3058static bool
3059convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3060{
3061  tree mul_result = gimple_get_lhs (mul_stmt);
3062  tree type = TREE_TYPE (mul_result);
3063  gimple use_stmt, neguse_stmt;
3064  gassign *fma_stmt;
3065  use_operand_p use_p;
3066  imm_use_iterator imm_iter;
3067
3068  if (FLOAT_TYPE_P (type)
3069      && flag_fp_contract_mode == FP_CONTRACT_OFF)
3070    return false;
3071
3072  /* We don't want to do bitfield reduction ops.  */
3073  if (INTEGRAL_TYPE_P (type)
3074      && (TYPE_PRECISION (type)
3075	  != GET_MODE_PRECISION (TYPE_MODE (type))))
3076    return false;
3077
3078  /* If the target doesn't support it, don't generate it.  We assume that
3079     if fma isn't available then fms, fnma or fnms are not either.  */
3080  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3081    return false;
3082
3083  /* If the multiplication has zero uses, it is kept around probably because
3084     of -fnon-call-exceptions.  Don't optimize it away in that case,
3085     it is DCE job.  */
3086  if (has_zero_uses (mul_result))
3087    return false;
3088
3089  /* Make sure that the multiplication statement becomes dead after
3090     the transformation, thus that all uses are transformed to FMAs.
3091     This means we assume that an FMA operation has the same cost
3092     as an addition.  */
3093  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3094    {
3095      enum tree_code use_code;
3096      tree result = mul_result;
3097      bool negate_p = false;
3098
3099      use_stmt = USE_STMT (use_p);
3100
3101      if (is_gimple_debug (use_stmt))
3102	continue;
3103
3104      /* For now restrict this operations to single basic blocks.  In theory
3105	 we would want to support sinking the multiplication in
3106	 m = a*b;
3107	 if ()
3108	   ma = m + c;
3109	 else
3110	   d = m;
3111	 to form a fma in the then block and sink the multiplication to the
3112	 else block.  */
3113      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3114	return false;
3115
3116      if (!is_gimple_assign (use_stmt))
3117	return false;
3118
3119      use_code = gimple_assign_rhs_code (use_stmt);
3120
3121      /* A negate on the multiplication leads to FNMA.  */
3122      if (use_code == NEGATE_EXPR)
3123	{
3124	  ssa_op_iter iter;
3125	  use_operand_p usep;
3126
3127	  result = gimple_assign_lhs (use_stmt);
3128
3129	  /* Make sure the negate statement becomes dead with this
3130	     single transformation.  */
3131	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
3132			       &use_p, &neguse_stmt))
3133	    return false;
3134
3135	  /* Make sure the multiplication isn't also used on that stmt.  */
3136	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3137	    if (USE_FROM_PTR (usep) == mul_result)
3138	      return false;
3139
3140	  /* Re-validate.  */
3141	  use_stmt = neguse_stmt;
3142	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3143	    return false;
3144	  if (!is_gimple_assign (use_stmt))
3145	    return false;
3146
3147	  use_code = gimple_assign_rhs_code (use_stmt);
3148	  negate_p = true;
3149	}
3150
3151      switch (use_code)
3152	{
3153	case MINUS_EXPR:
3154	  if (gimple_assign_rhs2 (use_stmt) == result)
3155	    negate_p = !negate_p;
3156	  break;
3157	case PLUS_EXPR:
3158	  break;
3159	default:
3160	  /* FMA can only be formed from PLUS and MINUS.  */
3161	  return false;
3162	}
3163
3164      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3165	 by a MULT_EXPR that we'll visit later, we might be able to
3166	 get a more profitable match with fnma.
3167	 OTOH, if we don't, a negate / fma pair has likely lower latency
3168	 that a mult / subtract pair.  */
3169      if (use_code == MINUS_EXPR && !negate_p
3170	  && gimple_assign_rhs1 (use_stmt) == result
3171	  && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3172	  && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3173	{
3174	  tree rhs2 = gimple_assign_rhs2 (use_stmt);
3175
3176	  if (TREE_CODE (rhs2) == SSA_NAME)
3177	    {
3178	      gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3179	      if (has_single_use (rhs2)
3180		  && is_gimple_assign (stmt2)
3181		  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3182	      return false;
3183	    }
3184	}
3185
3186      /* We can't handle a * b + a * b.  */
3187      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3188	return false;
3189
3190      /* While it is possible to validate whether or not the exact form
3191	 that we've recognized is available in the backend, the assumption
3192	 is that the transformation is never a loss.  For instance, suppose
3193	 the target only has the plain FMA pattern available.  Consider
3194	 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3195	 is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3196	 still have 3 operations, but in the FMA form the two NEGs are
3197	 independent and could be run in parallel.  */
3198    }
3199
3200  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3201    {
3202      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3203      enum tree_code use_code;
3204      tree addop, mulop1 = op1, result = mul_result;
3205      bool negate_p = false;
3206
3207      if (is_gimple_debug (use_stmt))
3208	continue;
3209
3210      use_code = gimple_assign_rhs_code (use_stmt);
3211      if (use_code == NEGATE_EXPR)
3212	{
3213	  result = gimple_assign_lhs (use_stmt);
3214	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3215	  gsi_remove (&gsi, true);
3216	  release_defs (use_stmt);
3217
3218	  use_stmt = neguse_stmt;
3219	  gsi = gsi_for_stmt (use_stmt);
3220	  use_code = gimple_assign_rhs_code (use_stmt);
3221	  negate_p = true;
3222	}
3223
3224      if (gimple_assign_rhs1 (use_stmt) == result)
3225	{
3226	  addop = gimple_assign_rhs2 (use_stmt);
3227	  /* a * b - c -> a * b + (-c)  */
3228	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3229	    addop = force_gimple_operand_gsi (&gsi,
3230					      build1 (NEGATE_EXPR,
3231						      type, addop),
3232					      true, NULL_TREE, true,
3233					      GSI_SAME_STMT);
3234	}
3235      else
3236	{
3237	  addop = gimple_assign_rhs1 (use_stmt);
3238	  /* a - b * c -> (-b) * c + a */
3239	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3240	    negate_p = !negate_p;
3241	}
3242
3243      if (negate_p)
3244	mulop1 = force_gimple_operand_gsi (&gsi,
3245					   build1 (NEGATE_EXPR,
3246						   type, mulop1),
3247					   true, NULL_TREE, true,
3248					   GSI_SAME_STMT);
3249
3250      fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3251				      FMA_EXPR, mulop1, op2, addop);
3252      gsi_replace (&gsi, fma_stmt, true);
3253      widen_mul_stats.fmas_inserted++;
3254    }
3255
3256  return true;
3257}
3258
3259/* Find integer multiplications where the operands are extended from
3260   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3261   where appropriate.  */
3262
3263namespace {
3264
3265const pass_data pass_data_optimize_widening_mul =
3266{
3267  GIMPLE_PASS, /* type */
3268  "widening_mul", /* name */
3269  OPTGROUP_NONE, /* optinfo_flags */
3270  TV_NONE, /* tv_id */
3271  PROP_ssa, /* properties_required */
3272  0, /* properties_provided */
3273  0, /* properties_destroyed */
3274  0, /* todo_flags_start */
3275  TODO_update_ssa, /* todo_flags_finish */
3276};
3277
3278class pass_optimize_widening_mul : public gimple_opt_pass
3279{
3280public:
3281  pass_optimize_widening_mul (gcc::context *ctxt)
3282    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3283  {}
3284
3285  /* opt_pass methods: */
3286  virtual bool gate (function *)
3287    {
3288      return flag_expensive_optimizations && optimize;
3289    }
3290
3291  virtual unsigned int execute (function *);
3292
3293}; // class pass_optimize_widening_mul
3294
3295unsigned int
3296pass_optimize_widening_mul::execute (function *fun)
3297{
3298  basic_block bb;
3299  bool cfg_changed = false;
3300
3301  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3302
3303  FOR_EACH_BB_FN (bb, fun)
3304    {
3305      gimple_stmt_iterator gsi;
3306
3307      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3308        {
3309	  gimple stmt = gsi_stmt (gsi);
3310	  enum tree_code code;
3311
3312	  if (is_gimple_assign (stmt))
3313	    {
3314	      code = gimple_assign_rhs_code (stmt);
3315	      switch (code)
3316		{
3317		case MULT_EXPR:
3318		  if (!convert_mult_to_widen (stmt, &gsi)
3319		      && convert_mult_to_fma (stmt,
3320					      gimple_assign_rhs1 (stmt),
3321					      gimple_assign_rhs2 (stmt)))
3322		    {
3323		      gsi_remove (&gsi, true);
3324		      release_defs (stmt);
3325		      continue;
3326		    }
3327		  break;
3328
3329		case PLUS_EXPR:
3330		case MINUS_EXPR:
3331		  convert_plusminus_to_widen (&gsi, stmt, code);
3332		  break;
3333
3334		default:;
3335		}
3336	    }
3337	  else if (is_gimple_call (stmt)
3338		   && gimple_call_lhs (stmt))
3339	    {
3340	      tree fndecl = gimple_call_fndecl (stmt);
3341	      if (fndecl
3342		  && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
3343		{
3344		  switch (DECL_FUNCTION_CODE (fndecl))
3345		    {
3346		      case BUILT_IN_POWF:
3347		      case BUILT_IN_POW:
3348		      case BUILT_IN_POWL:
3349			if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3350			    && REAL_VALUES_EQUAL
3351			         (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3352				  dconst2)
3353			    && convert_mult_to_fma (stmt,
3354						    gimple_call_arg (stmt, 0),
3355						    gimple_call_arg (stmt, 0)))
3356			  {
3357			    unlink_stmt_vdef (stmt);
3358			    if (gsi_remove (&gsi, true)
3359				&& gimple_purge_dead_eh_edges (bb))
3360			      cfg_changed = true;
3361			    release_defs (stmt);
3362			    continue;
3363			  }
3364			  break;
3365
3366		      default:;
3367		    }
3368		}
3369	    }
3370	  gsi_next (&gsi);
3371	}
3372    }
3373
3374  statistics_counter_event (fun, "widening multiplications inserted",
3375			    widen_mul_stats.widen_mults_inserted);
3376  statistics_counter_event (fun, "widening maccs inserted",
3377			    widen_mul_stats.maccs_inserted);
3378  statistics_counter_event (fun, "fused multiply-adds inserted",
3379			    widen_mul_stats.fmas_inserted);
3380
3381  return cfg_changed ? TODO_cleanup_cfg : 0;
3382}
3383
3384} // anon namespace
3385
3386gimple_opt_pass *
3387make_pass_optimize_widening_mul (gcc::context *ctxt)
3388{
3389  return new pass_optimize_widening_mul (ctxt);
3390}
3391