/* Copyright (C) 2005-2015 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this thread
   received the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
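      /* The nonzero ws->mode case handles a loop with a negative
         increment (stored in the unsigned incr_ull): the operands are
         swapped and the increment negated so the trip count comes out
         positive.  */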
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
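      /* Illustrative example: with n = 10 iterations and nthreads = 4,
         q = 2 and t = 2, so threads 0 and 1 get 3 iterations each
         ([0,3) and [3,6)) while threads 2 and 3 get 2 each
         ([6,8) and [8,10)).  */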

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
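      /* static_trip counts how many chunks this thread has already
         taken, so successive calls walk the thread through chunks
         i, i + nthreads, i + 2 * nthreads, ...  For example, with four
         threads and chunk size C, thread 1 starts its chunks at
         zero-based iterations C, 5 * C, 9 * C, and so on.  */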

      /* Detect when this chunk starts at or past the end of the
         iteration space, and clamp a final partial chunk.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
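  /* When ws->mode & 2 is set the loop has a negative increment, so chunk
     and left are negative values stored in unsigned variables and the
     clamping comparison below is reversed: the larger unsigned value is
     the one smaller in magnitude.  */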
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

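  /* With ws->mode & 1 set, next_ull can be advanced with a single atomic
     fetch-and-add; the counter may then run past end_ull, and a thread
     whose previous value already lies beyond the end simply reports that
     no iterations remain.  */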
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
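          /* Decreasing loop: chunk is a negative value in unsigned
             arithmetic, so running out of work means dropping to or
             below end, and a final partial chunk is clamped back up
             to end.  */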
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

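  /* Otherwise fall back to a compare-and-swap loop: recompute the clamped
     chunk from the current next_ull and, if the swap loses a race, retry
     from whatever value the winning thread installed.  */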
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
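  /* Each thread takes the ceiling of the remaining iteration count
     divided by the number of threads, but never less than chunk_size,
     so chunks shrink as the loop nears completion.  */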
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

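  /* Compute a guided chunk from the current next_ull and try to install
     the new bound with a compare-and-swap; if another thread advanced
     next_ull first, retry from the value it left behind.  Only next_ull
     changes, so end, incr and chunk_size are read just once above.  */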
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */