/* Copyright (C) 2005-2015 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this
   thread received the very last iteration of the loop.  */
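/* Illustrative sketch only (not part of libgomp's interface): a caller
   typically calls this function repeatedly until it returns nonzero,
   e.g. for a loop with a positive increment INCR:

       long s, e;
       while (gomp_iter_static_next (&s, &e) == 0)
         for (long i = s; i < e; i += INCR)
           BODY (i);

   where BODY and INCR stand in for the caller's loop body and increment,
   not for real libgomp symbols.  */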

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
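      /* Illustrative example: with n = 10 iterations and nthreads = 4,
	 q = 2 and t = 2, so threads 0 and 1 each take three iterations
	 and threads 2 and 3 each take two, giving the zero-based ranges
	 [0,3), [3,6), [6,8) and [8,10).  */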

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned
	 static_trip * nthreads chunks into the iteration space,
	 offset by our thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
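      /* Illustrative example: with n = 10, nthreads = 2 and c = 3, the
	 first trip hands out the zero-based ranges [0,3) and [3,6), the
	 second [6,9) and [9,10), and on the third trip thread 0 would
	 start at 12 >= n, so both threads are done.  */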

      /* Detect when we run off the end of the iteration space, and clamp
	 a partial final chunk.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next; the return value is true if an iteration
   range was assigned and false once the work share is exhausted.  This
   function must be called with ws->lock held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
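  /* Clamp the chunk to the iterations that remain.  When the increment
     is negative, LEFT is negative and CHUNK is likewise expected to be
     negative here (presumably pre-scaled by the increment when the work
     share was set up), so the sense of the comparison below is inverted.  */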
  if (ws->incr < 0)
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   (or an unconditional fetch-and-add on the fast path) instead.  Note
   that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

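  /* Fast path.  A nonzero ws->mode is taken to mean that the work share
     was initialized such that ws->next can be advanced with an
     unconditional fetch-and-add without arithmetic overflow, even though
     threads that lose the race may push ws->next past ws->end.  */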
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

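  /* Slow path: re-clamp the chunk against the iterations that remain
     from the freshly observed value of ws->next, and retry the
     compare-and-swap whenever another thread has advanced ws->next
     underneath us.  */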
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next; the return value is true if an iteration
   range was assigned and false once the work share is exhausted.  This
   function must be called with the work share lock held.  */
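/* Illustrative example of the chunk sizes computed below: with 100
   iterations remaining, 4 threads and a chunk_size of 7, successive
   grabs take 25, 19, 14, 11, 8, 7, 7, 7 and finally 2 iterations,
   i.e. roughly 1/nthreads of whatever remains, never less than
   chunk_size except for the final partial chunk.  */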

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */