1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is constant within any single dispatch loop; however,
16 * it may change between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take, and 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56 }
57 }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 int monotonicity;
76 // default to monotonic
77 monotonicity = SCHEDULE_MONOTONIC;
78 if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79 monotonicity = SCHEDULE_NONMONOTONIC;
80 else if (SCHEDULE_HAS_MONOTONIC(schedule))
81 monotonicity = SCHEDULE_MONOTONIC;
82 return monotonicity;
83}
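// Editorial note (illustrative, not part of the runtime logic): for a clause
// such as schedule(nonmonotonic : dynamic, 4) the incoming sched_type carries
// the nonmonotonic modifier bit, so this helper returns SCHEDULE_NONMONOTONIC;
// a sched_type with neither modifier bit set falls back to SCHEDULE_MONOTONIC.
// The use_hier argument is accepted but not consulted in this version.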
84
85// Initialize a dispatch_private_info_template<T> buffer for a particular
86// type of schedule and chunk size. The loop description is found in lb (lower bound),
87// ub (upper bound), and st (stride). nproc is the number of threads relevant
88// to the scheduling (often the number of threads in a team, but not always if
89// hierarchical scheduling is used). tid is the id of the thread calling
90// the function within the group of nproc threads. It will have a value
91// between 0 and nproc - 1. This is often just the thread id within a team, but
92// is not necessarily the case when using hierarchical scheduling.
93// loc is the source file location of the corresponding loop
94// gtid is the global thread id
95template <typename T>
96void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97 dispatch_private_info_template<T> *pr,
98 enum sched_type schedule, T lb, T ub,
99 typename traits_t<T>::signed_t st,
100#if USE_ITT_BUILD
101 kmp_uint64 *cur_chunk,
102#endif
103 typename traits_t<T>::signed_t chunk,
104 T nproc, T tid) {
105 typedef typename traits_t<T>::unsigned_t UT;
106 typedef typename traits_t<T>::floating_t DBL;
107
108 int active;
109 T tc;
110 kmp_info_t *th;
111 kmp_team_t *team;
112 int monotonicity;
113 bool use_hier;
114
115#ifdef KMP_DEBUG
116 typedef typename traits_t<T>::signed_t ST;
117 {
118 char *buff;
119 // create format specifiers before the debug output
120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123 traits_t<T>::spec, traits_t<T>::spec,
124 traits_t<ST>::spec, traits_t<ST>::spec,
125 traits_t<T>::spec, traits_t<T>::spec);
126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127 __kmp_str_free(&buff);
128 }
129#endif
130 /* setup data */
131 th = __kmp_threads[gtid];
132 team = th->th.th_team;
133 active = !team->t.t_serialized;
134
135#if USE_ITT_BUILD
136 int itt_need_metadata_reporting =
137 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139 team->t.t_active_level == 1;
140#endif
141
142#if KMP_USE_HIER_SCHED
143 use_hier = pr->flags.use_hier;
144#else
145 use_hier = false;
146#endif
147
148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151
152 /* Pick up the nomerge/ordered bits from the scheduling type */
153 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154 pr->flags.nomerge = TRUE;
155 schedule =
156 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157 } else {
158 pr->flags.nomerge = FALSE;
159 }
160 pr->type_size = traits_t<T>::type_size; // remember the size of variables
161 if (kmp_ord_lower & schedule) {
162 pr->flags.ordered = TRUE;
163 schedule =
164 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165 } else {
166 pr->flags.ordered = FALSE;
167 }
168 // Ordered overrides nonmonotonic
169 if (pr->flags.ordered) {
170 monotonicity = SCHEDULE_MONOTONIC;
171 }
172
173 if (schedule == kmp_sch_static) {
174 schedule = __kmp_static;
175 } else {
176 if (schedule == kmp_sch_runtime) {
177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178 // not specified)
179 schedule = team->t.t_sched.r_sched_type;
180 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182 // Detail the schedule if needed (global controls are differentiated
183 // appropriately)
184 if (schedule == kmp_sch_guided_chunked) {
185 schedule = __kmp_guided;
186 } else if (schedule == kmp_sch_static) {
187 schedule = __kmp_static;
188 }
189 // Use the chunk size specified by OMP_SCHEDULE (or default if not
190 // specified)
191 chunk = team->t.t_sched.chunk;
192#if USE_ITT_BUILD
193 if (cur_chunk)
194 *cur_chunk = chunk;
195#endif
196#ifdef KMP_DEBUG
197 {
198 char *buff;
199 // create format specifiers before the debug output
200 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201 "schedule:%%d chunk:%%%s\n",
202 traits_t<ST>::spec);
203 KD_TRACE(10, (buff, gtid, schedule, chunk));
204 __kmp_str_free(&buff);
205 }
206#endif
207 } else {
208 if (schedule == kmp_sch_guided_chunked) {
209 schedule = __kmp_guided;
210 }
211 if (chunk <= 0) {
212 chunk = KMP_DEFAULT_CHUNK;
213 }
214 }
215
216 if (schedule == kmp_sch_auto) {
217 // mapping and differentiation are done in __kmp_do_serial_initialize()
218 schedule = __kmp_auto;
219#ifdef KMP_DEBUG
220 {
221 char *buff;
222 // create format specifiers before the debug output
223 buff = __kmp_str_format(
224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225 "schedule:%%d chunk:%%%s\n",
226 traits_t<ST>::spec);
227 KD_TRACE(10, (buff, gtid, schedule, chunk));
228 __kmp_str_free(&buff);
229 }
230#endif
231 }
232#if KMP_STATIC_STEAL_ENABLED
233 // map nonmonotonic:dynamic to static steal
234 if (schedule == kmp_sch_dynamic_chunked) {
235 if (monotonicity == SCHEDULE_NONMONOTONIC)
236 schedule = kmp_sch_static_steal;
237 }
238#endif
239 /* guided analytical not safe for too many threads */
240 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241 schedule = kmp_sch_guided_iterative_chunked;
242 KMP_WARNING(DispatchManyThreads);
243 }
244 if (schedule == kmp_sch_runtime_simd) {
245 // compiler provides simd_width in the chunk parameter
246 schedule = team->t.t_sched.r_sched_type;
247 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249 // Detail the schedule if needed (global controls are differentiated
250 // appropriately)
251 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252 schedule == __kmp_static) {
253 schedule = kmp_sch_static_balanced_chunked;
254 } else {
255 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256 schedule = kmp_sch_guided_simd;
257 }
258 chunk = team->t.t_sched.chunk * chunk;
259 }
260#if USE_ITT_BUILD
261 if (cur_chunk)
262 *cur_chunk = chunk;
263#endif
264#ifdef KMP_DEBUG
265 {
266 char *buff;
267 // create format specifiers before the debug output
268 buff = __kmp_str_format(
269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270 " chunk:%%%s\n",
271 traits_t<ST>::spec);
272 KD_TRACE(10, (buff, gtid, schedule, chunk));
273 __kmp_str_free(&buff);
274 }
275#endif
276 }
277 pr->u.p.parm1 = chunk;
278 }
279 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280 "unknown scheduling type");
281
282 pr->u.p.count = 0;
283
284 if (__kmp_env_consistency_check) {
285 if (st == 0) {
286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288 }
289 }
290 // compute trip count
291 if (st == 1) { // most common case
292 if (ub >= lb) {
293 tc = ub - lb + 1;
294 } else { // ub < lb
295 tc = 0; // zero-trip
296 }
297 } else if (st < 0) {
298 if (lb >= ub) {
299 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300 // where the division needs to be unsigned regardless of the result type
301 tc = (UT)(lb - ub) / (-st) + 1;
302 } else { // lb < ub
303 tc = 0; // zero-trip
304 }
305 } else { // st > 0
306 if (ub >= lb) {
307 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308 // where the division needs to be unsigned regardless of the result type
309 tc = (UT)(ub - lb) / st + 1;
310 } else { // ub < lb
311 tc = 0; // zero-trip
312 }
313 }
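  // Editorial note, worked examples of the trip count computation above
  // (illustrative only):
  //   lb = 0, ub = 9,  st = 1  -> tc = 10
  //   lb = 9, ub = 0,  st = -3 -> tc = (UT)9 / 3 + 1 = 4  (i = 9, 6, 3, 0)
  //   lb = 2'000'000'000, ub = -2'000'000'000, st = -1'000'000'000 (32-bit T)
  //     -> tc = (UT)4'000'000'000 / 1'000'000'000 + 1 = 5, where a signed
  //        difference would have overflowed.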
314
315#if KMP_STATS_ENABLED
316 if (KMP_MASTER_GTID(gtid)) {
317 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318 }
319#endif
320
321 pr->u.p.lb = lb;
322 pr->u.p.ub = ub;
323 pr->u.p.st = st;
324 pr->u.p.tc = tc;
325
326#if KMP_OS_WINDOWS
327 pr->u.p.last_upper = ub + st;
328#endif /* KMP_OS_WINDOWS */
329
330 /* NOTE: only active parallel regions have active ordered sections */
331
332 if (active) {
333 if (pr->flags.ordered) {
334 pr->ordered_bumped = 0;
335 pr->u.p.ordered_lower = 1;
336 pr->u.p.ordered_upper = 0;
337 }
338 }
339
340 switch (schedule) {
341#if (KMP_STATIC_STEAL_ENABLED)
342 case kmp_sch_static_steal: {
343 T ntc, init;
344
345 KD_TRACE(100,
346 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347 gtid));
348
349 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350 if (nproc > 1 && ntc >= nproc) {
351 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352 T id = tid;
353 T small_chunk, extras;
354
355 small_chunk = ntc / nproc;
356 extras = ntc % nproc;
357
358 init = id * small_chunk + (id < extras ? id : extras);
359 pr->u.p.count = init;
360 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
361
362 pr->u.p.parm2 = lb;
363 // parm3 is the number of times to attempt stealing, which is
364 // proportional to the number of chunks per thread, capped at
365 // nproc.
366 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
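      // Editorial note, worked example (illustrative): tc = 103, chunk = 10,
      // nproc = 4 gives ntc = 11 chunks, small_chunk = 2, extras = 3, so the
      // owned chunk-index ranges are [0,3), [3,6), [6,9) and [9,11) for
      // threads 0..3; count is the next chunk to execute and ub the exclusive
      // end of the owned range.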
367 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368 pr->u.p.st = st;
369 if (traits_t<T>::type_size > 4) {
370 // AC: TODO: check if 16-byte CAS available and use it to
371 // improve performance (probably wait for explicit request
372 // before spending time on this).
373 // For now use dynamically allocated per-thread lock,
374 // free memory in __kmp_dispatch_next when status==0.
375 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
376 pr->u.p.th_steal_lock =
377 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378 __kmp_init_lock(pr->u.p.th_steal_lock);
379 }
380 break;
381 } else {
382 /* too few chunks: switching to kmp_sch_dynamic_chunked */
383 schedule = kmp_sch_dynamic_chunked;
384 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385 "kmp_sch_dynamic_chunked\n",
386 gtid));
387 if (pr->u.p.parm1 <= 0)
388 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389 break;
390 } // if
391 } // case
392#endif
393 case kmp_sch_static_balanced: {
394 T init, limit;
395
396 KD_TRACE(
397 100,
398 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399 gtid));
400
401 if (nproc > 1) {
402 T id = tid;
403
404 if (tc < nproc) {
405 if (id < tc) {
406 init = id;
407 limit = id;
408 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409 } else {
410 pr->u.p.count = 1; /* means no more chunks to execute */
411 pr->u.p.parm1 = FALSE;
412 break;
413 }
414 } else {
415 T small_chunk = tc / nproc;
416 T extras = tc % nproc;
417 init = id * small_chunk + (id < extras ? id : extras);
418 limit = init + small_chunk - (id < extras ? 0 : 1);
419 pr->u.p.parm1 = (id == nproc - 1);
420 }
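        // Editorial note, worked example (illustrative): tc = 10, nproc = 4
        // gives small_chunk = 2, extras = 2, so threads 0..3 receive the
        // iteration ranges [0,2], [3,5], [6,7] and [8,9]; only thread
        // nproc - 1 sets parm1 (the lastprivate flag).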
421 } else {
422 if (tc > 0) {
423 init = 0;
424 limit = tc - 1;
425 pr->u.p.parm1 = TRUE;
426 } else {
427 // zero trip count
428 pr->u.p.count = 1; /* means no more chunks to execute */
429 pr->u.p.parm1 = FALSE;
430 break;
431 }
432 }
433#if USE_ITT_BUILD
434 // Calculate chunk for metadata report
435 if (itt_need_metadata_reporting)
436 if (cur_chunk)
437 *cur_chunk = limit - init + 1;
438#endif
439 if (st == 1) {
440 pr->u.p.lb = lb + init;
441 pr->u.p.ub = lb + limit;
442 } else {
443 // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
444 T ub_tmp = lb + limit * st;
445 pr->u.p.lb = lb + init * st;
446 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447 // it exactly
448 if (st > 0) {
449 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450 } else {
451 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452 }
453 }
454 if (pr->flags.ordered) {
455 pr->u.p.ordered_lower = init;
456 pr->u.p.ordered_upper = limit;
457 }
458 break;
459 } // case
460 case kmp_sch_static_balanced_chunked: {
461 // similar to balanced, but chunk adjusted to multiple of simd width
462 T nth = nproc;
463 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464 " -> falling-through to static_greedy\n",
465 gtid));
466 schedule = kmp_sch_static_greedy;
467 if (nth > 1)
468 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469 else
470 pr->u.p.parm1 = tc;
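    // Editorial note, worked example (illustrative): tc = 1000, nth = 4 and
    // chunk (the simd width) = 8 give ceil(1000/4) = 250, rounded up by the
    // mask to 256, so each thread's greedy block is a multiple of the simd
    // width; the bit mask assumes chunk is a power of two.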
471 break;
472 } // case
473 case kmp_sch_guided_simd:
474 case kmp_sch_guided_iterative_chunked: {
475 KD_TRACE(
476 100,
477 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478 " case\n",
479 gtid));
480
481 if (nproc > 1) {
482 if ((2L * chunk + 1) * nproc >= tc) {
483 /* chunk size too large, switch to dynamic */
484 schedule = kmp_sch_dynamic_chunked;
485 } else {
486 // when remaining iters become less than parm2 - switch to dynamic
487 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488 *(double *)&pr->u.p.parm3 =
489 guided_flt_param / nproc; // may occupy parm3 and parm4
490 }
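        // Editorial note (illustrative, assuming the default guided
        // parameters guided_int_param = 2 and guided_flt_param = 0.5): with
        // nproc = 8 and chunk = 4, parm2 = 2 * 8 * 5 = 80, so the schedule
        // turns dynamic once fewer than 80 iterations remain, and each guided
        // grab takes remaining * 0.5 / 8 = remaining / 16 iterations.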
491 } else {
492 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493 "kmp_sch_static_greedy\n",
494 gtid));
495 schedule = kmp_sch_static_greedy;
496 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497 KD_TRACE(
498 100,
499 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500 gtid));
501 pr->u.p.parm1 = tc;
502 } // if
503 } // case
504 break;
505 case kmp_sch_guided_analytical_chunked: {
506 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507 "kmp_sch_guided_analytical_chunked case\n",
508 gtid));
509
510 if (nproc > 1) {
511 if ((2L * chunk + 1) * nproc >= tc) {
512 /* chunk size too large, switch to dynamic */
513 schedule = kmp_sch_dynamic_chunked;
514 } else {
515 /* commonly used term: (2 nproc - 1)/(2 nproc) */
516 DBL x;
517
518#if KMP_USE_X87CONTROL
519 /* Linux* OS already has 64-bit computation by default for long double,
520 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522 instead of the default 53-bit. Even though long double doesn't work
523 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524 expected to impact the correctness of the algorithm, but this has not
525 been mathematically proven. */
526 // save original FPCW and set precision to 64-bit, as
527 // Windows* OS on IA-32 architecture defaults to 53-bit
528 unsigned int oldFpcw = _control87(0, 0);
529 _control87(_PC_64, _MCW_PC); // 0,0x30000
530#endif
531 /* value used for comparison in solver for cross-over point */
532 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533
534 /* crossover point--chunk indexes equal to or greater than
535 this point switch to dynamic-style scheduling */
536 UT cross;
537
538 /* commonly used term: (2 nproc - 1)/(2 nproc) */
539 x = (long double)1.0 - (long double)0.5 / nproc;
540
541#ifdef KMP_DEBUG
542 { // test natural alignment
543 struct _test_a {
544 char a;
545 union {
546 char b;
547 DBL d;
548 };
549 } t;
550 ptrdiff_t natural_alignment =
551 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553 // long)natural_alignment );
554 KMP_DEBUG_ASSERT(
555 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556 }
557#endif // KMP_DEBUG
558
559 /* save the term in thread private dispatch structure */
560 *(DBL *)&pr->u.p.parm3 = x;
561
562 /* solve for the crossover point to the nearest integer i for which C_i
563 <= chunk */
564 {
565 UT left, right, mid;
566 long double p;
567
568 /* estimate initial upper and lower bound */
569
570 /* doesn't matter what value right is as long as it is positive, but
571 it affects performance of the solver */
572 right = 229;
573 p = __kmp_pow<UT>(x, right);
574 if (p > target) {
575 do {
576 p *= p;
577 right <<= 1;
578 } while (p > target && right < (1 << 27));
579 /* lower bound is previous (failed) estimate of upper bound */
580 left = right >> 1;
581 } else {
582 left = 0;
583 }
584
585 /* bisection root-finding method */
586 while (left + 1 < right) {
587 mid = (left + right) / 2;
588 if (__kmp_pow<UT>(x, mid) > target) {
589 left = mid;
590 } else {
591 right = mid;
592 }
593 } // while
594 cross = right;
595 }
596 /* assert sanity of computed crossover point */
597 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598 __kmp_pow<UT>(x, cross) <= target);
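            // Editorial note, illustrative check of the bisection above: with
            // nproc = 2, chunk = 1 and tc = 100, x = 0.75 and target = 0.06;
            // since 0.75^9 ~= 0.075 > 0.06 >= 0.75^10 ~= 0.056, the solver
            // yields cross = 10, the first chunk index handled dynamically.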
599
600 /* save the crossover point in thread private dispatch structure */
601 pr->u.p.parm2 = cross;
602
603// C75803
604#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606#else
607#define GUIDED_ANALYTICAL_WORKAROUND (x)
608#endif
609 /* dynamic-style scheduling offset */
610 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612 cross * chunk;
613#if KMP_USE_X87CONTROL
614 // restore FPCW
615 _control87(oldFpcw, _MCW_PC);
616#endif
617 } // if
618 } else {
619 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620 "kmp_sch_static_greedy\n",
621 gtid));
622 schedule = kmp_sch_static_greedy;
623 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624 pr->u.p.parm1 = tc;
625 } // if
626 } // case
627 break;
628 case kmp_sch_static_greedy:
629 KD_TRACE(
630 100,
631 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632 gtid));
633 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634 break;
635 case kmp_sch_static_chunked:
636 case kmp_sch_dynamic_chunked:
637 if (pr->u.p.parm1 <= 0) {
638 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639 }
640 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642 gtid));
643 break;
644 case kmp_sch_trapezoidal: {
645 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646
647 T parm1, parm2, parm3, parm4;
648 KD_TRACE(100,
649 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650 gtid));
651
652 parm1 = chunk;
653
654 /* F : size of the first cycle */
655 parm2 = (tc / (2 * nproc));
656
657 if (parm2 < 1) {
658 parm2 = 1;
659 }
660
661 /* L : size of the last cycle. Make sure the last cycle is not larger
662 than the first cycle. */
663 if (parm1 < 1) {
664 parm1 = 1;
665 } else if (parm1 > parm2) {
666 parm1 = parm2;
667 }
668
669 /* N : number of cycles */
670 parm3 = (parm2 + parm1);
671 parm3 = (2 * tc + parm3 - 1) / parm3;
672
673 if (parm3 < 2) {
674 parm3 = 2;
675 }
676
677 /* sigma : decreasing incr of the trapezoid */
678 parm4 = (parm3 - 1);
679 parm4 = (parm2 - parm1) / parm4;
680
681 // pointless check, because parm4 >= 0 always
682 // if ( parm4 < 0 ) {
683 // parm4 = 0;
684 //}
685
686 pr->u.p.parm1 = parm1;
687 pr->u.p.parm2 = parm2;
688 pr->u.p.parm3 = parm3;
689 pr->u.p.parm4 = parm4;
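    // Editorial note, worked example (illustrative): tc = 1000, nproc = 4,
    // chunk = 1 gives parm1 = 1 (minimum chunk), parm2 = 125 (first chunk),
    // parm3 = 16 (number of chunks) and parm4 = 8 (per-chunk decrement),
    // i.e. chunk sizes 125, 117, 109, ... down to roughly 5; the dispatch
    // code later clamps the final chunk to the real trip count.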
690 } // case
691 break;
692
693 default: {
694 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695 KMP_HNT(GetNewerLibrary), // Hint
696 __kmp_msg_null // Variadic argument list terminator
697 );
698 } break;
699 } // switch
700 pr->schedule = schedule;
701}
702
703#if KMP_USE_HIER_SCHED
704template <typename T>
705inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706 typename traits_t<T>::signed_t st);
707template <>
708inline void
709__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710 kmp_int32 ub, kmp_int32 st) {
711 __kmp_dispatch_init_hierarchy<kmp_int32>(
712 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714}
715template <>
716inline void
717__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718 kmp_uint32 ub, kmp_int32 st) {
719 __kmp_dispatch_init_hierarchy<kmp_uint32>(
720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722}
723template <>
724inline void
725__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726 kmp_int64 ub, kmp_int64 st) {
727 __kmp_dispatch_init_hierarchy<kmp_int64>(
728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730}
731template <>
732inline void
733__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734 kmp_uint64 ub, kmp_int64 st) {
735 __kmp_dispatch_init_hierarchy<kmp_uint64>(
736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738}
739
740// free all the hierarchy scheduling memory associated with the team
741void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743 for (int i = 0; i < num_disp_buff; ++i) {
744 // type does not matter here so use kmp_int32
745 auto sh =
746 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747 &team->t.t_disp_buffer[i]);
748 if (sh->hier) {
749 sh->hier->deallocate();
750 __kmp_free(sh->hier);
751 }
752 }
753}
754#endif
755
756// UT - unsigned flavor of T, ST - signed flavor of T,
757// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758template <typename T>
759static void
760__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761 T ub, typename traits_t<T>::signed_t st,
762 typename traits_t<T>::signed_t chunk, int push_ws) {
763 typedef typename traits_t<T>::unsigned_t UT;
764
765 int active;
766 kmp_info_t *th;
767 kmp_team_t *team;
768 kmp_uint32 my_buffer_index;
769 dispatch_private_info_template<T> *pr;
770 dispatch_shared_info_template<T> volatile *sh;
771
772 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773 sizeof(dispatch_private_info));
774 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775 sizeof(dispatch_shared_info));
776
777 if (!TCR_4(__kmp_init_parallel))
778 __kmp_parallel_initialize();
779
780 __kmp_resume_if_soft_paused();
781
782#if INCLUDE_SSC_MARKS
783 SSC_MARK_DISPATCH_INIT();
784#endif
785#ifdef KMP_DEBUG
786 typedef typename traits_t<T>::signed_t ST;
787 {
788 char *buff;
789 // create format specifiers before the debug output
790 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792 traits_t<ST>::spec, traits_t<T>::spec,
793 traits_t<T>::spec, traits_t<ST>::spec);
794 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795 __kmp_str_free(&buff);
796 }
797#endif
798 /* setup data */
799 th = __kmp_threads[gtid];
800 team = th->th.th_team;
801 active = !team->t.t_serialized;
802 th->th.th_ident = loc;
803
804 // Any half-decent optimizer will remove this test when the blocks are empty
805 // since the macros expand to nothing
806 // when statistics are disabled.
807 if (schedule == __kmp_static) {
808 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809 } else {
810 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811 }
812
813#if KMP_USE_HIER_SCHED
814 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE environment variable.
815 // Hierarchical scheduling does not work with ordered, so if ordered is
816 // detected, then revert back to threaded scheduling.
817 bool ordered;
818 enum sched_type my_sched = schedule;
819 my_buffer_index = th->th.th_dispatch->th_disp_index;
820 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821 &th->th.th_dispatch
822 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825 my_sched =
826 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827 ordered = (kmp_ord_lower & my_sched);
828 if (pr->flags.use_hier) {
829 if (ordered) {
830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
831 "Disabling hierarchical scheduling.\n",
832 gtid));
833 pr->flags.use_hier = FALSE;
834 }
835 }
836 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837 // Don't use hierarchical for ordered parallel loops and don't
838 // use the runtime hierarchy if one was specified in the program
839 if (!ordered && !pr->flags.use_hier)
840 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841 }
842#endif // KMP_USE_HIER_SCHED
843
844#if USE_ITT_BUILD
845 kmp_uint64 cur_chunk = chunk;
846 int itt_need_metadata_reporting =
847 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849 team->t.t_active_level == 1;
850#endif
851 if (!active) {
852 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854 } else {
855 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857
858 my_buffer_index = th->th.th_dispatch->th_disp_index++;
859
860 /* What happens when number of threads changes, need to resize buffer? */
861 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862 &th->th.th_dispatch
863 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867 my_buffer_index));
868 }
869
870 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871#if USE_ITT_BUILD
872 &cur_chunk,
873#endif
874 chunk, (T)th->th.th_team_nproc,
875 (T)th->th.th_info.ds.ds_tid);
876 if (active) {
877 if (pr->flags.ordered == 0) {
878 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880 } else {
881 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883 }
884 }
885
886 if (active) {
887 /* This buffer becomes free to use once sh->buffer_index reaches
888 * my_buffer_index (see the wait below). */
889
890 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891 "sh->buffer_index:%d\n",
892 gtid, my_buffer_index, sh->buffer_index));
893 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
895 // Note: KMP_WAIT() cannot be used here: buffer index and
896 // my_buffer_index are *always* 32-bit integers.
897 KMP_MB(); /* is this necessary? */
898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899 "sh->buffer_index:%d\n",
900 gtid, my_buffer_index, sh->buffer_index));
901
902 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903 th->th.th_dispatch->th_dispatch_sh_current =
904 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905#if USE_ITT_BUILD
906 if (pr->flags.ordered) {
907 __kmp_itt_ordered_init(gtid);
908 }
909 // Report loop metadata
910 if (itt_need_metadata_reporting) {
911 // Only report metadata by master of active team at level 1
912 kmp_uint64 schedtype = 0;
913 switch (schedule) {
914 case kmp_sch_static_chunked:
915 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916 break;
917 case kmp_sch_static_greedy:
918 cur_chunk = pr->u.p.parm1;
919 break;
920 case kmp_sch_dynamic_chunked:
921 schedtype = 1;
922 break;
923 case kmp_sch_guided_iterative_chunked:
924 case kmp_sch_guided_analytical_chunked:
925 case kmp_sch_guided_simd:
926 schedtype = 2;
927 break;
928 default:
929 // Should we put this case under "static"?
930 // case kmp_sch_static_steal:
931 schedtype = 3;
932 break;
933 }
934 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935 }
936#if KMP_USE_HIER_SCHED
937 if (pr->flags.use_hier) {
938 pr->u.p.count = 0;
939 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940 }
941#endif // KMP_USE_HIER_SCHED
942#endif /* USE_ITT_BUILD */
943 }
944
945#ifdef KMP_DEBUG
946 {
947 char *buff;
948 // create format specifiers before the debug output
949 buff = __kmp_str_format(
950 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951 "lb:%%%s ub:%%%s"
952 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962 __kmp_str_free(&buff);
963 }
964#endif
965#if (KMP_STATIC_STEAL_ENABLED)
966 // It cannot be guaranteed that after execution of a loop with some other
967 // schedule kind all the parm3 variables will contain the same value. Even if
968 // all parm3 values were the same, a bad case could still exist, e.g. reusing
969 // 0 and 1 rather than a program-lifetime increment. So a dedicated variable
970 // is required; 'static_steal_counter' is used.
971 if (pr->schedule == kmp_sch_static_steal) {
972 // Other threads will inspect this variable when searching for a victim.
973 // This is a flag showing that other threads may steal from this thread
974 // from now on.
975 volatile T *p = &pr->u.p.static_steal_counter;
976 *p = *p + 1;
977 }
978#endif // ( KMP_STATIC_STEAL_ENABLED )
979
980#if OMPT_SUPPORT && OMPT_OPTIONAL
981 if (ompt_enabled.ompt_callback_work) {
982 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984 ompt_callbacks.ompt_callback(ompt_callback_work)(
985 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987 }
988#endif
989 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990}
991
992/* For ordered loops, either __kmp_dispatch_finish() should be called after
993 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994 * every chunk of iterations. If the ordered section(s) were not executed
995 * for this iteration (or every iteration in this chunk), we need to set the
996 * ordered iteration counters so that the next thread can proceed. */
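/* Editorial sketch (roughly what compiler-generated code does, not a literal
   transcript): for an ordered dynamic loop over 32-bit signed indices,

     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (i = lb; i <= ub; i += st) {
         ... loop body, possibly containing an ordered region ...
         __kmpc_dispatch_fini_4(loc, gtid); // reaches __kmp_dispatch_finish
       }
     }

   so an iteration that never enters its ordered region still advances the
   shared ordered_iteration counter and does not stall successor threads. */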
997template <typename UT>
998static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999 typedef typename traits_t<UT>::signed_t ST;
1000 kmp_info_t *th = __kmp_threads[gtid];
1001
1002 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003 if (!th->th.th_team->t.t_serialized) {
1004
1005 dispatch_private_info_template<UT> *pr =
1006 reinterpret_cast<dispatch_private_info_template<UT> *>(
1007 th->th.th_dispatch->th_dispatch_pr_current);
1008 dispatch_shared_info_template<UT> volatile *sh =
1009 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010 th->th.th_dispatch->th_dispatch_sh_current);
1011 KMP_DEBUG_ASSERT(pr);
1012 KMP_DEBUG_ASSERT(sh);
1013 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015
1016 if (pr->ordered_bumped) {
1017 KD_TRACE(
1018 1000,
1019 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020 gtid));
1021 pr->ordered_bumped = 0;
1022 } else {
1023 UT lower = pr->u.p.ordered_lower;
1024
1025#ifdef KMP_DEBUG
1026 {
1027 char *buff;
1028 // create format specifiers before the debug output
1029 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030 "ordered_iteration:%%%s lower:%%%s\n",
1031 traits_t<UT>::spec, traits_t<UT>::spec);
1032 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033 __kmp_str_free(&buff);
1034 }
1035#endif
1036
1037 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039 KMP_MB(); /* is this necessary? */
1040#ifdef KMP_DEBUG
1041 {
1042 char *buff;
1043 // create format specifiers before the debug output
1044 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045 "ordered_iteration:%%%s lower:%%%s\n",
1046 traits_t<UT>::spec, traits_t<UT>::spec);
1047 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048 __kmp_str_free(&buff);
1049 }
1050#endif
1051
1052 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053 } // if
1054 } // if
1055 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056}
1057
1058#ifdef KMP_GOMP_COMPAT
1059
1060template <typename UT>
1061static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062 typedef typename traits_t<UT>::signed_t ST;
1063 kmp_info_t *th = __kmp_threads[gtid];
1064
1065 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066 if (!th->th.th_team->t.t_serialized) {
1067 // int cid;
1068 dispatch_private_info_template<UT> *pr =
1069 reinterpret_cast<dispatch_private_info_template<UT> *>(
1070 th->th.th_dispatch->th_dispatch_pr_current);
1071 dispatch_shared_info_template<UT> volatile *sh =
1072 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073 th->th.th_dispatch->th_dispatch_sh_current);
1074 KMP_DEBUG_ASSERT(pr);
1075 KMP_DEBUG_ASSERT(sh);
1076 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078
1079 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080 UT lower = pr->u.p.ordered_lower;
1081 UT upper = pr->u.p.ordered_upper;
1082 UT inc = upper - lower + 1;
1083
1084 if (pr->ordered_bumped == inc) {
1085 KD_TRACE(
1086 1000,
1087 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088 gtid));
1089 pr->ordered_bumped = 0;
1090 } else {
1091 inc -= pr->ordered_bumped;
1092
1093#ifdef KMP_DEBUG
1094 {
1095 char *buff;
1096 // create format specifiers before the debug output
1097 buff = __kmp_str_format(
1098 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102 __kmp_str_free(&buff);
1103 }
1104#endif
1105
1106 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108
1109 KMP_MB(); /* is this necessary? */
1110 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111 "ordered_bumped to zero\n",
1112 gtid));
1113 pr->ordered_bumped = 0;
1115#ifdef KMP_DEBUG
1116 {
1117 char *buff;
1118 // create format specifiers before the debug output
1119 buff = __kmp_str_format(
1120 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123 traits_t<UT>::spec);
1124 KD_TRACE(1000,
1125 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126 __kmp_str_free(&buff);
1127 }
1128#endif
1129
1130 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131 }
1132 // }
1133 }
1134 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135}
1136
1137#endif /* KMP_GOMP_COMPAT */
1138
1139template <typename T>
1140int __kmp_dispatch_next_algorithm(int gtid,
1141 dispatch_private_info_template<T> *pr,
1142 dispatch_shared_info_template<T> volatile *sh,
1143 kmp_int32 *p_last, T *p_lb, T *p_ub,
1144 typename traits_t<T>::signed_t *p_st, T nproc,
1145 T tid) {
1146 typedef typename traits_t<T>::unsigned_t UT;
1147 typedef typename traits_t<T>::signed_t ST;
1148 typedef typename traits_t<T>::floating_t DBL;
1149 int status = 0;
1150 kmp_int32 last = 0;
1151 T start;
1152 ST incr;
1153 UT limit, trip, init;
1154 kmp_info_t *th = __kmp_threads[gtid];
1155 kmp_team_t *team = th->th.th_team;
1156
1157 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159 KMP_DEBUG_ASSERT(pr);
1160 KMP_DEBUG_ASSERT(sh);
1161 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162#ifdef KMP_DEBUG
1163 {
1164 char *buff;
1165 // create format specifiers before the debug output
1166 buff =
1167 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168 "sh:%%p nproc:%%%s tid:%%%s\n",
1169 traits_t<T>::spec, traits_t<T>::spec);
1170 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171 __kmp_str_free(&buff);
1172 }
1173#endif
1174
1175 // zero trip count
1176 if (pr->u.p.tc == 0) {
1177 KD_TRACE(10,
1178 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179 "zero status:%d\n",
1180 gtid, status));
1181 return 0;
1182 }
1183
1184 switch (pr->schedule) {
1185#if (KMP_STATIC_STEAL_ENABLED)
1186 case kmp_sch_static_steal: {
1187 T chunk = pr->u.p.parm1;
1188
1189 KD_TRACE(100,
1190 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191 gtid));
1192
1193 trip = pr->u.p.tc - 1;
1194
1195 if (traits_t<T>::type_size > 4) {
1196 // use lock for 8-byte and CAS for 4-byte induction
1197 // variable. TODO (optional): check and use 16-byte CAS
1198 kmp_lock_t *lck = pr->u.p.th_steal_lock;
1199 KMP_DEBUG_ASSERT(lck != NULL);
1200 if (pr->u.p.count < (UT)pr->u.p.ub) {
1201 __kmp_acquire_lock(lck, gtid);
1202 // try to get own chunk of iterations
1203 init = (pr->u.p.count)++;
1204 status = (init < (UT)pr->u.p.ub);
1205 __kmp_release_lock(lck, gtid);
1206 } else {
1207 status = 0; // no own chunks
1208 }
1209 if (!status) { // try to steal
1210 kmp_info_t **other_threads = team->t.t_threads;
1211 int while_limit = pr->u.p.parm3;
1212 int while_index = 0;
1213 T id = pr->u.p.static_steal_counter; // loop id
1214 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1215 __kmp_dispatch_num_buffers; // current loop index
1216 // note: victim thread can potentially execute another loop
1217 // TODO: algorithm of searching for a victim
1218 // should be cleaned up and measured
1219 while ((!status) && (while_limit != ++while_index)) {
1220 dispatch_private_info_template<T> *victim;
1221 T remaining;
1222 T victimIdx = pr->u.p.parm4;
1223 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1224 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1225 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1226 KMP_DEBUG_ASSERT(victim);
1227 while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1228 oldVictimIdx != victimIdx) {
1229 victimIdx = (victimIdx + 1) % nproc;
1230 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1231 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1232 KMP_DEBUG_ASSERT(victim);
1233 }
1234 if (victim == pr || id != victim->u.p.static_steal_counter) {
1235 continue; // try once more (nproc attempts in total)
1236 // no victim is ready yet to participate in stealing
1237 // because no victim passed kmp_init_dispatch yet
1238 }
1239 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1240 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1241 continue; // not enough chunks to steal, goto next victim
1242 }
1243
1244 lck = victim->u.p.th_steal_lock;
1245 KMP_ASSERT(lck != NULL);
1246 __kmp_acquire_lock(lck, gtid);
1247 limit = victim->u.p.ub; // keep initial ub
1248 if (victim->u.p.count >= limit ||
1249 (remaining = limit - victim->u.p.count) < 2) {
1250 __kmp_release_lock(lck, gtid);
1251 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1252 continue; // not enough chunks to steal
1253 }
1254 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1255 // by 1
1256 if (remaining > 3) {
1257 // steal 1/4 of remaining
1258 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1259 init = (victim->u.p.ub -= (remaining >> 2));
1260 } else {
1261 // steal 1 chunk of 2 or 3 remaining
1262 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1263 init = (victim->u.p.ub -= 1);
1264 }
1265 __kmp_release_lock(lck, gtid);
1266
1267 KMP_DEBUG_ASSERT(init + 1 <= limit);
1268 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1269 status = 1;
1270 while_index = 0;
1271 // now update own count and ub with the stolen range, excluding the init chunk
1272 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1273 pr->u.p.count = init + 1;
1274 pr->u.p.ub = limit;
1275 __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1276 } // while (search for victim)
1277 } // if (try to find victim and steal)
1278 } else {
1279 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1280 typedef union {
1281 struct {
1282 UT count;
1283 T ub;
1284 } p;
1285 kmp_int64 b;
1286 } union_i4;
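      // Descriptive note: the owner advances p.count while a thief lowers
      // p.ub; packing both fields into one 64-bit word lets every update go
      // through a single CAS that can never observe a half-updated
      // (count, ub) pair.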
1287 // All operations on 'count' or 'ub' must be combined atomically
1288 // together.
1289 {
1290 union_i4 vold, vnew;
1291 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1292 vnew = vold;
1293 vnew.p.count++;
1294 while (!KMP_COMPARE_AND_STORE_ACQ64(
1295 (volatile kmp_int64 *)&pr->u.p.count,
1296 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1297 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1298 KMP_CPU_PAUSE();
1299 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1300 vnew = vold;
1301 vnew.p.count++;
1302 }
1303 vnew = vold;
1304 init = vnew.p.count;
1305 status = (init < (UT)vnew.p.ub);
1306 }
1307
1308 if (!status) {
1309 kmp_info_t **other_threads = team->t.t_threads;
1310 int while_limit = pr->u.p.parm3;
1311 int while_index = 0;
1312 T id = pr->u.p.static_steal_counter; // loop id
1313 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1314 __kmp_dispatch_num_buffers; // current loop index
1315 // note: victim thread can potentially execute another loop
1316 // TODO: algorithm of searching for a victim
1317 // should be cleaned up and measured
1318 while ((!status) && (while_limit != ++while_index)) {
1319 dispatch_private_info_template<T> *victim;
1320 union_i4 vold, vnew;
1321 kmp_int32 remaining;
1322 T victimIdx = pr->u.p.parm4;
1323 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1324 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1325 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1326 KMP_DEBUG_ASSERT(victim);
1327 while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1328 oldVictimIdx != victimIdx) {
1329 victimIdx = (victimIdx + 1) % nproc;
1330 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1331 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1332 KMP_DEBUG_ASSERT(victim);
1333 }
1334 if (victim == pr || id != victim->u.p.static_steal_counter) {
1335 continue; // try once more (nproc attempts in total)
1336 // no victim is ready yet to participate in stealing
1337 // because no victim passed kmp_init_dispatch yet
1338 }
1339 pr->u.p.parm4 = victimIdx; // new victim found
1340 while (1) { // CAS loop if victim has enough chunks to steal
1341 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1342 vnew = vold;
1343
1344 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1345 if (vnew.p.count >= (UT)vnew.p.ub ||
1346 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1347 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1348 break; // not enough chunks to steal, goto next victim
1349 }
1350 if (remaining > 3) {
1351 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1352 } else {
1353 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1354 }
1355 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356 // TODO: Should this be acquire or release?
1357 if (KMP_COMPARE_AND_STORE_ACQ64(
1358 (volatile kmp_int64 *)&victim->u.p.count,
1359 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1360 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1361 // stealing succeeded
1362 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1363 vold.p.ub - vnew.p.ub);
1364 status = 1;
1365 while_index = 0;
1366 // now update own count and ub
1367 init = vnew.p.ub;
1368 vold.p.count = init + 1;
1369#if KMP_ARCH_X86
1370 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1371#else
1372 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1373#endif
1374 break;
1375 } // if (check CAS result)
1376 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1377 } // while (try to steal from particular victim)
1378 } // while (search for victim)
1379 } // if (try to find victim and steal)
1380 } // if (4-byte induction variable)
1381 if (!status) {
1382 *p_lb = 0;
1383 *p_ub = 0;
1384 if (p_st != NULL)
1385 *p_st = 0;
1386 } else {
1387 start = pr->u.p.parm2;
1388 init *= chunk;
1389 limit = chunk + init - 1;
1390 incr = pr->u.p.st;
1391 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1392
1393 KMP_DEBUG_ASSERT(init <= trip);
1394 if ((last = (limit >= trip)) != 0)
1395 limit = trip;
1396 if (p_st != NULL)
1397 *p_st = incr;
1398
1399 if (incr == 1) {
1400 *p_lb = start + init;
1401 *p_ub = start + limit;
1402 } else {
1403 *p_lb = start + init * incr;
1404 *p_ub = start + limit * incr;
1405 }
1406
1407 if (pr->flags.ordered) {
1408 pr->u.p.ordered_lower = init;
1409 pr->u.p.ordered_upper = limit;
1410 } // if
1411 } // if
1412 break;
1413 } // case
1414#endif // ( KMP_STATIC_STEAL_ENABLED )
1415 case kmp_sch_static_balanced: {
1416 KD_TRACE(
1417 10,
1418 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1419 gtid));
1420 /* check if thread has any iteration to do */
1421 if ((status = !pr->u.p.count) != 0) {
1422 pr->u.p.count = 1;
1423 *p_lb = pr->u.p.lb;
1424 *p_ub = pr->u.p.ub;
1425 last = pr->u.p.parm1;
1426 if (p_st != NULL)
1427 *p_st = pr->u.p.st;
1428 } else { /* no iterations to do */
1429 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1430 }
1431 } // case
1432 break;
1433 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1434 merged here */
1435 case kmp_sch_static_chunked: {
1436 T parm1;
1437
1438 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1439 "kmp_sch_static_[affinity|chunked] case\n",
1440 gtid));
1441 parm1 = pr->u.p.parm1;
1442
1443 trip = pr->u.p.tc - 1;
1444 init = parm1 * (pr->u.p.count + tid);
1445
1446 if ((status = (init <= trip)) != 0) {
1447 start = pr->u.p.lb;
1448 incr = pr->u.p.st;
1449 limit = parm1 + init - 1;
1450
1451 if ((last = (limit >= trip)) != 0)
1452 limit = trip;
1453
1454 if (p_st != NULL)
1455 *p_st = incr;
1456
1457 pr->u.p.count += nproc;
1458
1459 if (incr == 1) {
1460 *p_lb = start + init;
1461 *p_ub = start + limit;
1462 } else {
1463 *p_lb = start + init * incr;
1464 *p_ub = start + limit * incr;
1465 }
1466
1467 if (pr->flags.ordered) {
1468 pr->u.p.ordered_lower = init;
1469 pr->u.p.ordered_upper = limit;
1470 } // if
1471 } // if
1472 } // case
1473 break;
1474
1475 case kmp_sch_dynamic_chunked: {
1476 T chunk = pr->u.p.parm1;
1477
1478 KD_TRACE(
1479 100,
1480 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1481 gtid));
1482
1483 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1484 trip = pr->u.p.tc - 1;
1485
1486 if ((status = (init <= trip)) == 0) {
1487 *p_lb = 0;
1488 *p_ub = 0;
1489 if (p_st != NULL)
1490 *p_st = 0;
1491 } else {
1492 start = pr->u.p.lb;
1493 limit = chunk + init - 1;
1494 incr = pr->u.p.st;
1495
1496 if ((last = (limit >= trip)) != 0)
1497 limit = trip;
1498
1499 if (p_st != NULL)
1500 *p_st = incr;
1501
1502 if (incr == 1) {
1503 *p_lb = start + init;
1504 *p_ub = start + limit;
1505 } else {
1506 *p_lb = start + init * incr;
1507 *p_ub = start + limit * incr;
1508 }
1509
1510 if (pr->flags.ordered) {
1511 pr->u.p.ordered_lower = init;
1512 pr->u.p.ordered_upper = limit;
1513 } // if
1514 } // if
1515 } // case
1516 break;
1517
1518 case kmp_sch_guided_iterative_chunked: {
1519 T chunkspec = pr->u.p.parm1;
1520 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1521 "iterative case\n",
1522 gtid));
1523 trip = pr->u.p.tc;
1524 // Start atomic part of calculations
1525 while (1) {
1526 ST remaining; // signed, because can be < 0
1527 init = sh->u.s.iteration; // shared value
1528 remaining = trip - init;
1529 if (remaining <= 0) { // AC: need to compare with 0 first
1530 // nothing to do, don't try atomic op
1531 status = 0;
1532 break;
1533 }
1534 if ((T)remaining <
1535 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1536 // use dynamic-style schedule
1537 // atomically increment iterations, get old value
1538 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1539 (ST)chunkspec);
1540 remaining = trip - init;
1541 if (remaining <= 0) {
1542 status = 0; // all iterations got by other threads
1543 } else {
1544 // got some iterations to work on
1545 status = 1;
1546 if ((T)remaining > chunkspec) {
1547 limit = init + chunkspec - 1;
1548 } else {
1549 last = 1; // the last chunk
1550 limit = init + remaining - 1;
1551 } // if
1552 } // if
1553 break;
1554 } // if
1555 limit = init +
1556 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1557 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1558 (ST)init, (ST)limit)) {
1559 // CAS was successful, chunk obtained
1560 status = 1;
1561 --limit;
1562 break;
1563 } // if
1564 } // while
1565 if (status != 0) {
1566 start = pr->u.p.lb;
1567 incr = pr->u.p.st;
1568 if (p_st != NULL)
1569 *p_st = incr;
1570 *p_lb = start + init * incr;
1571 *p_ub = start + limit * incr;
1572 if (pr->flags.ordered) {
1573 pr->u.p.ordered_lower = init;
1574 pr->u.p.ordered_upper = limit;
1575 } // if
1576 } else {
1577 *p_lb = 0;
1578 *p_ub = 0;
1579 if (p_st != NULL)
1580 *p_st = 0;
1581 } // if
1582 } // case
1583 break;
1584
1585 case kmp_sch_guided_simd: {
1586 // same as iterative but curr-chunk adjusted to be multiple of given
1587 // chunk
1588 T chunk = pr->u.p.parm1;
1589 KD_TRACE(100,
1590 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1591 gtid));
1592 trip = pr->u.p.tc;
1593 // Start atomic part of calculations
1594 while (1) {
1595 ST remaining; // signed, because can be < 0
1596 init = sh->u.s.iteration; // shared value
1597 remaining = trip - init;
1598 if (remaining <= 0) { // AC: need to compare with 0 first
1599 status = 0; // nothing to do, don't try atomic op
1600 break;
1601 }
1602 KMP_DEBUG_ASSERT(init % chunk == 0);
1603 // compare with K*nproc*(chunk+1), K=2 by default
1604 if ((T)remaining < pr->u.p.parm2) {
1605 // use dynamic-style schedule
1606 // atomically increment iterations, get old value
1607 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1608 (ST)chunk);
1609 remaining = trip - init;
1610 if (remaining <= 0) {
1611 status = 0; // all iterations got by other threads
1612 } else {
1613 // got some iterations to work on
1614 status = 1;
1615 if ((T)remaining > chunk) {
1616 limit = init + chunk - 1;
1617 } else {
1618 last = 1; // the last chunk
1619 limit = init + remaining - 1;
1620 } // if
1621 } // if
1622 break;
1623 } // if
1624 // divide by K*nproc
1625 UT span = remaining * (*(double *)&pr->u.p.parm3);
1626 UT rem = span % chunk;
1627 if (rem) // adjust so that span%chunk == 0
1628 span += chunk - rem;
1629 limit = init + span;
1630 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1631 (ST)init, (ST)limit)) {
1632 // CAS was successful, chunk obtained
1633 status = 1;
1634 --limit;
1635 break;
1636 } // if
1637 } // while
1638 if (status != 0) {
1639 start = pr->u.p.lb;
1640 incr = pr->u.p.st;
1641 if (p_st != NULL)
1642 *p_st = incr;
1643 *p_lb = start + init * incr;
1644 *p_ub = start + limit * incr;
1645 if (pr->flags.ordered) {
1646 pr->u.p.ordered_lower = init;
1647 pr->u.p.ordered_upper = limit;
1648 } // if
1649 } else {
1650 *p_lb = 0;
1651 *p_ub = 0;
1652 if (p_st != NULL)
1653 *p_st = 0;
1654 } // if
1655 } // case
1656 break;
1657
1658 case kmp_sch_guided_analytical_chunked: {
1659 T chunkspec = pr->u.p.parm1;
1660 UT chunkIdx;
1661#if KMP_USE_X87CONTROL
1662 /* for storing original FPCW value for Windows* OS on
1663 IA-32 architecture 8-byte version */
1664 unsigned int oldFpcw;
1665 unsigned int fpcwSet = 0;
1666#endif
1667 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1668 "kmp_sch_guided_analytical_chunked case\n",
1669 gtid));
1670
1671 trip = pr->u.p.tc;
1672
1673 KMP_DEBUG_ASSERT(nproc > 1);
1674 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1675
1676 while (1) { /* this while loop is a safeguard against unexpected zero
1677 chunk sizes */
1678 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1679 if (chunkIdx >= (UT)pr->u.p.parm2) {
1680 --trip;
1681 /* use dynamic-style scheduling */
1682 init = chunkIdx * chunkspec + pr->u.p.count;
1683 /* need to verify init > 0 in case of overflow in the above
1684 * calculation */
1685 if ((status = (init > 0 && init <= trip)) != 0) {
1686 limit = init + chunkspec - 1;
1687
1688 if ((last = (limit >= trip)) != 0)
1689 limit = trip;
1690 }
1691 break;
1692 } else {
1693/* use exponential-style scheduling */
1694/* The following check is to work around the lack of long double precision on
1695 Windows* OS.
1696 This check works around the possible effect that init != 0 for chunkIdx == 0.
1697 */
1698#if KMP_USE_X87CONTROL
1699 /* If we haven't already done so, save original
1700 FPCW and set precision to 64-bit, as Windows* OS
1701 on IA-32 architecture defaults to 53-bit */
1702 if (!fpcwSet) {
1703 oldFpcw = _control87(0, 0);
1704 _control87(_PC_64, _MCW_PC);
1705 fpcwSet = 0x30000;
1706 }
1707#endif
1708 if (chunkIdx) {
1709 init = __kmp_dispatch_guided_remaining<T>(
1710 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1711 KMP_DEBUG_ASSERT(init);
1712 init = trip - init;
1713 } else
1714 init = 0;
1715 limit = trip - __kmp_dispatch_guided_remaining<T>(
1716 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1717 KMP_ASSERT(init <= limit);
1718 if (init < limit) {
1719 KMP_DEBUG_ASSERT(limit <= trip);
1720 --limit;
1721 status = 1;
1722 break;
1723 } // if
1724 } // if
1725 } // while (1)
1726#if KMP_USE_X87CONTROL
1727 /* restore FPCW if necessary
1728 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1729 */
1730 if (fpcwSet && (oldFpcw & fpcwSet))
1731 _control87(oldFpcw, _MCW_PC);
1732#endif
1733 if (status != 0) {
1734 start = pr->u.p.lb;
1735 incr = pr->u.p.st;
1736 if (p_st != NULL)
1737 *p_st = incr;
1738 *p_lb = start + init * incr;
1739 *p_ub = start + limit * incr;
1740 if (pr->flags.ordered) {
1741 pr->u.p.ordered_lower = init;
1742 pr->u.p.ordered_upper = limit;
1743 }
1744 } else {
1745 *p_lb = 0;
1746 *p_ub = 0;
1747 if (p_st != NULL)
1748 *p_st = 0;
1749 }
1750 } // case
1751 break;
1752
1753 case kmp_sch_trapezoidal: {
1754 UT index;
1755 T parm2 = pr->u.p.parm2;
1756 T parm3 = pr->u.p.parm3;
1757 T parm4 = pr->u.p.parm4;
1758 KD_TRACE(100,
1759 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1760 gtid));
1761
1762 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1763
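 // Trapezoidal self-scheduling hands out chunks whose sizes decrease linearly:
 // chunk k has size parm2 - k * parm4 (as set up for kmp_sch_trapezoidal in
 // __kmp_dispatch_init_algorithm: parm2 is the first, largest chunk size,
 // parm4 the per-chunk decrement, parm3 the total number of chunks). The first
 // iteration of chunk `index` is therefore the partial sum of the preceding
 // chunk sizes:
 //   init = index * parm2 - parm4 * (0 + 1 + ... + (index - 1))
 //        = (index * (2 * parm2 - (index - 1) * parm4)) / 2
 // and `limit` below is the analogous sum over index + 1 chunks, minus one.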
1764 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1765 trip = pr->u.p.tc - 1;
1766
1767 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1768 *p_lb = 0;
1769 *p_ub = 0;
1770 if (p_st != NULL)
1771 *p_st = 0;
1772 } else {
1773 start = pr->u.p.lb;
1774 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1775 incr = pr->u.p.st;
1776
1777 if ((last = (limit >= trip)) != 0)
1778 limit = trip;
1779
1780 if (p_st != NULL)
1781 *p_st = incr;
1782
1783 if (incr == 1) {
1784 *p_lb = start + init;
1785 *p_ub = start + limit;
1786 } else {
1787 *p_lb = start + init * incr;
1788 *p_ub = start + limit * incr;
1789 }
1790
1791 if (pr->flags.ordered) {
1792 pr->u.p.ordered_lower = init;
1793 pr->u.p.ordered_upper = limit;
1794 } // if
1795 } // if
1796 } // case
1797 break;
1798 default: {
1799 status = 0; // to avoid complaints about uninitialized variable use
1800 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1801 KMP_HNT(GetNewerLibrary), // Hint
1802 __kmp_msg_null // Variadic argument list terminator
1803 );
1804 } break;
1805 } // switch
1806 if (p_last)
1807 *p_last = last;
1808#ifdef KMP_DEBUG
1809 if (pr->flags.ordered) {
1810 char *buff;
1811 // create format specifiers before the debug output
1812 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1813 "ordered_lower:%%%s ordered_upper:%%%s\n",
1814 traits_t<UT>::spec, traits_t<UT>::spec);
1815 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1816 __kmp_str_free(&buff);
1817 }
1818 {
1819 char *buff;
1820 // create format specifiers before the debug output
1821 buff = __kmp_str_format(
1822 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1823 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1824 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1825 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1826 __kmp_str_free(&buff);
1827 }
1828#endif
1829 return status;
1830}
1831
1832/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1833 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1834 is not called. */
1835#if OMPT_SUPPORT && OMPT_OPTIONAL
1836#define OMPT_LOOP_END \
1837 if (status == 0) { \
1838 if (ompt_enabled.ompt_callback_work) { \
1839 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1840 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1841 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1842 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1843 &(task_info->task_data), 0, codeptr); \
1844 } \
1845 }
1846// TODO: implement count
1847#else
1848#define OMPT_LOOP_END // no-op
1849#endif
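/* Illustrative only: a minimal tool-side receiver for the work callback that
   OMPT_LOOP_END dispatches above. The signature mirrors the arguments passed
   in the macro (work type, scope endpoint, parallel data, task data, count,
   code pointer); registering it via ompt_set_callback from the tool's
   ompt_initialize is assumed and not shown here.

     static int loops_finished = 0; // tool-side bookkeeping (example only)

     static void on_ompt_work(ompt_work_t wstype, ompt_scope_endpoint_t endpoint,
                              ompt_data_t *parallel_data, ompt_data_t *task_data,
                              uint64_t count, const void *codeptr_ra) {
       // The runtime passes count == 0 at loop end (see the TODO above).
       if (wstype == ompt_work_loop && endpoint == ompt_scope_end)
         ++loops_finished;
     }
*/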
1850
1851#if KMP_STATS_ENABLED
1852#define KMP_STATS_LOOP_END \
1853 { \
1854 kmp_int64 u, l, t, i; \
1855 l = (kmp_int64)(*p_lb); \
1856 u = (kmp_int64)(*p_ub); \
1857 i = (kmp_int64)(pr->u.p.st); \
1858 if (status == 0) { \
1859 t = 0; \
1860 KMP_POP_PARTITIONED_TIMER(); \
1861 } else if (i == 1) { \
1862 if (u >= l) \
1863 t = u - l + 1; \
1864 else \
1865 t = 0; \
1866 } else if (i < 0) { \
1867 if (l >= u) \
1868 t = (l - u) / (-i) + 1; \
1869 else \
1870 t = 0; \
1871 } else { \
1872 if (u >= l) \
1873 t = (u - l) / i + 1; \
1874 else \
1875 t = 0; \
1876 } \
1877 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1878 }
1879#else
1880#define KMP_STATS_LOOP_END /* Nothing */
1881#endif
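/* The macro above recovers the number of iterations in the chunk just handed
   out from its inclusive bounds and the stride. Illustrative values:
     l = 0,  u = 9, i = 2   ->  t = (9 - 0) / 2 + 1  = 5 iterations
     l = 10, u = 1, i = -3  ->  t = (10 - 1) / 3 + 1 = 4 iterations
     status == 0 (no chunk) ->  t = 0
*/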
1882
1883template <typename T>
1884static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1885 T *p_lb, T *p_ub,
1886 typename traits_t<T>::signed_t *p_st
1887#if OMPT_SUPPORT && OMPT_OPTIONAL
1888 ,
1889 void *codeptr
1890#endif
1891 ) {
1892
1893 typedef typename traits_t<T>::unsigned_t UT;
1894 typedef typename traits_t<T>::signed_t ST;
1895 // This is potentially slightly misleading: schedule(runtime) will appear here
1896 // even if the actual runtime schedule is static. (Which points out a
1897 // disadvantage of schedule(runtime): even when static scheduling is used, it
1898 // costs more than a compile-time choice of static scheduling would.)
1899 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1900
1901 int status;
1902 dispatch_private_info_template<T> *pr;
1903 kmp_info_t *th = __kmp_threads[gtid];
1904 kmp_team_t *team = th->th.th_team;
1905
1906 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1907 KD_TRACE(
1908 1000,
1909 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1910 gtid, p_lb, p_ub, p_st, p_last));
1911
1912 if (team->t.t_serialized) {
1913 /* NOTE: serialize this dispatch because we are not at the active level */
1914 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1915 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1916 KMP_DEBUG_ASSERT(pr);
1917
1918 if ((status = (pr->u.p.tc != 0)) == 0) {
1919 *p_lb = 0;
1920 *p_ub = 0;
1921 // if ( p_last != NULL )
1922 // *p_last = 0;
1923 if (p_st != NULL)
1924 *p_st = 0;
1925 if (__kmp_env_consistency_check) {
1926 if (pr->pushed_ws != ct_none) {
1927 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1928 }
1929 }
1930 } else if (pr->flags.nomerge) {
1931 kmp_int32 last;
1932 T start;
1933 UT limit, trip, init;
1934 ST incr;
1935 T chunk = pr->u.p.parm1;
1936
1937 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1938 gtid));
1939
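      // Serialized nomerge path: chunks are handed out strictly in order.
      // pr->u.p.count counts the chunks already dispatched from this buffer,
      // so init is the first iteration index of the next chunk and trip is
      // the last valid iteration index.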
1940 init = chunk * pr->u.p.count++;
1941 trip = pr->u.p.tc - 1;
1942
1943 if ((status = (init <= trip)) == 0) {
1944 *p_lb = 0;
1945 *p_ub = 0;
1946 // if ( p_last != NULL )
1947 // *p_last = 0;
1948 if (p_st != NULL)
1949 *p_st = 0;
1950 if (__kmp_env_consistency_check) {
1951 if (pr->pushed_ws != ct_none) {
1952 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1953 }
1954 }
1955 } else {
1956 start = pr->u.p.lb;
1957 limit = chunk + init - 1;
1958 incr = pr->u.p.st;
1959
1960 if ((last = (limit >= trip)) != 0) {
1961 limit = trip;
1962#if KMP_OS_WINDOWS
1963 pr->u.p.last_upper = pr->u.p.ub;
1964#endif /* KMP_OS_WINDOWS */
1965 }
1966 if (p_last != NULL)
1967 *p_last = last;
1968 if (p_st != NULL)
1969 *p_st = incr;
1970 if (incr == 1) {
1971 *p_lb = start + init;
1972 *p_ub = start + limit;
1973 } else {
1974 *p_lb = start + init * incr;
1975 *p_ub = start + limit * incr;
1976 }
1977
1978 if (pr->flags.ordered) {
1979 pr->u.p.ordered_lower = init;
1980 pr->u.p.ordered_upper = limit;
1981#ifdef KMP_DEBUG
1982 {
1983 char *buff;
1984 // create format specifiers before the debug output
1985 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1986 "ordered_lower:%%%s ordered_upper:%%%s\n",
1987 traits_t<UT>::spec, traits_t<UT>::spec);
1988 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1989 pr->u.p.ordered_upper));
1990 __kmp_str_free(&buff);
1991 }
1992#endif
1993 } // if
1994 } // if
1995 } else {
1996 pr->u.p.tc = 0;
1997 *p_lb = pr->u.p.lb;
1998 *p_ub = pr->u.p.ub;
1999#if KMP_OS_WINDOWS
2000 pr->u.p.last_upper = *p_ub;
2001#endif /* KMP_OS_WINDOWS */
2002 if (p_last != NULL)
2003 *p_last = TRUE;
2004 if (p_st != NULL)
2005 *p_st = pr->u.p.st;
2006 } // if
2007#ifdef KMP_DEBUG
2008 {
2009 char *buff;
2010 // create format specifiers before the debug output
2011 buff = __kmp_str_format(
2012 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2013 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2014 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2015 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2016 __kmp_str_free(&buff);
2017 }
2018#endif
2019#if INCLUDE_SSC_MARKS
2020 SSC_MARK_DISPATCH_NEXT();
2021#endif
2022 OMPT_LOOP_END;
2023 KMP_STATS_LOOP_END;
2024 return status;
2025 } else {
2026 kmp_int32 last = 0;
2027 dispatch_shared_info_template<T> volatile *sh;
2028
2029 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2030 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2031
2032 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2033 th->th.th_dispatch->th_dispatch_pr_current);
2034 KMP_DEBUG_ASSERT(pr);
2035 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2036 th->th.th_dispatch->th_dispatch_sh_current);
2037 KMP_DEBUG_ASSERT(sh);
2038
2039#if KMP_USE_HIER_SCHED
2040 if (pr->flags.use_hier)
2041 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2042 else
2043#endif // KMP_USE_HIER_SCHED
2044 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2045 p_st, th->th.th_team_nproc,
2046 th->th.th_info.ds.ds_tid);
2047 // status == 0: no more iterations to execute
2048 if (status == 0) {
2049 UT num_done;
2050
2051 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2052#ifdef KMP_DEBUG
2053 {
2054 char *buff;
2055 // create format specifiers before the debug output
2056 buff = __kmp_str_format(
2057 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2058 traits_t<UT>::spec);
2059 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2060 __kmp_str_free(&buff);
2061 }
2062#endif
2063
2064#if KMP_USE_HIER_SCHED
2065 pr->flags.use_hier = FALSE;
2066#endif
2067 if ((ST)num_done == th->th.th_team_nproc - 1) {
2068#if (KMP_STATIC_STEAL_ENABLED)
2069 if (pr->schedule == kmp_sch_static_steal &&
2070 traits_t<T>::type_size > 4) {
2071 int i;
2072 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2073 __kmp_dispatch_num_buffers; // current loop index
2074 kmp_info_t **other_threads = team->t.t_threads;
2075 // loop complete, safe to destroy locks used for stealing
2076 for (i = 0; i < th->th.th_team_nproc; ++i) {
2077 dispatch_private_info_template<T> *buf =
2078 reinterpret_cast<dispatch_private_info_template<T> *>(
2079 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2080 kmp_lock_t *lck = buf->u.p.th_steal_lock;
2081 KMP_ASSERT(lck != NULL);
2082 __kmp_destroy_lock(lck);
2083 __kmp_free(lck);
2084 buf->u.p.th_steal_lock = NULL;
2085 }
2086 }
2087#endif
2088 /* NOTE: release this buffer to be reused */
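      // The shared dispatch buffers are reused round-robin across consecutive
      // worksharing loops. The last thread to finish clears the shared
      // counters and then advances buffer_index by __kmp_dispatch_num_buffers,
      // the value that a later loop mapping to this buffer waits for in
      // __kmp_dispatch_init before reusing it.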
2089
2090 KMP_MB(); /* Flush all pending memory write invalidates. */
2091
2092 sh->u.s.num_done = 0;
2093 sh->u.s.iteration = 0;
2094
2095 /* TODO replace with general release procedure? */
2096 if (pr->flags.ordered) {
2097 sh->u.s.ordered_iteration = 0;
2098 }
2099
2100 KMP_MB(); /* Flush all pending memory write invalidates. */
2101
2102 sh->buffer_index += __kmp_dispatch_num_buffers;
2103 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2104 gtid, sh->buffer_index));
2105
2106 KMP_MB(); /* Flush all pending memory write invalidates. */
2107
2108 } // if
2109 if (__kmp_env_consistency_check) {
2110 if (pr->pushed_ws != ct_none) {
2111 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2112 }
2113 }
2114
2115 th->th.th_dispatch->th_deo_fcn = NULL;
2116 th->th.th_dispatch->th_dxo_fcn = NULL;
2117 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2118 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2119 } // if (status == 0)
2120#if KMP_OS_WINDOWS
2121 else if (last) {
2122 pr->u.p.last_upper = pr->u.p.ub;
2123 }
2124#endif /* KMP_OS_WINDOWS */
2125 if (p_last != NULL && status != 0)
2126 *p_last = last;
2127 } // if
2128
2129#ifdef KMP_DEBUG
2130 {
2131 char *buff;
2132 // create format specifiers before the debug output
2133 buff = __kmp_str_format(
2134 "__kmp_dispatch_next: T#%%d normal case: "
2135 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2136 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2137 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2138 (p_last ? *p_last : 0), status));
2139 __kmp_str_free(&buff);
2140 }
2141#endif
2142#if INCLUDE_SSC_MARKS
2143 SSC_MARK_DISPATCH_NEXT();
2144#endif
2145 OMPT_LOOP_END;
2146 KMP_STATS_LOOP_END;
2147 return status;
2148}
2149
2150template <typename T>
2151static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2152 kmp_int32 *plastiter, T *plower, T *pupper,
2153 typename traits_t<T>::signed_t incr) {
2154 typedef typename traits_t<T>::unsigned_t UT;
2155 kmp_uint32 team_id;
2156 kmp_uint32 nteams;
2157 UT trip_count;
2158 kmp_team_t *team;
2159 kmp_info_t *th;
2160
2161 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2162 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2163#ifdef KMP_DEBUG
2164 typedef typename traits_t<T>::signed_t ST;
2165 {
2166 char *buff;
2167 // create format specifiers before the debug output
2168 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2169 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2170 traits_t<T>::spec, traits_t<T>::spec,
2171 traits_t<ST>::spec, traits_t<T>::spec);
2172 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2173 __kmp_str_free(&buff);
2174 }
2175#endif
2176
2177 if (__kmp_env_consistency_check) {
2178 if (incr == 0) {
2179 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2180 loc);
2181 }
2182 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2183 // The loop is illegal.
2184 // Some zero-trip loops are maintained by the compiler, e.g.:
2185 // for(i=10;i<0;++i) // lower >= upper - run-time check
2186 // for(i=0;i>10;--i) // lower <= upper - run-time check
2187 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2188 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2189 // The compiler does not check the following illegal loops:
2190 // for(i=0;i<10;i+=incr) // where incr<0
2191 // for(i=10;i>0;i-=incr) // where incr<0
2192 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2193 }
2194 }
2195 th = __kmp_threads[gtid];
2196 team = th->th.th_team;
2197 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2198 nteams = th->th.th_teams_size.nteams;
2199 team_id = team->t.t_master_tid;
2200 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2201
2202 // compute global trip count
2203 if (incr == 1) {
2204 trip_count = *pupper - *plower + 1;
2205 } else if (incr == -1) {
2206 trip_count = *plower - *pupper + 1;
2207 } else if (incr > 0) {
2208 // upper - lower can exceed the range of the signed type
2209 trip_count = (UT)(*pupper - *plower) / incr + 1;
2210 } else {
2211 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2212 }
2213
2214 if (trip_count <= nteams) {
2215 KMP_DEBUG_ASSERT(
2216 __kmp_static == kmp_sch_static_greedy ||
2217 __kmp_static ==
2218 kmp_sch_static_balanced); // Unknown static scheduling type.
2219 // only some teams get single iteration, others get nothing
2220 if (team_id < trip_count) {
2221 *pupper = *plower = *plower + team_id * incr;
2222 } else {
2223 *plower = *pupper + incr; // zero-trip loop
2224 }
2225 if (plastiter != NULL)
2226 *plastiter = (team_id == trip_count - 1);
2227 } else {
2228 if (__kmp_static == kmp_sch_static_balanced) {
2229 UT chunk = trip_count / nteams;
2230 UT extras = trip_count % nteams;
2231 *plower +=
2232 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2233 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2234 if (plastiter != NULL)
2235 *plastiter = (team_id == nteams - 1);
2236 } else {
2237 T chunk_inc_count =
2238 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2239 T upper = *pupper;
2240 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2241 // Unknown static scheduling type.
2242 *plower += team_id * chunk_inc_count;
2243 *pupper = *plower + chunk_inc_count - incr;
2244 // Check/correct bounds if needed
2245 if (incr > 0) {
2246 if (*pupper < *plower)
2247 *pupper = traits_t<T>::max_value;
2248 if (plastiter != NULL)
2249 *plastiter = *plower <= upper && *pupper > upper - incr;
2250 if (*pupper > upper)
2251 *pupper = upper; // tracker C73258
2252 } else {
2253 if (*pupper > *plower)
2254 *pupper = traits_t<T>::min_value;
2255 if (plastiter != NULL)
2256 *plastiter = *plower >= upper && *pupper < upper - incr;
2257 if (*pupper < upper)
2258 *pupper = upper; // tracker C73258
2259 }
2260 }
2261 }
2262}
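/* Worked example for the balanced split above (numbers are illustrative only):
   trip_count = 10, nteams = 4, incr = 1, *plower = 0 initially
     chunk = 10 / 4 = 2, extras = 10 % 4 = 2
     team 0: [0, 2]   team 1: [3, 5]   team 2: [6, 7]   team 3: [8, 9]
   The first `extras` teams get chunk + 1 iterations, the rest get chunk, and
   only the last team sets *plastiter. */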
2263
2264//-----------------------------------------------------------------------------
2265// Dispatch routines
2266// Transfer call to template< type T >
2267// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2268// T lb, T ub, ST st, ST chunk )
2269extern "C" {
2270
2287void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2288 enum sched_type schedule, kmp_int32 lb,
2289 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2290 KMP_DEBUG_ASSERT(__kmp_init_serial);
2291#if OMPT_SUPPORT && OMPT_OPTIONAL
2292 OMPT_STORE_RETURN_ADDRESS(gtid);
2293#endif
2294 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2295}
2299void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2300 enum sched_type schedule, kmp_uint32 lb,
2301 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2302 KMP_DEBUG_ASSERT(__kmp_init_serial);
2303#if OMPT_SUPPORT && OMPT_OPTIONAL
2304 OMPT_STORE_RETURN_ADDRESS(gtid);
2305#endif
2306 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2307}
2308
2312void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2313 enum sched_type schedule, kmp_int64 lb,
2314 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2315 KMP_DEBUG_ASSERT(__kmp_init_serial);
2316#if OMPT_SUPPORT && OMPT_OPTIONAL
2317 OMPT_STORE_RETURN_ADDRESS(gtid);
2318#endif
2319 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2320}
2321
2325void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2326 enum sched_type schedule, kmp_uint64 lb,
2327 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2328 KMP_DEBUG_ASSERT(__kmp_init_serial);
2329#if OMPT_SUPPORT && OMPT_OPTIONAL
2330 OMPT_STORE_RETURN_ADDRESS(gtid);
2331#endif
2332 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2333}
2334
2344void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2345 enum sched_type schedule, kmp_int32 *p_last,
2346 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2347 kmp_int32 chunk) {
2348 KMP_DEBUG_ASSERT(__kmp_init_serial);
2349#if OMPT_SUPPORT && OMPT_OPTIONAL
2350 OMPT_STORE_RETURN_ADDRESS(gtid);
2351#endif
2352 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2353 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2354}
2355
2356void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2357 enum sched_type schedule, kmp_int32 *p_last,
2358 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2359 kmp_int32 chunk) {
2360 KMP_DEBUG_ASSERT(__kmp_init_serial);
2361#if OMPT_SUPPORT && OMPT_OPTIONAL
2362 OMPT_STORE_RETURN_ADDRESS(gtid);
2363#endif
2364 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2365 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2366}
2367
2368void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2369 enum sched_type schedule, kmp_int32 *p_last,
2370 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2371 kmp_int64 chunk) {
2372 KMP_DEBUG_ASSERT(__kmp_init_serial);
2373#if OMPT_SUPPORT && OMPT_OPTIONAL
2374 OMPT_STORE_RETURN_ADDRESS(gtid);
2375#endif
2376 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2377 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2378}
2379
2380void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2381 enum sched_type schedule, kmp_int32 *p_last,
2382 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2383 kmp_int64 chunk) {
2384 KMP_DEBUG_ASSERT(__kmp_init_serial);
2385#if OMPT_SUPPORT && OMPT_OPTIONAL
2386 OMPT_STORE_RETURN_ADDRESS(gtid);
2387#endif
2388 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2389 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2390}
2391
2405int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2406 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2407#if OMPT_SUPPORT && OMPT_OPTIONAL
2408 OMPT_STORE_RETURN_ADDRESS(gtid);
2409#endif
2410 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2411#if OMPT_SUPPORT && OMPT_OPTIONAL
2412 ,
2413 OMPT_LOAD_RETURN_ADDRESS(gtid)
2414#endif
2415 );
2416}
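/* Sketch of the calling sequence a compiler typically generates for a
   dynamically scheduled loop over these entry points. Illustrative only:
   loc, n and body() are placeholders, and the schedule kind and chunk size
   are arbitrary choices.

     kmp_int32 lb = 0, ub = n - 1, st = 1, last;
     kmp_int32 gtid = __kmpc_global_thread_num(&loc);
     __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }
     // For ordered loops the compiler typically also emits a
     // __kmpc_dispatch_fini_4 call at the end of each chunk.
*/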
2417
2421int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2422 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2423 kmp_int32 *p_st) {
2424#if OMPT_SUPPORT && OMPT_OPTIONAL
2425 OMPT_STORE_RETURN_ADDRESS(gtid);
2426#endif
2427 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2428#if OMPT_SUPPORT && OMPT_OPTIONAL
2429 ,
2430 OMPT_LOAD_RETURN_ADDRESS(gtid)
2431#endif
2432 );
2433}
2434
2438int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2440#if OMPT_SUPPORT && OMPT_OPTIONAL
2441 OMPT_STORE_RETURN_ADDRESS(gtid);
2442#endif
2443 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2444#if OMPT_SUPPORT && OMPT_OPTIONAL
2445 ,
2446 OMPT_LOAD_RETURN_ADDRESS(gtid)
2447#endif
2448 );
2449}
2450
2454int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2455 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2456 kmp_int64 *p_st) {
2457#if OMPT_SUPPORT && OMPT_OPTIONAL
2458 OMPT_STORE_RETURN_ADDRESS(gtid);
2459#endif
2460 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2461#if OMPT_SUPPORT && OMPT_OPTIONAL
2462 ,
2463 OMPT_LOAD_RETURN_ADDRESS(gtid)
2464#endif
2465 );
2466}
2467
2474void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2475 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2476}
2477
2481void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2482 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2483}
2484
2488void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2489 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2490}
2491
2495void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2496 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2497}
2500//-----------------------------------------------------------------------------
2501// Non-template routines from kmp_dispatch.cpp used in other sources
2502
2503kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2504 return value == checker;
2505}
2506
2507kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2508 return value != checker;
2509}
2510
2511kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2512 return value < checker;
2513}
2514
2515kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2516 return value >= checker;
2517}
2518
2519kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2520 return value <= checker;
2521}
2522
2523kmp_uint32
2524__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2525 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2526 void *obj // Higher-level synchronization object, or NULL.
2527 ) {
2528 // note: we may not belong to a team at this point
2529 volatile kmp_uint32 *spin = spinner;
2530 kmp_uint32 check = checker;
2531 kmp_uint32 spins;
2532 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2533 kmp_uint32 r;
2534
2535 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2536 KMP_INIT_YIELD(spins);
2537 // main wait spin loop
2538 while (!f(r = TCR_4(*spin), check)) {
2539 KMP_FSYNC_SPIN_PREPARE(obj);
2540 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2541 split. It causes problems with infinite recursion because of exit lock */
2542 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543 __kmp_abort_thread(); */
2544 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2545 }
2546 KMP_FSYNC_SPIN_ACQUIRED(obj);
2547 return r;
2548}
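/* Typical use (illustrative): spin until a shared flag reaches a value, using
   one of the small predicates defined above.

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);
     // returns once __kmp_eq_4(flag, 1) holds; seen == 1
*/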
2549
2550void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2551 kmp_uint32 (*pred)(void *, kmp_uint32),
2552 void *obj // Higher-level synchronization object, or NULL.
2553 ) {
2554 // note: we may not belong to a team at this point
2555 void *spin = spinner;
2556 kmp_uint32 check = checker;
2557 kmp_uint32 spins;
2558 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2559
2560 KMP_FSYNC_SPIN_INIT(obj, spin);
2561 KMP_INIT_YIELD(spins);
2562 // main wait spin loop
2563 while (!f(spin, check)) {
2564 KMP_FSYNC_SPIN_PREPARE(obj);
2566 /* if we have waited a bit, or are oversubscribed, yield */
2566 /* pause is in the following code */
2567 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2568 }
2569 KMP_FSYNC_SPIN_ACQUIRED(obj);
2570}
2571
2572} // extern "C"
2573
2574#ifdef KMP_GOMP_COMPAT
2575
2576void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2577 enum sched_type schedule, kmp_int32 lb,
2578 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2579 int push_ws) {
2580 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2581 push_ws);
2582}
2583
2584void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2585 enum sched_type schedule, kmp_uint32 lb,
2586 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2587 int push_ws) {
2588 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2589 push_ws);
2590}
2591
2592void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2593 enum sched_type schedule, kmp_int64 lb,
2594 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2595 int push_ws) {
2596 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2597 push_ws);
2598}
2599
2600void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601 enum sched_type schedule, kmp_uint64 lb,
2602 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2603 int push_ws) {
2604 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2605 push_ws);
2606}
2607
2608void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2609 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2610}
2611
2612void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2613 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2614}
2615
2616void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2617 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2618}
2619
2620void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2621 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2622}
2623
2624#endif /* KMP_GOMP_COMPAT */
2625
2626/* ------------------------------------------------------------------------ */