Halide
synchronization_common.h
Go to the documentation of this file.
1 #include "HalideRuntime.h"
2 #include "printer.h"
3 #include "scoped_spin_lock.h"
4 
5 /* This provides an implementation of pthreads-like mutex and
6  * condition variables with fast default case performance. The code
7  * is based on the "parking lot" design and specifically Amanieu
8  * d'Antras' Rust implementation:
9  * https://github.com/Amanieu/parking_lot
10  * and the one in WTF:
11  * https://webkit.org/blog/6161/locking-in-webkit/
12  *
13  * Neither of the above implementations were used directly largely for
14  * dependency management. This implementation lacks a few features
15  * relative to those two. Specifically timeouts are not
16  * supported. Nor is optional fairness or deadlock detection.
17  * This implementation should provide a fairly standalone "one file"
18  * fast synchronization layer on top of readily available system primitives.
19  *
20  * TODO: Implement pthread_once equivalent.
21  * TODO: Add read/write lock and move SharedExclusiveSpinLock from tracing.cpp
22  * to this mechanism.
23  * TODO: Add timeouts and optional fairness if needed.
24  * TODO: Relying on condition variables has issues for old versions of Windows
25  * and likely has portability issues to some very bare bones embedded OSes.
26  * Doing an implementation using only semaphores or event counters should
27  * be doable.
28  */
29 
30 #include "runtime_atomics.h"
31 
32 // Copied from tsan_interface.h
33 #ifndef TSAN_ANNOTATIONS
34 #define TSAN_ANNOTATIONS 0
35 #endif
36 
37 #if TSAN_ANNOTATIONS
38 extern "C" {
39 const unsigned __tsan_mutex_linker_init = 1 << 0;
40 void __tsan_mutex_pre_lock(void *addr, unsigned flags);
41 void __tsan_mutex_post_lock(void *addr, unsigned flags, int recursion);
42 int __tsan_mutex_pre_unlock(void *addr, unsigned flags);
43 void __tsan_mutex_post_unlock(void *addr, unsigned flags);
44 void __tsan_mutex_pre_signal(void *addr, unsigned flags);
45 void __tsan_mutex_post_signal(void *addr, unsigned flags);
46 }
47 #endif
48 
49 namespace Halide {
50 namespace Runtime {
51 namespace Internal {
52 
53 namespace Synchronization {
54 
55 namespace {
56 
57 #if TSAN_ANNOTATIONS
58 ALWAYS_INLINE void if_tsan_pre_lock(void *mutex) {
59  __tsan_mutex_pre_lock(mutex, __tsan_mutex_linker_init);
60 };
61 // TODO(zalman|dvyukov): Is 1 the right value for a non-recursive lock? pretty sure value is ignored.
62 ALWAYS_INLINE void if_tsan_post_lock(void *mutex) {
63  __tsan_mutex_post_lock(mutex, __tsan_mutex_linker_init, 1);
64 }
65 // TODO(zalman|dvyukov): Is it safe to ignore return value here if locks are not recursive?
66 ALWAYS_INLINE void if_tsan_pre_unlock(void *mutex) {
67  (void)__tsan_mutex_pre_unlock(mutex, __tsan_mutex_linker_init);
68 }
69 ALWAYS_INLINE void if_tsan_post_unlock(void *mutex) {
70  __tsan_mutex_post_unlock(mutex, __tsan_mutex_linker_init);
71 }
72 ALWAYS_INLINE void if_tsan_pre_signal(void *cond) {
73  __tsan_mutex_pre_signal(cond, 0);
74 }
75 ALWAYS_INLINE void if_tsan_post_signal(void *cond) {
76  __tsan_mutex_post_signal(cond, 0);
77 }
78 #else
79 ALWAYS_INLINE void if_tsan_pre_lock(void *) {
80 }
81 ALWAYS_INLINE void if_tsan_post_lock(void *) {
82 }
83 ALWAYS_INLINE void if_tsan_pre_unlock(void *) {
84 }
85 ALWAYS_INLINE void if_tsan_post_unlock(void *) {
86 }
87 ALWAYS_INLINE void if_tsan_pre_signal(void *) {
88 }
89 ALWAYS_INLINE void if_tsan_post_signal(void *) {
90 }
91 #endif
92 
93 } // namespace
94 
95 class spin_control {
96  // Everyone says this should be 40. Have not measured it.
97  int spin_count = 40;
98 
99 public:
101  if (spin_count > 0) {
102  spin_count--;
103  }
104  return spin_count > 0;
105  }
106 
108  spin_count = 40;
109  }
110 };
111 
112 // Low order two bits are used for locking state,
113 static constexpr uint8_t lock_bit = 0x01;
114 static constexpr uint8_t queue_lock_bit = 0x02;
115 static constexpr uint8_t parked_bit = 0x02;
116 
118  thread_parker parker; // TODO: member or pointer?
119 
120  // This design is from the Rust parking lot implementation by Amanieu d'Antras.
121  // Comment from original:
122  //
123  // Linked list of threads in the queue. The queue is split into two parts:
124  // the processed part and the unprocessed part. When new nodes are added to
125  // the list, they only have the next pointer set, and queue_tail is null.
126  //
127  // Nodes are processed with the queue lock held, which consists of setting
128  // the prev pointer for each node and setting the queue_tail pointer on the
129  // first processed node of the list.
130  //
131  // This setup allows nodes to be added to the queue without a lock, while
132  // still allowing O(1) removal of nodes from the processed part of the list.
133  // The only cost is the O(n) processing, but this only needs to be done
134  // once for each node, and therefore isn't too expensive.
135 
139 };
140 
141 class word_lock {
142  uintptr_t state = 0;
143 
144  void lock_full();
145  void unlock_full();
146 
147 public:
149  if_tsan_pre_lock(this);
150 
151  uintptr_t expected = 0;
152  uintptr_t desired = lock_bit;
153  // Try for a fast grab of the lock bit. If this does not work, call the full adaptive looping code.
154  if (!atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
155  lock_full();
156  }
157 
158  if_tsan_post_lock(this);
159  }
160 
162  if_tsan_pre_unlock(this);
163 
164  uintptr_t val = atomic_fetch_and_release(&state, ~(uintptr_t)lock_bit);
165  // If another thread is currently queueing, that thread will ensure
166  // it acquires the lock or wakes a waiting thread.
167  bool no_thread_queuing = (val & queue_lock_bit) == 0;
168  // Only need to do a wakeup if there are threads waiting.
169  bool some_queued = (val & ~(uintptr_t)(queue_lock_bit | lock_bit)) != 0;
170  if (no_thread_queuing && some_queued) {
171  unlock_full();
172  }
173 
174  if_tsan_post_unlock(this);
175  }
176 };
177 
178 WEAK void word_lock::lock_full() {
179  spin_control spinner;
180  uintptr_t expected;
181  atomic_load_relaxed(&state, &expected);
182 
183  while (true) {
184  if (!(expected & lock_bit)) {
185  uintptr_t desired = expected | lock_bit;
186 
187  if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
188  return;
189  }
190  continue;
191  }
192 
193  if (((expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) != 0) && spinner.should_spin()) {
195  atomic_load_relaxed(&state, &expected);
196  continue;
197  }
198 
199  word_lock_queue_data node;
200 
201  node.parker.prepare_park();
202  // TODO set up prelinkage parking state
203 
204  word_lock_queue_data *head = (word_lock_queue_data *)(expected & ~(uintptr_t)(queue_lock_bit | lock_bit));
205  if (head == nullptr) {
206  node.tail = &node;
207  // constructor set node.prev = nullptr;
208  } else {
209  // Mark the tail as nullptr. The unlock routine will walk the list and wakeup
210  // the thread at the end.
211  // constructor set node.tail = nullptr;
212  // constructor set node.prev = nullptr;
213  node.next = head;
214  }
215 
216  uintptr_t desired = ((uintptr_t)&node) | (expected & (queue_lock_bit | lock_bit));
217  if (atomic_cas_weak_release_relaxed(&state, &expected, &desired)) {
218  node.parker.park();
219  spinner.reset();
220  atomic_load_relaxed(&state, &expected);
221  }
222  }
223 }
224 
225 WEAK void word_lock::unlock_full() {
226  uintptr_t expected;
227  atomic_load_relaxed(&state, &expected);
228 
229  while (true) {
230  // If another thread is currently queueing, that thread will ensure
231  // it acquires the lock or wakes a waiting thread.
232  bool thread_queuing = (expected & queue_lock_bit);
233  // Only need to do a wakeup if there are threads waiting.
234  bool none_queued = (expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) == 0;
235  if (thread_queuing || none_queued) {
236  return;
237  }
238 
239  uintptr_t desired = expected | queue_lock_bit;
240  if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
241  break;
242  }
243  }
244 
245  while (true) {
246  word_lock_queue_data *head = (word_lock_queue_data *)(expected & ~(uintptr_t)(queue_lock_bit | lock_bit));
247  word_lock_queue_data *current = head;
248  word_lock_queue_data *tail = current->tail;
249  while (tail == nullptr) {
250  word_lock_queue_data *next = current->next;
251  halide_abort_if_false(nullptr, next != nullptr);
252  next->prev = current;
253  current = next;
254  tail = current->tail;
255  }
256  head->tail = tail;
257 
258  // If the lock is now locked, unlock the queue and have the thread
259  // that currently holds the lock do the wakeup
260  if (expected & lock_bit) {
261  uintptr_t desired = expected & ~(uintptr_t)queue_lock_bit;
262  if (atomic_cas_weak_relacq_relaxed(&state, &expected, &desired)) {
263  return;
264  }
265  atomic_thread_fence_acquire();
266  continue;
267  }
268 
269  word_lock_queue_data *new_tail = tail->prev;
270  if (new_tail == nullptr) {
271  bool continue_outer = false;
272  while (!continue_outer) {
273  uintptr_t desired = expected & lock_bit;
274  if (atomic_cas_weak_relacq_relaxed(&state, &expected, &desired)) {
275  break;
276  }
277  if ((expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) == 0) {
278  continue;
279  } else {
280  atomic_thread_fence_acquire();
281  continue_outer = true;
282  }
283  }
284 
285  if (continue_outer) {
286  continue;
287  }
288  } else {
289  head->tail = new_tail;
290  atomic_and_fetch_release(&state, ~(uintptr_t)queue_lock_bit);
291  }
292 
293  // TODO: The reason there are three calls here is other things can happen between them.
294  // Also it is not clear if unpark_start has to return the mutex/flag used by unpark
295  // and unpark_finish due to memory lifetime reasons.
296  tail->parker.unpark_start();
297  tail->parker.unpark();
298  tail->parker.unpark_finish();
299  break;
300  }
301 }
302 
303 struct queue_data {
304  thread_parker parker; // TODO: member or pointer?
305 
307  queue_data *next = nullptr;
309 };
310 
311 // Must be a power of two.
312 constexpr int LOAD_FACTOR = 4;
313 
314 struct hash_bucket {
316 
317  queue_data *head = nullptr; // Is this queue_data or thread_data?
318  queue_data *tail = nullptr; // Is this queue_data or thread_data?
319 };
320 
321 constexpr int HASH_TABLE_SIZE = MAX_THREADS * LOAD_FACTOR;
322 struct hash_table {
324 };
326 
327 constexpr int HASH_TABLE_BITS = 10;
328 static_assert((1 << HASH_TABLE_BITS) >= MAX_THREADS * LOAD_FACTOR);
329 
#if 0
// Debug helper (disabled): print every parked thread in the table.
WEAK void dump_hash() {
    int i = 0;
    for (auto &bucket : table.buckets) {
        queue_data *head = bucket.head;
        while (head != nullptr) {
            print(nullptr) << "Bucket index " << i << " addr " << (void *)head->sleep_address << "\n";
            head = head->next;
        }
        i++;
    }
}
#endif
343 
344 static ALWAYS_INLINE uintptr_t addr_hash(uintptr_t addr) {
345  // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
346  // in hexadecimal.
347  if (sizeof(uintptr_t) >= 8) {
348  return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - HASH_TABLE_BITS);
349  } else {
350  return (addr * (uintptr_t)0x9E3779B9) >> (32 - HASH_TABLE_BITS);
351  }
352 }
353 
355  uintptr_t hash = addr_hash(addr);
356 
357  halide_debug_assert(nullptr, hash < HASH_TABLE_SIZE);
358 
359  // TODO: if resizing is implemented, loop, etc.
360  hash_bucket &bucket = table.buckets[hash];
361 
362  bucket.mutex.lock();
363 
364  return bucket;
365 }
366 
367 struct bucket_pair {
370 
372  : from(from), to(to) {
373  }
374 };
375 
377  // TODO: if resizing is implemented, loop, etc.
378  uintptr_t hash_from = addr_hash(addr_from);
379  uintptr_t hash_to = addr_hash(addr_to);
380 
381  halide_debug_assert(nullptr, hash_from < HASH_TABLE_SIZE);
382  halide_debug_assert(nullptr, hash_to < HASH_TABLE_SIZE);
383 
384  // Lock the bucket with the smaller hash first in order
385  // to prevent deadlock.
386  if (hash_from == hash_to) {
387  hash_bucket &first = table.buckets[hash_from];
388  first.mutex.lock();
389  return bucket_pair(first, first);
390  } else if (hash_from < hash_to) {
391  hash_bucket &first = table.buckets[hash_from];
392  hash_bucket &second = table.buckets[hash_to];
393  first.mutex.lock();
394  second.mutex.lock();
395  return bucket_pair(first, second);
396  } else {
397  hash_bucket &first = table.buckets[hash_to];
398  hash_bucket &second = table.buckets[hash_from];
399  first.mutex.lock();
400  second.mutex.lock();
401  return bucket_pair(second, first);
402  }
403 }
404 
406  // In the lock routine, the buckets are locked smaller hash index first.
407  // Here we reverse this ordering by comparing the pointers. This works
408  // since the pointers are obtained by indexing an array with the hash
409  // values.
410  if (&buckets.from == &buckets.to) {
411  buckets.from.mutex.unlock();
412  } else if (&buckets.from > &buckets.to) {
413  buckets.from.mutex.unlock();
414  buckets.to.mutex.unlock();
415  } else {
416  buckets.to.mutex.unlock();
417  buckets.from.mutex.unlock();
418  }
419 }
420 
422  bool unpark_one = false;
424 };
425 
427  uintptr_t park(uintptr_t addr);
429  int unpark_requeue(uintptr_t addr_from, uintptr_t addr_to, uintptr_t unpark_info);
430 
431 protected:
432  virtual bool validate(validate_action &action) {
433  return true;
434  }
435  virtual void before_sleep() {
436  // nothing
437  }
438  virtual uintptr_t unpark(int unparked, bool more_waiters) {
439  return 0;
440  }
441  virtual void requeue_callback(const validate_action &action, bool one_to_wake, bool some_requeued) {
442  // nothing
443  }
444 };
445 
446 // TODO: Do we need a park_result thing here?
449 
450  hash_bucket &bucket = lock_bucket(addr);
451 
452  validate_action action;
453  if (!validate(action)) {
454  bucket.mutex.unlock();
455  return action.invalid_unpark_info;
456  }
457 
458  queue_data.next = nullptr;
459  queue_data.sleep_address = addr;
460  queue_data.parker.prepare_park();
461  if (bucket.head != nullptr) {
462  bucket.tail->next = &queue_data;
463  } else {
464  bucket.head = &queue_data;
465  }
466  bucket.tail = &queue_data;
467  bucket.mutex.unlock();
468 
469  before_sleep();
470 
471  queue_data.parker.park();
472 
473  return queue_data.unpark_info;
474 
475  // TODO: handling timeout.
476 }
477 
479  hash_bucket &bucket = lock_bucket(addr);
480 
481  queue_data **data_location = &bucket.head;
482  queue_data *prev = nullptr;
483  queue_data *data = *data_location;
484  while (data != nullptr) {
485  uintptr_t cur_addr;
486  atomic_load_relaxed(&data->sleep_address, &cur_addr);
487  if (cur_addr == addr) {
488  queue_data *next = data->next;
489  *data_location = next;
490 
491  bool more_waiters = false;
492 
493  if (bucket.tail == data) {
494  bucket.tail = prev;
495  } else {
496  queue_data *data2 = next;
497  while (data2 != nullptr && !more_waiters) {
498  uintptr_t cur_addr2;
499  atomic_load_relaxed(&data2->sleep_address, &cur_addr2);
500  more_waiters = (cur_addr2 == addr);
501  data2 = data2->next;
502  }
503  }
504 
505  data->unpark_info = unpark(1, more_waiters);
506 
507  data->parker.unpark_start();
508  bucket.mutex.unlock();
509  data->parker.unpark();
510  data->parker.unpark_finish();
511 
512  // TODO: Figure out ret type.
513  return more_waiters ? 1 : 0;
514  } else {
515  data_location = &data->next;
516  prev = data;
517  data = data->next;
518  }
519  }
520 
521  unpark(0, false);
522 
523  bucket.mutex.unlock();
524 
525  // TODO: decide if this is the right return value.
526  return 0;
527 }
528 
529 WEAK int parking_control::unpark_requeue(uintptr_t addr_from, uintptr_t addr_to, uintptr_t unpark_info) {
530  bucket_pair buckets = lock_bucket_pair(addr_from, addr_to);
531 
532  validate_action action;
533  if (!validate(action)) {
534  unlock_bucket_pair(buckets);
535  return 0;
536  }
537 
538  queue_data **data_location = &buckets.from.head;
539  queue_data *prev = nullptr;
540  queue_data *data = *data_location;
541  queue_data *requeue = nullptr;
542  queue_data *requeue_tail = nullptr;
543  queue_data *wakeup = nullptr;
544 
545  while (data != nullptr) {
546  uintptr_t cur_addr;
547  atomic_load_relaxed(&data->sleep_address, &cur_addr);
548 
549  queue_data *next = data->next;
550  if (cur_addr == addr_from) {
551  *data_location = next;
552 
553  if (buckets.from.tail == data) {
554  buckets.from.tail = prev;
555  }
556 
557  if (action.unpark_one && wakeup == nullptr) {
558  wakeup = data;
559  } else {
560  if (requeue == nullptr) {
561  requeue = data;
562  } else {
563  requeue_tail->next = data;
564  }
565 
566  requeue_tail = data;
567  atomic_store_relaxed(&data->sleep_address, &addr_to);
568  }
569  data = next;
570  // TODO: prev ptr?
571  } else {
572  data_location = &data->next;
573  prev = data;
574  data = next;
575  }
576  }
577 
578  if (requeue != nullptr) {
579  requeue_tail->next = nullptr;
580  if (buckets.to.head == nullptr) {
581  buckets.to.head = requeue;
582  } else {
583  buckets.to.tail->next = requeue;
584  }
585  buckets.to.tail = requeue_tail;
586  }
587 
588  requeue_callback(action, wakeup != nullptr, requeue != nullptr);
589 
590  if (wakeup != nullptr) {
591  wakeup->unpark_info = unpark_info;
592  wakeup->parker.unpark_start();
593  unlock_bucket_pair(buckets);
594  wakeup->parker.unpark();
595  wakeup->parker.unpark_finish();
596  } else {
597  unlock_bucket_pair(buckets);
598  }
599 
600  return wakeup != nullptr && action.unpark_one;
601 }
602 
603 struct mutex_parking_control final : public parking_control {
605 
608  }
609 
610 protected:
611  bool validate(validate_action &action) final {
612  uintptr_t result;
613  atomic_load_relaxed(lock_state, &result);
614  return result == (lock_bit | parked_bit);
615  }
616 
617  uintptr_t unpark(int unparked, bool more_waiters) final {
618  // TODO: consider handling fairness.
619  uintptr_t return_state = more_waiters ? parked_bit : 0;
620  atomic_store_release(lock_state, &return_state);
621  return 0;
622  }
623 };
624 
625 class fast_mutex {
626  uintptr_t state = 0;
627 
628  ALWAYS_INLINE void lock_full() {
629  // Everyone says this should be 40. Have not measured it.
630  spin_control spinner;
631  uintptr_t expected;
632  atomic_load_relaxed(&state, &expected);
633 
634  while (true) {
635  if (!(expected & lock_bit)) {
636  uintptr_t desired = expected | lock_bit;
637  if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
638  return;
639  }
640  continue;
641  }
642 
643  // Spin with spin count. Note that this occurs even if
644  // threads are parked. We're prioritizing throughput over
645  // fairness by letting sleeping threads lie.
646  if (spinner.should_spin()) {
648  atomic_load_relaxed(&state, &expected);
649  continue;
650  }
651 
652  // Mark mutex as having parked threads if not already done.
653  if ((expected & parked_bit) == 0) {
654  uintptr_t desired = expected | parked_bit;
655  if (!atomic_cas_weak_relaxed_relaxed(&state, &expected, &desired)) {
656  continue;
657  }
658  }
659 
660  // TODO: consider handling fairness, timeout
661  mutex_parking_control control(&state);
662  uintptr_t result = control.park((uintptr_t)this);
663  if (result == (uintptr_t)this) {
664  return;
665  }
666 
667  spinner.reset();
668  atomic_load_relaxed(&state, &expected);
669  }
670  }
671 
672  ALWAYS_INLINE void unlock_full() {
673  uintptr_t expected = lock_bit;
674  uintptr_t desired = 0;
675  // Try for a fast release of the lock. Redundant with code in lock, but done
676  // to make unlock_full a standalone unlock that can be called directly.
677  if (atomic_cas_strong_release_relaxed(&state, &expected, &desired)) {
678  return;
679  }
680 
681  mutex_parking_control control(&state);
682  control.unpark_one((uintptr_t)this);
683  }
684 
685 public:
687  uintptr_t expected = 0;
688  uintptr_t desired = lock_bit;
689  // Try for a fast grab of the lock bit. If this does not work, call the full adaptive looping code.
690  if (!atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
691  lock_full();
692  }
693  }
694 
696  uintptr_t expected = lock_bit;
697  uintptr_t desired = 0;
698  // Try for a fast grab of the lock bit. If this does not work, call the full adaptive looping code.
699  if (!atomic_cas_weak_release_relaxed(&state, &expected, &desired)) {
700  unlock_full();
701  }
702  }
703 
705  uintptr_t val;
706  atomic_load_relaxed(&state, &val);
707  while (true) {
708  if (!(val & lock_bit)) {
709  return false;
710  }
711 
712  uintptr_t desired = val | parked_bit;
713  if (atomic_cas_weak_relaxed_relaxed(&state, &val, &desired)) {
714  return true;
715  }
716  }
717  }
718 
720  atomic_or_fetch_relaxed(&state, parked_bit);
721  }
722 };
723 
727 
730  }
731 
732 protected:
733  uintptr_t unpark(int unparked, bool more_waiters) final {
734  if (!more_waiters) {
735  uintptr_t val = 0;
736  atomic_store_relaxed(cond_state, &val);
737  }
738 
739 #if 0 // TODO: figure out why this was here.
740  return (uintptr_t)mutex;
741 #else
742  return 0;
743 #endif
744  }
745 };
746 
750 
753  }
754 
755 protected:
756  bool validate(validate_action &action) final {
757  uintptr_t val;
758  atomic_load_relaxed(cond_state, &val);
759  // By the time this broadcast locked everything and was processed, the cond
760  // has progressed to a new mutex, do nothing since any waiting threads have
761  // to be waiting on what is effectively a different condition.
762  if (val != (uintptr_t)mutex) {
763  return false;
764  }
765  // Clear the cond's connection to the mutex as all waiting threads are going to reque onto the mutex.
766  val = 0;
767  atomic_store_relaxed(cond_state, &val);
768  action.unpark_one = !mutex->make_parked_if_locked();
769  return true;
770  }
771 
772  void requeue_callback(const validate_action &action, bool one_to_wake, bool some_requeued) final {
773  if (action.unpark_one && some_requeued) {
774  mutex->make_parked();
775  }
776  }
777 };
778 
779 struct wait_parking_control final : public parking_control {
782 
785  }
786 
787 protected:
788  bool validate(validate_action &action) final {
789  uintptr_t val;
790  atomic_load_relaxed(cond_state, &val);
791 
792  if (val == 0) {
793  val = (uintptr_t)mutex;
794  atomic_store_relaxed(cond_state, &val);
795  } else if (val != (uintptr_t)mutex) {
796  // TODO: signal error.
797  action.invalid_unpark_info = (uintptr_t)mutex;
798  return false;
799  }
800 
801  return true;
802  }
803 
804  void before_sleep() final {
805  mutex->unlock();
806  }
807 
808  uintptr_t unpark(int unparked, bool more_waiters) final {
809  if (!more_waiters) {
810  uintptr_t val = 0;
811  atomic_store_relaxed(cond_state, &val);
812  }
813  return 0;
814  }
815 };
816 
817 class fast_cond {
818  uintptr_t state = 0;
819 
820 public:
822  if_tsan_pre_signal(this);
823 
824  uintptr_t val;
825  atomic_load_relaxed(&state, &val);
826  if (val == 0) {
827  if_tsan_post_signal(this);
828  return;
829  }
830  signal_parking_control control(&state, (fast_mutex *)val);
831  control.unpark_one((uintptr_t)this);
832  if_tsan_post_signal(this);
833  }
834 
836  if_tsan_pre_signal(this);
837  uintptr_t val;
838  atomic_load_relaxed(&state, &val);
839  if (val == 0) {
840  if_tsan_post_signal(this);
841  return;
842  }
843  broadcast_parking_control control(&state, (fast_mutex *)val);
844  control.unpark_requeue((uintptr_t)this, val, 0);
845  if_tsan_post_signal(this);
846  }
847 
849  wait_parking_control control(&state, mutex);
850  uintptr_t result = control.park((uintptr_t)this);
851  if (result != (uintptr_t)mutex) {
852  mutex->lock();
853  } else {
854  if_tsan_pre_lock(mutex);
855 
856  // TODO: this is debug only.
857  uintptr_t val;
858  atomic_load_relaxed((uintptr_t *)mutex, &val);
859  halide_abort_if_false(nullptr, val & 0x1);
860 
861  if_tsan_post_lock(mutex);
862  }
863  }
864 };
865 
866 } // namespace Synchronization
867 
868 } // namespace Internal
869 } // namespace Runtime
870 } // namespace Halide
871 
872 extern "C" {
873 
877  fast_mutex->lock();
878 }
879 
883  fast_mutex->unlock();
884 }
885 
889  fast_cond->broadcast();
890 }
891 
892 WEAK void halide_cond_signal(struct halide_cond *cond) {
895  fast_cond->signal();
896 }
897 
898 WEAK void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex) {
903  fast_cond->wait(fast_mutex);
904 }
905 
906 // Actual definition of the mutex array.
909 };
910 
912  // TODO: If sz is huge, we should probably hash it down to something smaller
913  // in the accessors below. Check for deadlocks before doing so.
915  nullptr, sizeof(halide_mutex_array));
916  if (array == nullptr) {
917  // Will result in a failed assertion and a call to halide_error.
918  return nullptr;
919  }
920  array->array = (halide_mutex *)halide_malloc(
921  nullptr, sz * sizeof(halide_mutex));
922  if (array->array == nullptr) {
923  halide_free(nullptr, array);
924  // Will result in a failed assertion and a call to halide_error.
925  return nullptr;
926  }
927  memset(array->array, 0, sz * sizeof(halide_mutex));
928  return array;
929 }
930 
931 WEAK void halide_mutex_array_destroy(void *user_context, void *array) {
932  struct halide_mutex_array *arr_ptr = (struct halide_mutex_array *)array;
933  halide_free(user_context, arr_ptr->array);
934  halide_free(user_context, arr_ptr);
935 }
936 
938  halide_mutex_lock(&array->array[entry]);
940 }
941 
943  halide_mutex_unlock(&array->array[entry]);
945 }
946 }
Halide::Runtime::Internal::Synchronization::spin_control::reset
ALWAYS_INLINE void reset()
Definition: synchronization_common.h:107
halide_mutex_array_lock
WEAK int halide_mutex_array_lock(struct halide_mutex_array *array, int entry)
Definition: synchronization_common.h:937
Halide::Runtime::Internal::Synchronization::word_lock_queue_data
Definition: synchronization_common.h:117
halide_free
void halide_free(void *user_context, void *ptr)
Halide::Runtime::Internal::Synchronization::parking_control::unpark_requeue
int unpark_requeue(uintptr_t addr_from, uintptr_t addr_to, uintptr_t unpark_info)
Definition: synchronization_common.h:529
Halide::Runtime::Internal::Synchronization::word_lock::unlock
ALWAYS_INLINE void unlock()
Definition: synchronization_common.h:161
Halide::Runtime::Internal::Synchronization::fast_mutex::make_parked_if_locked
ALWAYS_INLINE bool make_parked_if_locked()
Definition: synchronization_common.h:704
halide_mutex_array::array
struct halide_mutex * array
Definition: synchronization_common.h:908
Halide::Runtime::Internal::Synchronization::parking_control::before_sleep
virtual void before_sleep()
Definition: synchronization_common.h:435
Halide::Runtime::Internal::Synchronization::mutex_parking_control::lock_state
uintptr_t *const lock_state
Definition: synchronization_common.h:604
Halide::Runtime::Internal::Synchronization::hash_bucket::mutex
word_lock mutex
Definition: synchronization_common.h:315
uint8_t
unsigned __INT8_TYPE__ uint8_t
Definition: runtime_internal.h:29
Halide::Runtime::Internal::Synchronization::fast_mutex::unlock
ALWAYS_INLINE void unlock()
Definition: synchronization_common.h:695
halide_error_code_success
@ halide_error_code_success
There was no error.
Definition: HalideRuntime.h:1039
Halide::Runtime::Internal::Synchronization::fast_cond::wait
ALWAYS_INLINE void wait(fast_mutex *mutex)
Definition: synchronization_common.h:848
Halide::Runtime::Internal::Synchronization::broadcast_parking_control::mutex
fast_mutex *const mutex
Definition: synchronization_common.h:749
Halide::Runtime::Internal::Synchronization::word_lock::lock
ALWAYS_INLINE void lock()
Definition: synchronization_common.h:148
Halide::Runtime::Internal::Synchronization::parking_control::validate
virtual bool validate(validate_action &action)
Definition: synchronization_common.h:432
Halide::Runtime::Internal::Synchronization::spin_control
Definition: synchronization_common.h:95
Halide::Runtime::Internal::Synchronization::fast_cond::broadcast
ALWAYS_INLINE void broadcast()
Definition: synchronization_common.h:835
Halide::Runtime::Internal::Synchronization::queue_data::sleep_address
uintptr_t sleep_address
Definition: synchronization_common.h:306
Halide::Runtime::Internal::Synchronization::wait_parking_control
Definition: synchronization_common.h:779
halide_cond
Cross platform condition variable.
Definition: HalideRuntime.h:171
Halide::Runtime::Internal::Synchronization::wait_parking_control::mutex
fast_mutex *const mutex
Definition: synchronization_common.h:781
Halide::Runtime::Internal::Synchronization::queue_data::unpark_info
uintptr_t unpark_info
Definition: synchronization_common.h:308
Halide::Runtime::Internal::Synchronization::parking_control::unpark_one
uintptr_t unpark_one(uintptr_t addr)
Definition: synchronization_common.h:478
halide_debug_assert
#define halide_debug_assert(user_context, cond)
halide_debug_assert() is like halide_assert(), but only expands into a check when DEBUG_RUNTIME is de...
Definition: runtime_internal.h:281
Halide::Runtime::Internal::Synchronization::spin_control::should_spin
ALWAYS_INLINE bool should_spin()
Definition: synchronization_common.h:100
Halide::Runtime::Internal::Synchronization::HASH_TABLE_SIZE
constexpr int HASH_TABLE_SIZE
Definition: synchronization_common.h:321
Halide::Runtime::Internal::Synchronization::parking_control
Definition: synchronization_common.h:426
Halide::Runtime::Internal::Synchronization::mutex_parking_control::unpark
uintptr_t unpark(int unparked, bool more_waiters) final
Definition: synchronization_common.h:617
Halide::Runtime::Internal::Synchronization::mutex_parking_control::mutex_parking_control
ALWAYS_INLINE mutex_parking_control(uintptr_t *lock_state)
Definition: synchronization_common.h:606
halide_cond_wait
WEAK void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex)
Definition: synchronization_common.h:898
uintptr_t
__UINTPTR_TYPE__ uintptr_t
Definition: runtime_internal.h:73
Halide::Runtime::Internal::Synchronization::parking_control::unpark
virtual uintptr_t unpark(int unparked, bool more_waiters)
Definition: synchronization_common.h:438
Halide::Runtime::Internal::Synchronization::broadcast_parking_control::cond_state
uintptr_t *const cond_state
Definition: synchronization_common.h:748
halide_mutex_array
Definition: synchronization_common.h:907
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::Runtime::Internal::Synchronization::bucket_pair::to
hash_bucket & to
Definition: synchronization_common.h:369
Halide::Runtime::Internal::Synchronization::hash_bucket::head
queue_data * head
Definition: synchronization_common.h:317
Halide::Runtime::Internal::Synchronization::hash_bucket::tail
queue_data * tail
Definition: synchronization_common.h:318
Halide::Runtime::Internal::Synchronization::broadcast_parking_control::broadcast_parking_control
ALWAYS_INLINE broadcast_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
Definition: synchronization_common.h:751
Halide::Runtime::Internal::Synchronization::fast_mutex::lock
ALWAYS_INLINE void lock()
Definition: synchronization_common.h:686
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Runtime::Internal::Synchronization::fast_mutex
Definition: synchronization_common.h:625
halide_cond_signal
WEAK void halide_cond_signal(struct halide_cond *cond)
Definition: synchronization_common.h:892
Halide::Runtime::Internal::Synchronization::lock_bucket_pair
WEAK bucket_pair lock_bucket_pair(uintptr_t addr_from, uintptr_t addr_to)
Definition: synchronization_common.h:376
memset
void * memset(void *s, int val, size_t n)
Halide::Runtime::Internal::Synchronization::wait_parking_control::unpark
uintptr_t unpark(int unparked, bool more_waiters) final
Definition: synchronization_common.h:808
Halide::Runtime::Internal::Synchronization::word_lock_queue_data::parker
thread_parker parker
Definition: synchronization_common.h:118
Halide::print
Expr print(const std::vector< Expr > &values)
Create an Expr that prints out its value whenever it is evaluated.
Halide::Runtime::Internal::Synchronization::bucket_pair
Definition: synchronization_common.h:367
Halide::Runtime::Internal::Synchronization::broadcast_parking_control::validate
bool validate(validate_action &action) final
Definition: synchronization_common.h:756
scoped_spin_lock.h
Halide::Runtime::Internal::Synchronization::wait_parking_control::before_sleep
void before_sleep() final
Definition: synchronization_common.h:804
Halide::Runtime::Internal::Synchronization::word_lock
Definition: synchronization_common.h:141
Halide::Runtime::Internal::Synchronization::wait_parking_control::cond_state
uintptr_t *const cond_state
Definition: synchronization_common.h:780
halide_mutex_array_create
WEAK halide_mutex_array * halide_mutex_array_create(int sz)
Definition: synchronization_common.h:911
Halide::Runtime::Internal::Synchronization::signal_parking_control::mutex
fast_mutex *const mutex
Definition: synchronization_common.h:726
printer.h
Halide::Runtime::Internal::Synchronization::broadcast_parking_control
Definition: synchronization_common.h:747
Halide::Runtime::Internal::Synchronization::queue_data
Definition: synchronization_common.h:303
Halide::Runtime::Internal::Synchronization::wait_parking_control::wait_parking_control
ALWAYS_INLINE wait_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
Definition: synchronization_common.h:783
Halide::Runtime::Internal::Synchronization::LOAD_FACTOR
constexpr int LOAD_FACTOR
Definition: synchronization_common.h:312
Halide::Runtime::Internal::Synchronization::parking_control::requeue_callback
virtual void requeue_callback(const validate_action &action, bool one_to_wake, bool some_requeued)
Definition: synchronization_common.h:441
Halide::Runtime::Internal::Synchronization::mutex_parking_control::validate
bool validate(validate_action &action) final
Definition: synchronization_common.h:611
Halide::Runtime::Internal::Synchronization::parking_control::park
uintptr_t park(uintptr_t addr)
Definition: synchronization_common.h:447
halide_mutex_array_destroy
WEAK void halide_mutex_array_destroy(void *user_context, void *array)
Definition: synchronization_common.h:931
halide_mutex
Cross-platform mutex.
Definition: HalideRuntime.h:166
halide_malloc
void * halide_malloc(void *user_context, size_t x)
Halide calls these functions to allocate and free memory.
Halide::Runtime::Internal::Synchronization::hash_table::buckets
hash_bucket buckets[HASH_TABLE_SIZE]
Definition: synchronization_common.h:323
halide_thread_yield
void halide_thread_yield()
ALWAYS_INLINE
#define ALWAYS_INLINE
Definition: runtime_internal.h:55
Halide::Runtime::Internal::Synchronization::signal_parking_control
Definition: synchronization_common.h:724
Halide::Runtime::Internal::Synchronization::table
WEAK hash_table table
Definition: synchronization_common.h:325
HalideRuntime.h
Halide::Runtime::Internal::Synchronization::word_lock_queue_data::tail
word_lock_queue_data * tail
Definition: synchronization_common.h:138
Halide::Runtime::Internal::Synchronization::unlock_bucket_pair
WEAK void unlock_bucket_pair(bucket_pair &buckets)
Definition: synchronization_common.h:405
halide_mutex_lock
WEAK void halide_mutex_lock(halide_mutex *mutex)
A basic set of mutex and condition variable functions, which call platform specific code for mutual e...
Definition: synchronization_common.h:874
Halide::Runtime::Internal::Synchronization::fast_mutex::make_parked
ALWAYS_INLINE void make_parked()
Definition: synchronization_common.h:719
Halide::Runtime::Internal::Synchronization::fast_cond::signal
ALWAYS_INLINE void signal()
Definition: synchronization_common.h:821
Halide::Runtime::Internal::Synchronization::signal_parking_control::unpark
uintptr_t unpark(int unparked, bool more_waiters) final
Definition: synchronization_common.h:733
halide_mutex_unlock
WEAK void halide_mutex_unlock(halide_mutex *mutex)
Definition: synchronization_common.h:880
Halide::Runtime::Internal::Synchronization::queue_data::next
queue_data * next
Definition: synchronization_common.h:307
Halide::Runtime::Internal::Synchronization::hash_bucket
Definition: synchronization_common.h:314
Halide::Runtime::Internal::Synchronization::validate_action::invalid_unpark_info
uintptr_t invalid_unpark_info
Definition: synchronization_common.h:423
halide_abort_if_false
#define halide_abort_if_false(user_context, cond)
Definition: runtime_internal.h:262
Halide::Runtime::Internal::Synchronization::wait_parking_control::validate
bool validate(validate_action &action) final
Definition: synchronization_common.h:788
Halide::Runtime::Internal::Synchronization::broadcast_parking_control::requeue_callback
void requeue_callback(const validate_action &action, bool one_to_wake, bool some_requeued) final
Definition: synchronization_common.h:772
Halide::Runtime::Internal::Synchronization::fast_cond
Definition: synchronization_common.h:817
Halide::Runtime::Internal::Synchronization::bucket_pair::bucket_pair
ALWAYS_INLINE bucket_pair(hash_bucket &from, hash_bucket &to)
Definition: synchronization_common.h:371
halide_mutex_array_unlock
WEAK int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry)
Definition: synchronization_common.h:942
WEAK
#define WEAK
Definition: runtime_internal.h:52
halide_cond_broadcast
WEAK void halide_cond_broadcast(struct halide_cond *cond)
Definition: synchronization_common.h:886
Halide::Runtime::Internal::Synchronization::validate_action
Definition: synchronization_common.h:421
Halide::Runtime::Internal::Synchronization::lock_bucket
WEAK hash_bucket & lock_bucket(uintptr_t addr)
Definition: synchronization_common.h:354
Halide::Runtime::Internal::Synchronization::validate_action::unpark_one
bool unpark_one
Definition: synchronization_common.h:422
Halide::Runtime::Internal::Synchronization::word_lock_queue_data::next
word_lock_queue_data * next
Definition: synchronization_common.h:136
runtime_atomics.h
Halide::Runtime::Internal::Synchronization::signal_parking_control::signal_parking_control
ALWAYS_INLINE signal_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
Definition: synchronization_common.h:728
Halide::Runtime::Internal::Synchronization::hash_table
Definition: synchronization_common.h:322
Halide::Runtime::Internal::Synchronization::queue_data::parker
thread_parker parker
Definition: synchronization_common.h:304
Halide::Runtime::Internal::Synchronization::HASH_TABLE_BITS
constexpr int HASH_TABLE_BITS
Definition: synchronization_common.h:327
Halide::Runtime::Internal::Synchronization::bucket_pair::from
hash_bucket & from
Definition: synchronization_common.h:368
Halide::Runtime::Internal::Synchronization::signal_parking_control::cond_state
uintptr_t *const cond_state
Definition: synchronization_common.h:725
Halide::Runtime::Internal::Synchronization::word_lock_queue_data::prev
word_lock_queue_data * prev
Definition: synchronization_common.h:137
Halide::Runtime::Internal::Synchronization::mutex_parking_control
Definition: synchronization_common.h:603