Halide
ThreadInfo.h
Go to the documentation of this file.
1 #ifndef THREAD_INFO_H
2 #define THREAD_INFO_H
3 
4 /** \file
5  *
6  * Data structure containing information about GPU threads for a particular
7  * location in the loop nest and its surrounding block. Useful when computing
8  * GPU features
9  */
10 
11 #include <vector>
12 
13 #include "Errors.h"
14 #include "FunctionDAG.h"
15 
16 namespace Halide {
17 namespace Internal {
18 namespace Autoscheduler {
19 
// Cap on the product of thread-loop extents. Both the ThreadInfo constructor
// and init_threads_in_this_block() stop assigning loops to thread dimensions
// as soon as the running product would exceed this value; it is also the
// denominator of idle_lane_wastage() and block_occupancy().
20 static constexpr int MAX_THREADS_PER_BLOCK = 1024;
21 
22 struct LoopNest;
23 
24 // Sort / filter thread tile options
28  bool operator<(const ThreadTileOption &other) const {
30  }
31 
32  // Ensure we don't accidentally copy this type
33  ThreadTileOption() = default;
34  ThreadTileOption(ThreadTileOption &&) = default;
36  ThreadTileOption(const ThreadTileOption &) = delete;
37  ThreadTileOption &operator=(const ThreadTileOption &) = delete;
38 };
39 
40 struct ThreadInfo {
// Decides which loops of this loop nest are mapped to thread dimensions and
// records the result in threads[], num_threads, loop_indices and loop_vars.
//
// vectorized_loop_index: index into size/loop of the vectorized loop, or -1
//                        when there is none.
// size:                  extent of each loop; loops of extent 1 are never
//                        mapped to a thread dimension.
// loop:                  loop descriptors indexed like size; only .var is
//                        read here.
// max_thread_counts:     forwarded unchanged to init_threads_in_this_block().
41  ThreadInfo(int vectorized_loop_index, const std::vector<int64_t> &size, const std::vector<FunctionDAG::Node::Loop> &loop, const std::vector<int64_t> &max_thread_counts) {
42  init_threads_in_this_block(max_thread_counts);
43 
// Number of entries of threads[] filled so far (0..3).
44  std::size_t num_thread_loops = 0;
45 
// A vectorized loop with extent != 1 always takes thread dimension 0. Note
// that this step does not compare the extent against MAX_THREADS_PER_BLOCK.
46  if (vectorized_loop_index != -1 && size[vectorized_loop_index] != 1) {
47  threads[num_thread_loops] = size[vectorized_loop_index];
48  num_threads *= size[vectorized_loop_index];
49  num_thread_loops = 1;
50  loop_indices.push_back(vectorized_loop_index);
51  loop_vars.push_back(loop[vectorized_loop_index].var);
52  }
53 
// Hand the remaining loops, in index order, to the next free thread
// dimensions until three dimensions are in use.
54  for (std::size_t i = 0; i < size.size() && num_thread_loops < 3; i++) {
// Extent-1 loops and the vectorized loop (handled above) are skipped.
55  if (size[i] == 1 || (int)i == vectorized_loop_index) {
56  continue;
57  }
58 
// The first loop that would push the thread count past the cap ends the
// search; later (possibly smaller) loops are not considered.
59  if (num_threads * size[i] > MAX_THREADS_PER_BLOCK) {
60  break;
61  }
62 
63  threads[num_thread_loops] = size[i];
64  num_threads *= size[i];
65  ++num_thread_loops;
66  loop_indices.push_back(i);
67  loop_vars.push_back(loop[i].var);
68  }
69 
// Nothing was mapped: fall back to loop 0 so loop_indices / loop_vars are
// never empty. threads[] and num_threads are not modified on this path.
70  if (loop_indices.empty()) {
71  internal_assert(!size.empty());
72  ++num_thread_loops;
73  loop_indices.push_back(0);
74  loop_vars.push_back(loop[0].var);
75  }
76 
// Invariants: one index and one variable name per mapped loop, 1 to 3 loops.
78  internal_assert(loop_indices.size() == num_thread_loops);
79  internal_assert(loop_vars.size() == num_thread_loops);
80  internal_assert(!loop_indices.empty() && loop_indices.size() <= 3);
81  internal_assert(!loop_vars.empty() && loop_vars.size() <= 3);
82 
// Must run last: it walks the block using threads[] and
// threads_in_this_block[], both of which are final at this point.
83  count_num_active_warps_per_block();
84  }
85 
86  template<typename Fn>
87  void for_each_thread_id(const Fn &fn) const {
88  int thread_id = 0;
89  for (int z = 0; z < threads_in_this_block[2]; z++) {
90  for (int y = 0; y < threads_in_this_block[1]; y++) {
91  for (int x = 0; x < threads_in_this_block[0]; x++) {
92  // Skip any threads in this loop nest with extent less than the
93  // extents of the largest thread loops in this block
94  // for thread.x in [0, 10]:
95  // ...
96  // for thread.x in [0, 5]:
97  // ...
98  // For the 2nd loop, skip threads with x id >= 5
99  bool active = x < threads[0] && y < threads[1] && z < threads[2];
100 
101  fn(thread_id, active, thread_id == num_threads_in_this_block - 1);
102  ++thread_id;
103  }
104  }
105  }
106  }
107 
108  template<typename Fn>
109  void for_each_thread_id_in_first_warp(Fn &fn) const {
110  int thread_id = 0;
111  for (int z = 0; z < threads_in_this_block[2]; z++) {
112  for (int y = 0; y < threads_in_this_block[1]; y++) {
113  for (int x = 0; x < threads_in_this_block[0]; x++) {
114  // Skip any threads in this loop nest with extent less than the
115  // extents of the largest thread loops in this block
116  // for thread.x in [0, 10]:
117  // ...
118  // for thread.x in [0, 5]:
119  // ...
120  // For the 2nd loop, skip threads with x id >= 5
121  bool active = x < threads[0] && y < threads[1] && z < threads[2];
122 
123  bool last_thread = thread_id == 31;
124  fn(thread_id, x, y, z, active, last_thread);
125  ++thread_id;
126 
127  if (last_thread) {
128  return;
129  }
130  }
131  }
132  }
133  }
134 
// Calls fn(thread_id, x, y, z, active, is_last) for every thread id in the
// inclusive range
//   [final_warp_initial_thread_id,
//    final_warp_initial_thread_id + num_threads_in_final_warp - 1].
// is_last is true for the final id of that range. Both members are written
// by count_num_active_warps_per_block().
135  template<typename Fn>
136  void for_each_thread_id_in_tail_warp(Fn &fn) const {
137  int thread_id = final_warp_initial_thread_id;
138  int last_thread_id = thread_id + num_threads_in_final_warp - 1;
139 
140  for (; thread_id <= last_thread_id; ++thread_id) {
// Recover the 3-D coordinates from the linear id. This inverts the
// x-fastest, then y, then z numbering produced by for_each_thread_id.
141  int z = thread_id / (threads_in_this_block[1] * threads_in_this_block[0]);
142  int y = (thread_id - z * threads_in_this_block[1] * threads_in_this_block[0]) / threads_in_this_block[0];
143  int x = thread_id % threads_in_this_block[0];
144 
148 
// Active only if the thread lies inside this loop nest's own extents.
149  bool active = x < threads[0] && y < threads[1] && z < threads[2];
150 
151  fn(thread_id, x, y, z, active, thread_id == last_thread_id);
152  }
153  }
154 
155  template<typename Fn>
156  void for_each_active_thread_id(const Fn &fn) const {
157  for_each_thread_id([&](int thread_id, bool is_active, bool is_last_thread) {
158  if (!is_active) {
159  return;
160  }
161 
162  fn(thread_id, is_last_thread);
163  });
164  }
165 
166  double warp_lane_utilization() const {
167  return (double)num_active_threads / (double)(num_active_warps_per_block * 32);
168  }
169 
170  double idle_lane_wastage() const {
171  return ((double)(num_active_warps_per_block * 32) - (double)num_active_threads) / MAX_THREADS_PER_BLOCK;
172  }
173 
174  double block_occupancy() const {
175  return (double)num_threads / MAX_THREADS_PER_BLOCK;
176  }
177 
// Written by count_num_active_warps_per_block(): true when the block's final
// warp is active and holds a different number of active threads than the
// first active warp.
181  bool has_tail_warp = false;
184 
// Thread extents of the enclosing block, indexed 0 = x, 1 = y, 2 = z; filled
// from max_thread_counts by init_threads_in_this_block().
185  int threads_in_this_block[3] = {1, 1, 1};
187 
// Thread extents used by this loop nest, same indexing. A thread is active
// only if each of its coordinates is below the corresponding entry.
188  int threads[3] = {1, 1, 1};
191 
// Index (into the constructor's size / loop arguments) and variable name of
// each loop mapped to a thread dimension. The two vectors always have the
// same length, between 1 and 3 (asserted in the constructor).
192  std::vector<int> loop_indices;
193  std::vector<std::string> loop_vars;
194 
195 private:
// Fills threads_in_this_block[] from max_thread_counts. Counts equal to 1 are
// skipped; filling stops at the first count that does not fit, either because
// three dimensions are already in use or because
// num_threads_in_this_block * c would exceed MAX_THREADS_PER_BLOCK.
//
// NOTE(review): as shown here, nothing updates num_threads_in_this_block
// after a count is accepted, and the "% 32" branch at the end is empty. The
// listing's line numbers skip at both places (207 -> 209, 211 -> 213 -> 215),
// so statements are presumably missing from this view - confirm against the
// original header before relying on this description.
196  void init_threads_in_this_block(const std::vector<int64_t> &max_thread_counts) {
// Number of entries of threads_in_this_block[] filled so far.
197  int num_thread_loops = 0;
198  for (auto c : max_thread_counts) {
199  if (c == 1) {
200  continue;
201  }
202 
203  if (num_thread_loops >= 3 || num_threads_in_this_block * c > MAX_THREADS_PER_BLOCK) {
204  break;
205  }
206 
// int64_t -> int narrowing of an accepted count.
207  threads_in_this_block[num_thread_loops] = c;
209  ++num_thread_loops;
210  }
211 
// Taken when the block's thread count is not a multiple of 32.
213  if (num_threads_in_this_block % 32 != 0) {
215  }
216  }
217 
// Walks every thread id of the block with for_each_thread_id, treating each
// run of 32 consecutive ids (or the shorter run that ends at the block's last
// thread) as one warp, and records facts about the final warp in
// num_threads_in_final_warp, has_tail_warp and final_warp_initial_thread_id.
//
// NOTE(review): the listing's line numbers skip inside this function
// (229 -> 231, 235 -> 237, 247 -> 249, 257 -> 259 -> 261). As shown, no
// member counter is incremented for active threads or active warps and the
// trailing "if (has_tail_warp)" is empty, so statements are presumably
// missing from this view - confirm against the original header.
218  void count_num_active_warps_per_block() {
// Per-warp scratch state, reset each time a warp boundary is reached.
219  bool current_warp_is_active = false;
220  int num_active_threads_in_cur_warp = 0;
// Active-thread count of the first active warp; reference for has_tail_warp.
221  int num_active_threads_in_first_warp = 0;
222  int num_threads_in_cur_warp = 0;
223  bool first_warp = true;
224 
225  for_each_thread_id([&](int thread_id, bool is_active, bool is_last_thread) {
// A warp is active as soon as one of its threads is.
226  current_warp_is_active |= is_active;
227 
228  if (is_active) {
229  ++num_active_threads_in_cur_warp;
231  }
232  ++num_threads_in_cur_warp;
233 
// Warp boundary: every 32nd id, or the block's last id (partial warp).
234  if ((thread_id + 1) % 32 == 0 || is_last_thread) {
235  if (current_warp_is_active) {
237 
238  if (first_warp) {
239  first_warp = false;
240  num_active_threads_in_first_warp = num_active_threads_in_cur_warp;
241  }
242 
// The final-warp members are only written when the final warp is active.
243  if (is_last_thread) {
244  num_threads_in_final_warp = num_threads_in_cur_warp;
245  has_tail_warp = num_active_threads_in_first_warp != num_active_threads_in_cur_warp;
246  final_warp_initial_thread_id = thread_id - num_threads_in_cur_warp + 1;
247 
249  }
250  }
251 
// Start accumulating the next warp.
252  current_warp_is_active = false;
253  num_threads_in_cur_warp = 0;
254  num_active_threads_in_cur_warp = 0;
255  }
256  });
257 
259  if (has_tail_warp) {
261  }
262  }
263 };
264 
265 } // namespace Autoscheduler
266 } // namespace Internal
267 } // namespace Halide
268 
269 #endif // THREAD_INFO_H
Halide::Internal::Autoscheduler::ThreadInfo::for_each_thread_id_in_first_warp
void for_each_thread_id_in_first_warp(Fn &fn) const
Definition: ThreadInfo.h:109
Halide::Internal::Autoscheduler::ThreadInfo::num_active_threads
int64_t num_active_threads
Definition: ThreadInfo.h:190
Halide::Internal::Autoscheduler::ThreadInfo::final_warp_initial_thread_id
int final_warp_initial_thread_id
Definition: ThreadInfo.h:182
internal_assert
#define internal_assert(c)
Definition: Errors.h:19
Errors.h
Halide::Internal::Autoscheduler::ThreadInfo::loop_vars
std::vector< std::string > loop_vars
Definition: ThreadInfo.h:193
Halide::Internal::Autoscheduler::ThreadInfo::for_each_thread_id
void for_each_thread_id(const Fn &fn) const
Definition: ThreadInfo.h:87
Halide::Internal::Autoscheduler::ThreadInfo::for_each_active_thread_id
void for_each_active_thread_id(const Fn &fn) const
Definition: ThreadInfo.h:156
Halide::Internal::Autoscheduler::ThreadInfo::has_tail_warp
bool has_tail_warp
Definition: ThreadInfo.h:181
Halide::Internal::Autoscheduler::ThreadTileOption::ThreadTileOption
ThreadTileOption()=default
Halide::Internal::IntrusivePtr
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition: IntrusivePtr.h:68
Halide::Internal::Autoscheduler::ThreadInfo::num_threads_in_this_block
int64_t num_threads_in_this_block
Definition: ThreadInfo.h:186
Halide::Internal::Autoscheduler::ThreadInfo::for_each_thread_id_in_tail_warp
void for_each_thread_id_in_tail_warp(Fn &fn) const
Definition: ThreadInfo.h:136
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::Internal::Autoscheduler::ThreadTileOption::operator<
bool operator<(const ThreadTileOption &other) const
Definition: ThreadInfo.h:28
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::Autoscheduler::ThreadInfo::num_warps_per_block
int num_warps_per_block
Definition: ThreadInfo.h:178
FunctionDAG.h
size_t
__SIZE_TYPE__ size_t
Definition: runtime_internal.h:31
Halide::Internal::Autoscheduler::ThreadInfo
Definition: ThreadInfo.h:40
Halide::Internal::Autoscheduler::ThreadInfo::num_regular_active_warps_per_block
int num_regular_active_warps_per_block
Definition: ThreadInfo.h:180
Halide::Internal::Autoscheduler::ThreadInfo::idle_lane_wastage
double idle_lane_wastage() const
Definition: ThreadInfo.h:170
Halide::Internal::Autoscheduler::ThreadInfo::block_occupancy
double block_occupancy() const
Definition: ThreadInfo.h:174
int64_t
signed __INT64_TYPE__ int64_t
Definition: runtime_internal.h:22
Halide::Internal::Autoscheduler::ThreadTileOption
Definition: ThreadInfo.h:25
Halide::Internal::Autoscheduler::ThreadTileOption::loop_nest
IntrusivePtr< const LoopNest > loop_nest
Definition: ThreadInfo.h:26
Halide::Internal::Autoscheduler::ThreadInfo::num_active_warps_per_block
int num_active_warps_per_block
Definition: ThreadInfo.h:179
Halide::Internal::Autoscheduler::ThreadInfo::num_threads_in_final_warp
int num_threads_in_final_warp
Definition: ThreadInfo.h:183
Halide::Internal::Autoscheduler::ThreadInfo::num_threads
int64_t num_threads
Definition: ThreadInfo.h:189
Halide::Internal::Autoscheduler::ThreadInfo::loop_indices
std::vector< int > loop_indices
Definition: ThreadInfo.h:192
Halide::Internal::Autoscheduler::ThreadTileOption::max_idle_lane_wastage
double max_idle_lane_wastage
Definition: ThreadInfo.h:27
Halide::Internal::Autoscheduler::ThreadTileOption::operator=
ThreadTileOption & operator=(ThreadTileOption &&)=default
Halide::Internal::Autoscheduler::ThreadInfo::threads
int threads[3]
Definition: ThreadInfo.h:188
Halide::Internal::Autoscheduler::ThreadInfo::ThreadInfo
ThreadInfo(int vectorized_loop_index, const std::vector< int64_t > &size, const std::vector< FunctionDAG::Node::Loop > &loop, const std::vector< int64_t > &max_thread_counts)
Definition: ThreadInfo.h:41
Halide::Internal::Autoscheduler::ThreadInfo::warp_lane_utilization
double warp_lane_utilization() const
Definition: ThreadInfo.h:166
Halide::Internal::Autoscheduler::ThreadInfo::threads_in_this_block
int threads_in_this_block[3]
Definition: ThreadInfo.h:185