Halide
GPUMemInfo.h
Go to the documentation of this file.
1 #ifndef GPU_MEM_INFO_H
2 #define GPU_MEM_INFO_H
3 
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdlib>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "ASLog.h"
#include "Errors.h"
10 
/** \file
 *
 * Data structures that help track memory access information. Useful when
 * computing GPU features.
 */
16 
17 namespace Halide {
18 namespace Internal {
19 namespace Autoscheduler {
20 
21 struct GlobalMem;
22 struct GlobalAccessAccumulator;
23 struct SharedMem;
24 struct SharedAccessAccumulator;
25 struct LocalMem;
26 struct LocalAccessAccumulator;
27 
28 template<typename T>
29 struct MemTraits;
30 
31 template<>
32 struct MemTraits<GlobalMem> {
33  static constexpr double bytes_per_transaction = 32;
34  using MemInfoType = GlobalMem;
36 };
37 
38 template<>
39 struct MemTraits<SharedMem> {
40  static constexpr double bytes_per_transaction = 128;
41  using MemInfoType = SharedMem;
43 };
44 
45 template<>
46 struct MemTraits<LocalMem> {
47  static constexpr double bytes_per_transaction = 32;
48  using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem
50 };
51 
52 template<typename T>
54 
55 template<typename T>
56 struct MemInfo {
58 
59  double num_transactions() const {
60  return total_num_transactions;
61  }
62 
63  void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request) {
64  internal_assert(num_bytes_used_per_request > 0);
65 
66  double total_transactions = num_requests * num_transactions_per_request;
67  double total_bytes = total_transactions * bytes_per_transaction;
68  double total_bytes_used = num_requests * num_bytes_used_per_request;
69 
70  internal_assert(total_bytes_used <= total_bytes)
71  << "\ntotal_bytes_used = " << total_bytes_used
72  << "\ntotal_bytes = " << total_bytes
73  << "\ntotal_transactions = " << total_transactions
74  << "\nnum_transactions_per_request = " << num_transactions_per_request
75  << "\nnum_requests = " << num_requests;
76 
77  update_totals(total_transactions, total_bytes_used, total_bytes);
78  }
79 
80  void add(const MemInfo<T> &other) {
81  total_num_transactions += other.total_num_transactions;
82  total_num_bytes_used += other.total_num_bytes_used;
83  total_num_bytes += other.total_num_bytes;
84  }
85 
86  double efficiency() const {
87  if (total_num_bytes == 0) {
88  return 1;
89  }
90 
91  double result = total_num_bytes_used / total_num_bytes;
92  internal_assert(result <= 1);
93  return result;
94  }
95 
96 private:
97  void update_totals(double num_transactions, double num_bytes_used, double num_bytes) {
98  total_num_transactions += num_transactions;
99  total_num_bytes_used += num_bytes_used;
100  total_num_bytes += num_bytes;
101  }
102 
103  double total_num_transactions = 0;
104  double total_num_bytes_used = 0;
105  double total_num_bytes = 0;
106 };
107 
108 template<typename T>
110 
114 
115 struct Strides {
116 public:
117  Strides(const std::vector<int64_t> &storage_strides)
118  : storage_strides{storage_strides} {
119  }
120 
121  void add_valid(const std::vector<double> &strides) {
122  add(strides, true);
123  }
124 
125  void add_invalid() {
126  add({}, false);
127  }
128 
129  bool valid(size_t loop_index) const {
130  return is_valid[loop_index];
131  }
132 
133  int64_t offset(size_t loop_index, int64_t point) const {
134  internal_assert(loop_index < is_valid.size() && valid(loop_index));
135  internal_assert(index_strides[loop_index].size() == storage_strides.size());
136 
137  int64_t result = 0;
138  for (size_t i = 0; i < storage_strides.size(); ++i) {
139  result += (int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
140  }
141  return std::abs(result);
142  }
143 
144  void dump(bool verbose = false) {
145  if (!verbose) {
146  return;
147  }
148 
149  for (size_t i = 0; i < storage_strides.size(); ++i) {
150  if (!valid(i)) {
151  aslog(2) << "stride " << i << ": invalid\n";
152  continue;
153  }
154  aslog(2) << "storage_stride " << i << ": " << storage_strides[i] << "\n";
155  }
156 
157  for (size_t i = 0; i < index_strides.size(); ++i) {
158  for (size_t j = 0; j < index_strides[i].size(); ++j) {
159  aslog(2) << "index_stride " << i << ", storage_stride " << j << ": " << index_strides[i][j] << " ";
160  }
161  aslog(2) << "\n";
162  }
163  }
164 
165 private:
166  void add(const std::vector<double> &strides, bool e) {
167  index_strides.push_back(strides);
168  is_valid.push_back(e);
169  }
170 
171  std::vector<int64_t> storage_strides;
172  std::vector<std::vector<double>> index_strides;
173  std::vector<bool> is_valid;
174 };
175 
177  GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
178  : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
179  }
180 
181  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
182  if (!active) {
183  return;
184  }
185 
186  if (verbose) {
187  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
188  }
189 
190  int thread_ids[3] = {x, y, z};
191  int64_t byte = 0;
192  for (size_t i = 0; i < dimensions; ++i) {
193  if (!strides.valid(i)) {
194  ++unknown_sectors;
195  return;
196  }
197  byte += bytes_per_access * strides.offset(i, thread_ids[i]);
198  }
199 
200  if (verbose) {
201  aslog(2) << "byte accessed: " << byte << "\n";
202  }
203 
204  int64_t sector = byte / 32;
205  if (verbose) {
206  aslog(2) << "sectors accessed: ";
207  }
208  for (int i = 0; i < bytes_per_access; ++i) {
209  if (verbose) {
210  aslog(2) << sector << " ";
211  }
212  sectors_accessed[sector].insert(byte + i);
213  }
214  if (verbose) {
215  aslog(2) << "\n\n";
216  }
217  }
218 
219  void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const {
220  int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
221 
222  if (verbose) {
223  if (is_tail_warp) {
224  aslog(2) << "tail_";
225  }
226  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
227  }
228 
229  int num_bytes_used_per_request = 0;
230  for (const auto &sector : sectors_accessed) {
231  num_bytes_used_per_request += sector.second.size();
232  }
233 
234  num_bytes_used_per_request += unknown_sectors * bytes_per_access;
235 
236  if (verbose) {
237  if (is_tail_warp) {
238  aslog(2) << "tail_";
239  }
240  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
241  }
242 
243  global_mem_info.add_access_info(
244  num_requests,
245  num_transactions_per_request,
246  num_bytes_used_per_request);
247  }
248 
249 private:
250  int bytes_per_access;
251  size_t dimensions;
252  Strides strides;
253  bool verbose;
254  int unknown_sectors = 0;
255  std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
256 };
257 
259  SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
260  : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
261  }
262 
263  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
264  if (!active) {
265  return;
266  }
267 
268  if (verbose) {
269  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
270  }
271 
272  int thread_ids[3] = {x, y, z};
273  int64_t byte = 0;
274  for (size_t i = 0; i < dimensions; ++i) {
275  if (!strides.valid(i)) {
276  ++unknown_banks;
277  return;
278  }
279  byte += bytes_per_access * strides.offset(i, thread_ids[i]);
280  }
281 
282  if (verbose) {
283  aslog(2) << "bytes accessed: ";
284  for (int i = 0; i < bytes_per_access; ++i) {
285  aslog(2) << byte + i << " ";
286  }
287  aslog(2) << "\n";
288  }
289 
290  if (verbose) {
291  aslog(2) << "banks accessed: ";
292  }
293  for (int i = 0; i < bytes_per_access; ++i) {
294  int64_t word = (byte + i) / 4;
295  int64_t bank = word % 32;
296  if (verbose) {
297  aslog(2) << bank << " ";
298  }
299  bytes_accessed.insert(byte + i);
300  bank_to_words_accessed[bank].insert(word);
301  }
302  if (verbose) {
303  aslog(2) << "\n\n";
304  }
305  }
306 
307  void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const {
308  int num_transactions_per_request = 0;
309  for (const auto &bank : bank_to_words_accessed) {
310  num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size());
311  }
312 
313  num_transactions_per_request += unknown_banks;
314 
315  if (verbose) {
316  if (is_tail_warp) {
317  aslog(2) << "tail_";
318  }
319  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
320  }
321 
322  int num_bytes_used_per_request = bytes_accessed.size();
323 
324  num_bytes_used_per_request += unknown_banks * bytes_per_access;
325 
326  if (verbose) {
327  if (is_tail_warp) {
328  aslog(2) << "tail_";
329  }
330  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
331  }
332 
333  shared_mem_info.add_access_info(
334  num_requests,
335  num_transactions_per_request,
336  num_bytes_used_per_request);
337  }
338 
339 private:
340  int bytes_per_access;
341  size_t dimensions;
342  Strides strides;
343  bool verbose;
344  int unknown_banks = 0;
345  std::unordered_set<int64_t> bytes_accessed;
346  std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
347 };
348 
350  LocalAccessAccumulator(int bytes_per_access, bool verbose)
351  : bytes_per_access{bytes_per_access}, verbose{verbose} {
352  }
353 
354  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
355  if (!active) {
356  return;
357  }
358 
359  ++thread_count;
360 
361  if (verbose) {
362  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
363  }
364  }
365 
366  void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const {
367  int num_bytes_used_per_request = thread_count * bytes_per_access;
368  int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction);
369  int num_transactions_per_request = sectors_accessed;
370 
371  if (verbose) {
372  if (is_tail_warp) {
373  aslog(2) << "tail_";
374  }
375  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
376  }
377 
378  if (verbose) {
379  if (is_tail_warp) {
380  aslog(2) << "tail_";
381  }
382  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
383  }
384 
385  local_mem_info.add_access_info(
386  num_requests,
387  num_transactions_per_request,
388  num_bytes_used_per_request);
389  }
390 
391 private:
392  int bytes_per_access;
393  bool verbose;
394  int thread_count = 0;
395  std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
396 };
397 
398 } // namespace Autoscheduler
399 } // namespace Internal
400 } // namespace Halide
401 
402 #endif // GPU_MEM_INFO_H
Halide::Internal::Autoscheduler::MemInfo::efficiency
double efficiency() const
Definition: GPUMemInfo.h:86
internal_assert
#define internal_assert(c)
Definition: Errors.h:19
Errors.h
Halide::Internal::Autoscheduler::SharedAccessAccumulator::operator()
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:263
Halide::Internal::Autoscheduler::SharedAccessAccumulator::SharedAccessAccumulator
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition: GPUMemInfo.h:259
Halide::Internal::Autoscheduler::Strides::valid
bool valid(size_t loop_index) const
Definition: GPUMemInfo.h:129
Halide::Internal::Autoscheduler::SharedAccessAccumulator
Definition: GPUMemInfo.h:258
Halide::Internal::Autoscheduler::Strides::Strides
Strides(const std::vector< int64_t > &storage_strides)
Definition: GPUMemInfo.h:117
Halide::Internal::Autoscheduler::MemInfo::bytes_per_transaction
static constexpr double bytes_per_transaction
Definition: GPUMemInfo.h:57
Halide::Internal::Autoscheduler::GlobalAccessAccumulator
Definition: GPUMemInfo.h:176
Halide::Internal::aslog
Definition: ASLog.h:16
Halide::Internal::Autoscheduler::Strides::add_invalid
void add_invalid()
Definition: GPUMemInfo.h:125
Halide::Internal::Autoscheduler::Strides::offset
int64_t offset(size_t loop_index, int64_t point) const
Definition: GPUMemInfo.h:133
Halide::Internal::Autoscheduler::Strides::add_valid
void add_valid(const std::vector< double > &strides)
Definition: GPUMemInfo.h:121
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
Halide::Internal::Autoscheduler::GlobalAccessAccumulator::add_access_info
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:219
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Internal::Autoscheduler::SharedAccessAccumulator::add_access_info
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:307
Halide::Internal::Autoscheduler::MemTraits< LocalMem >::MemInfoType
GlobalMem MemInfoType
Definition: GPUMemInfo.h:48
Halide::ceil
Expr ceil(Expr x)
Return the least whole number greater than or equal to a floating-point expression.
Halide::Internal::Autoscheduler::MemInfo::add_access_info
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
Definition: GPUMemInfo.h:63
Halide::Internal::Autoscheduler::MemTraits
Definition: GPUMemInfo.h:29
Halide::Internal::Autoscheduler::LocalAccessAccumulator::operator()
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:354
int64_t
signed __INT64_TYPE__ int64_t
Definition: runtime_internal.h:22
Halide::abs
Expr abs(Expr a)
Returns the absolute value of a signed integer or floating-point expression.
Halide::Internal::Autoscheduler::MemTraits< SharedMem >::MemInfoType
SharedMem MemInfoType
Definition: GPUMemInfo.h:41
Halide::Internal::Autoscheduler::MemInfo::add
void add(const MemInfo< T > &other)
Definition: GPUMemInfo.h:80
Halide::Internal::Autoscheduler::MemInfo
Definition: GPUMemInfo.h:56
ASLog.h
Halide::Internal::Autoscheduler::Strides::dump
void dump(bool verbose=false)
Definition: GPUMemInfo.h:144
Halide::Internal::Autoscheduler::GlobalAccessAccumulator::GlobalAccessAccumulator
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition: GPUMemInfo.h:177
Halide::Internal::Autoscheduler::LocalAccessAccumulator::LocalAccessAccumulator
LocalAccessAccumulator(int bytes_per_access, bool verbose)
Definition: GPUMemInfo.h:350
Halide::max
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:587
Halide::Internal::Autoscheduler::Accumulator
typename MemTraits< T >::Accumulator Accumulator
Definition: GPUMemInfo.h:53
Halide::Internal::Autoscheduler::LocalAccessAccumulator::add_access_info
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:366
Halide::Internal::Autoscheduler::GlobalAccessAccumulator::operator()
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:181
Halide::Internal::Autoscheduler::MemInfo::num_transactions
double num_transactions() const
Definition: GPUMemInfo.h:59
Halide::Internal::Autoscheduler::LocalAccessAccumulator
Definition: GPUMemInfo.h:349
Halide::Internal::Autoscheduler::Strides
Definition: GPUMemInfo.h:115
Halide::Internal::Autoscheduler::MemTraits< GlobalMem >::MemInfoType
GlobalMem MemInfoType
Definition: GPUMemInfo.h:34