Halide 19.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
GPUMemInfo.h
Go to the documentation of this file.
1#ifndef GPU_MEM_INFO_H
2#define GPU_MEM_INFO_H
3
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "ASLog.h"
#include "Errors.h"
10
11/** \file
12 *
13 * Data structures that help track memory access information. Useful when
14 * computing GPU features
15 */
16
17namespace Halide {
18namespace Internal {
19namespace Autoscheduler {
20
21struct GlobalMem;
22struct GlobalAccessAccumulator;
23struct SharedMem;
24struct SharedAccessAccumulator;
25struct LocalMem;
26struct LocalAccessAccumulator;
27
28template<typename T>
29struct MemTraits;
30
31template<>
32struct MemTraits<GlobalMem> {
33 static constexpr double bytes_per_transaction = 32;
34 using MemInfoType = GlobalMem;
36};
37
38template<>
39struct MemTraits<SharedMem> {
40 static constexpr double bytes_per_transaction = 128;
41 using MemInfoType = SharedMem;
43};
44
45template<>
46struct MemTraits<LocalMem> {
47 static constexpr double bytes_per_transaction = 32;
48 using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem
50};
51
52template<typename T>
54
55template<typename T>
56struct MemInfo {
58
59 double num_transactions() const {
60 return total_num_transactions;
61 }
62
63 void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request) {
64 internal_assert(num_bytes_used_per_request > 0);
65
66 double total_transactions = num_requests * num_transactions_per_request;
67 double total_bytes = total_transactions * bytes_per_transaction;
68 double total_bytes_used = num_requests * num_bytes_used_per_request;
69
70 internal_assert(total_bytes_used <= total_bytes)
71 << "\ntotal_bytes_used = " << total_bytes_used
72 << "\ntotal_bytes = " << total_bytes
73 << "\ntotal_transactions = " << total_transactions
74 << "\nnum_transactions_per_request = " << num_transactions_per_request
75 << "\nnum_requests = " << num_requests;
76
77 update_totals(total_transactions, total_bytes_used, total_bytes);
78 }
79
80 void add(const MemInfo<T> &other) {
81 total_num_transactions += other.total_num_transactions;
82 total_num_bytes_used += other.total_num_bytes_used;
83 total_num_bytes += other.total_num_bytes;
84 }
85
86 double efficiency() const {
87 if (total_num_bytes == 0) {
88 return 1;
89 }
90
91 double result = total_num_bytes_used / total_num_bytes;
92 internal_assert(result <= 1);
93 return result;
94 }
95
96private:
97 void update_totals(double num_transactions, double num_bytes_used, double num_bytes) {
98 total_num_transactions += num_transactions;
99 total_num_bytes_used += num_bytes_used;
100 total_num_bytes += num_bytes;
101 }
102
103 double total_num_transactions = 0;
104 double total_num_bytes_used = 0;
105 double total_num_bytes = 0;
106};
107
108template<typename T>
110
114
115struct Strides {
116public:
117 explicit Strides(const std::vector<int64_t> &storage_strides)
118 : storage_strides{storage_strides} {
119 }
120
121 void add_valid(const std::vector<double> &strides) {
122 add(strides, true);
123 }
124
125 void add_invalid() {
126 add({}, false);
127 }
128
129 bool valid(size_t loop_index) const {
130 return is_valid[loop_index];
131 }
132
133 int64_t offset(size_t loop_index, int64_t point) const {
134 internal_assert(loop_index < is_valid.size() && valid(loop_index));
135 internal_assert(index_strides[loop_index].size() == storage_strides.size());
136
137 int64_t result = 0;
138 for (size_t i = 0; i < storage_strides.size(); ++i) {
139 result += (int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
140 }
141 return std::abs(result);
142 }
143
144 void dump(bool verbose = false) {
145 if (!verbose) {
146 return;
147 }
148
149 for (size_t i = 0; i < storage_strides.size(); ++i) {
150 if (!valid(i)) {
151 aslog(2) << "stride " << i << ": invalid\n";
152 continue;
153 }
154 aslog(2) << "storage_stride " << i << ": " << storage_strides[i] << "\n";
155 }
156
157 for (size_t i = 0; i < index_strides.size(); ++i) {
158 for (size_t j = 0; j < index_strides[i].size(); ++j) {
159 aslog(2) << "index_stride " << i << ", storage_stride " << j << ": " << index_strides[i][j] << " ";
160 }
161 aslog(2) << "\n";
162 }
163 }
164
165private:
166 void add(const std::vector<double> &strides, bool e) {
167 index_strides.push_back(strides);
168 is_valid.push_back(e);
169 }
170
171 std::vector<int64_t> storage_strides;
172 std::vector<std::vector<double>> index_strides;
173 std::vector<bool> is_valid;
174};
175
177 GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
178 : bytes_per_access{bytes_per_access},
179 dimensions{dimensions},
180 strides{strides},
181 verbose{verbose} {
182 }
183
184 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
185 if (!active) {
186 return;
187 }
188
189 if (verbose) {
190 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
191 }
192
193 int thread_ids[3] = {x, y, z};
194 int64_t byte = 0;
195 for (size_t i = 0; i < dimensions; ++i) {
196 if (!strides.valid(i)) {
197 ++unknown_sectors;
198 return;
199 }
200 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
201 }
202
203 if (verbose) {
204 aslog(2) << "byte accessed: " << byte << "\n";
205 }
206
207 int64_t sector = byte / 32;
208 if (verbose) {
209 aslog(2) << "sectors accessed: ";
210 }
211 for (int i = 0; i < bytes_per_access; ++i) {
212 if (verbose) {
213 aslog(2) << sector << " ";
214 }
215 sectors_accessed[sector].insert(byte + i);
216 }
217 if (verbose) {
218 aslog(2) << "\n\n";
219 }
220 }
221
222 void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const {
223 int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
224
225 if (verbose) {
226 if (is_tail_warp) {
227 aslog(2) << "tail_";
228 }
229 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
230 }
231
232 int num_bytes_used_per_request = 0;
233 for (const auto &sector : sectors_accessed) {
234 num_bytes_used_per_request += sector.second.size();
235 }
236
237 num_bytes_used_per_request += unknown_sectors * bytes_per_access;
238
239 if (verbose) {
240 if (is_tail_warp) {
241 aslog(2) << "tail_";
242 }
243 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
244 }
245
246 global_mem_info.add_access_info(
247 num_requests,
248 num_transactions_per_request,
249 num_bytes_used_per_request);
250 }
251
252private:
253 int bytes_per_access;
254 size_t dimensions;
255 Strides strides;
256 bool verbose;
257 int unknown_sectors = 0;
258 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
259};
260
262 SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
263 : bytes_per_access{bytes_per_access},
264 dimensions{dimensions},
265 strides{strides},
266 verbose{verbose} {
267 }
268
269 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
270 if (!active) {
271 return;
272 }
273
274 if (verbose) {
275 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
276 }
277
278 int thread_ids[3] = {x, y, z};
279 int64_t byte = 0;
280 for (size_t i = 0; i < dimensions; ++i) {
281 if (!strides.valid(i)) {
282 ++unknown_banks;
283 return;
284 }
285 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
286 }
287
288 if (verbose) {
289 aslog(2) << "bytes accessed: ";
290 for (int i = 0; i < bytes_per_access; ++i) {
291 aslog(2) << byte + i << " ";
292 }
293 aslog(2) << "\n";
294 }
295
296 if (verbose) {
297 aslog(2) << "banks accessed: ";
298 }
299 for (int i = 0; i < bytes_per_access; ++i) {
300 int64_t word = (byte + i) / 4;
301 int64_t bank = word % 32;
302 if (verbose) {
303 aslog(2) << bank << " ";
304 }
305 bytes_accessed.insert(byte + i);
306 bank_to_words_accessed[bank].insert(word);
307 }
308 if (verbose) {
309 aslog(2) << "\n\n";
310 }
311 }
312
313 void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const {
314 int num_transactions_per_request = 0;
315 for (const auto &bank : bank_to_words_accessed) {
316 num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size());
317 }
318
319 num_transactions_per_request += unknown_banks;
320
321 if (verbose) {
322 if (is_tail_warp) {
323 aslog(2) << "tail_";
324 }
325 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
326 }
327
328 int num_bytes_used_per_request = bytes_accessed.size();
329
330 num_bytes_used_per_request += unknown_banks * bytes_per_access;
331
332 if (verbose) {
333 if (is_tail_warp) {
334 aslog(2) << "tail_";
335 }
336 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
337 }
338
339 shared_mem_info.add_access_info(
340 num_requests,
341 num_transactions_per_request,
342 num_bytes_used_per_request);
343 }
344
345private:
346 int bytes_per_access;
347 size_t dimensions;
348 Strides strides;
349 bool verbose;
350 int unknown_banks = 0;
351 std::unordered_set<int64_t> bytes_accessed;
352 std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
353};
354
356 LocalAccessAccumulator(int bytes_per_access, bool verbose)
357 : bytes_per_access{bytes_per_access},
358 verbose{verbose} {
359 }
360
361 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
362 if (!active) {
363 return;
364 }
365
366 ++thread_count;
367
368 if (verbose) {
369 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
370 }
371 }
372
373 void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const {
374 int num_bytes_used_per_request = thread_count * bytes_per_access;
375 int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction);
376 int num_transactions_per_request = sectors_accessed;
377
378 if (verbose) {
379 if (is_tail_warp) {
380 aslog(2) << "tail_";
381 }
382 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
383 }
384
385 if (verbose) {
386 if (is_tail_warp) {
387 aslog(2) << "tail_";
388 }
389 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
390 }
391
392 local_mem_info.add_access_info(
393 num_requests,
394 num_transactions_per_request,
395 num_bytes_used_per_request);
396 }
397
398private:
399 int bytes_per_access;
400 bool verbose;
401 int thread_count = 0;
402 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
403};
404
405} // namespace Autoscheduler
406} // namespace Internal
407} // namespace Halide
408
409#endif // GPU_MEM_INFO_H
#define internal_assert(c)
Definition Errors.h:19
typename MemTraits< T >::Accumulator Accumulator
Definition GPUMemInfo.h:53
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
signed __INT64_TYPE__ int64_t
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition GPUMemInfo.h:177
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:184
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:222
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:373
LocalAccessAccumulator(int bytes_per_access, bool verbose)
Definition GPUMemInfo.h:356
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:361
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
Definition GPUMemInfo.h:63
static constexpr double bytes_per_transaction
Definition GPUMemInfo.h:57
void add(const MemInfo< T > &other)
Definition GPUMemInfo.h:80
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:269
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition GPUMemInfo.h:262
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:313
bool valid(size_t loop_index) const
Definition GPUMemInfo.h:129
void add_valid(const std::vector< double > &strides)
Definition GPUMemInfo.h:121
Strides(const std::vector< int64_t > &storage_strides)
Definition GPUMemInfo.h:117
int64_t offset(size_t loop_index, int64_t point) const
Definition GPUMemInfo.h:133