Halide 21.0.0
Halide compiler and libraries
Loading...
Searching...
No Matches
GPUMemInfo.h
Go to the documentation of this file.
1#ifndef GPU_MEM_INFO_H
2#define GPU_MEM_INFO_H
3
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "ASLog.h"
9
10/** \file
11 *
12 * Data structures that help track memory access information. Useful when
13 * computing GPU features
14 */
15
16namespace Halide {
17namespace Internal {
18namespace Autoscheduler {
19
// Tag types for the three GPU memory spaces modeled below, and the
// per-warp access accumulators (defined later in this file) that
// summarize accesses to each space.
struct GlobalMem;
struct GlobalAccessAccumulator;
struct SharedMem;
struct SharedAccessAccumulator;
struct LocalMem;
struct LocalAccessAccumulator;
26
27template<typename T>
28struct MemTraits;
29
30template<>
31struct MemTraits<GlobalMem> {
32 static constexpr double bytes_per_transaction = 32;
33 using MemInfoType = GlobalMem;
35};
36
37template<>
38struct MemTraits<SharedMem> {
39 static constexpr double bytes_per_transaction = 128;
40 using MemInfoType = SharedMem;
42};
43
44template<>
45struct MemTraits<LocalMem> {
46 static constexpr double bytes_per_transaction = 32;
47 using MemInfoType = GlobalMem; // Local mem behaves similarly to global mem
49};
50
51template<typename T>
53
54template<typename T>
55struct MemInfo {
57
58 double num_transactions() const {
59 return total_num_transactions;
60 }
61
62 void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request) {
63 internal_assert(num_bytes_used_per_request > 0);
64
65 double total_transactions = num_requests * num_transactions_per_request;
66 double total_bytes = total_transactions * bytes_per_transaction;
67 double total_bytes_used = num_requests * num_bytes_used_per_request;
68
69 internal_assert(total_bytes_used <= total_bytes)
70 << "\ntotal_bytes_used = " << total_bytes_used
71 << "\ntotal_bytes = " << total_bytes
72 << "\ntotal_transactions = " << total_transactions
73 << "\nnum_transactions_per_request = " << num_transactions_per_request
74 << "\nnum_requests = " << num_requests;
75
76 update_totals(total_transactions, total_bytes_used, total_bytes);
77 }
78
79 void add(const MemInfo<T> &other) {
80 total_num_transactions += other.total_num_transactions;
81 total_num_bytes_used += other.total_num_bytes_used;
82 total_num_bytes += other.total_num_bytes;
83 }
84
85 double efficiency() const {
86 if (total_num_bytes == 0) {
87 return 1;
88 }
89
90 double result = total_num_bytes_used / total_num_bytes;
91 internal_assert(result <= 1);
92 return result;
93 }
94
95private:
96 void update_totals(double num_transactions, double num_bytes_used, double num_bytes) {
97 total_num_transactions += num_transactions;
98 total_num_bytes_used += num_bytes_used;
99 total_num_bytes += num_bytes;
100 }
101
102 double total_num_transactions = 0;
103 double total_num_bytes_used = 0;
104 double total_num_bytes = 0;
105};
106
107template<typename T>
109
113
114struct Strides {
115public:
116 explicit Strides(const std::vector<int64_t> &storage_strides)
117 : storage_strides{storage_strides} {
118 }
119
120 void add_valid(const std::vector<double> &strides) {
121 add(strides, true);
122 }
123
124 void add_invalid() {
125 add({}, false);
126 }
127
128 bool valid(size_t loop_index) const {
129 return is_valid[loop_index];
130 }
131
132 int64_t offset(size_t loop_index, int64_t point) const {
133 internal_assert(loop_index < is_valid.size() && valid(loop_index));
134 internal_assert(index_strides[loop_index].size() == storage_strides.size());
135
136 int64_t result = 0;
137 for (size_t i = 0; i < storage_strides.size(); ++i) {
138 result += (int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
139 }
140 return std::abs(result);
141 }
142
143 void dump(bool verbose = false) {
144 if (!verbose) {
145 return;
146 }
147
148 for (size_t i = 0; i < storage_strides.size(); ++i) {
149 if (!valid(i)) {
150 aslog(2) << "stride " << i << ": invalid\n";
151 continue;
152 }
153 aslog(2) << "storage_stride " << i << ": " << storage_strides[i] << "\n";
154 }
155
156 for (size_t i = 0; i < index_strides.size(); ++i) {
157 for (size_t j = 0; j < index_strides[i].size(); ++j) {
158 aslog(2) << "index_stride " << i << ", storage_stride " << j << ": " << index_strides[i][j] << " ";
159 }
160 aslog(2) << "\n";
161 }
162 }
163
164private:
165 void add(const std::vector<double> &strides, bool e) {
166 index_strides.push_back(strides);
167 is_valid.push_back(e);
168 }
169
170 std::vector<int64_t> storage_strides;
171 std::vector<std::vector<double>> index_strides;
172 std::vector<bool> is_valid;
173};
174
176 GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
177 : bytes_per_access{bytes_per_access},
178 dimensions{dimensions},
179 strides{strides},
180 verbose{verbose} {
181 }
182
183 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
184 if (!active) {
185 return;
186 }
187
188 if (verbose) {
189 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
190 }
191
192 int thread_ids[3] = {x, y, z};
193 int64_t byte = 0;
194 for (size_t i = 0; i < dimensions; ++i) {
195 if (!strides.valid(i)) {
196 ++unknown_sectors;
197 return;
198 }
199 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
200 }
201
202 if (verbose) {
203 aslog(2) << "byte accessed: " << byte << "\n";
204 }
205
206 int64_t sector = byte / 32;
207 if (verbose) {
208 aslog(2) << "sectors accessed: ";
209 }
210 for (int i = 0; i < bytes_per_access; ++i) {
211 if (verbose) {
212 aslog(2) << sector << " ";
213 }
214 sectors_accessed[sector].insert(byte + i);
215 }
216 if (verbose) {
217 aslog(2) << "\n\n";
218 }
219 }
220
221 void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const {
222 int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
223
224 if (verbose) {
225 if (is_tail_warp) {
226 aslog(2) << "tail_";
227 }
228 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
229 }
230
231 int num_bytes_used_per_request = 0;
232 for (const auto &sector : sectors_accessed) {
233 num_bytes_used_per_request += sector.second.size();
234 }
235
236 num_bytes_used_per_request += unknown_sectors * bytes_per_access;
237
238 if (verbose) {
239 if (is_tail_warp) {
240 aslog(2) << "tail_";
241 }
242 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
243 }
244
245 global_mem_info.add_access_info(
246 num_requests,
247 num_transactions_per_request,
248 num_bytes_used_per_request);
249 }
250
251private:
252 int bytes_per_access;
253 size_t dimensions;
254 Strides strides;
255 bool verbose;
256 int unknown_sectors = 0;
257 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
258};
259
261 SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
262 : bytes_per_access{bytes_per_access},
263 dimensions{dimensions},
264 strides{strides},
265 verbose{verbose} {
266 }
267
268 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
269 if (!active) {
270 return;
271 }
272
273 if (verbose) {
274 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
275 }
276
277 int thread_ids[3] = {x, y, z};
278 int64_t byte = 0;
279 for (size_t i = 0; i < dimensions; ++i) {
280 if (!strides.valid(i)) {
281 ++unknown_banks;
282 return;
283 }
284 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
285 }
286
287 if (verbose) {
288 aslog(2) << "bytes accessed: ";
289 for (int i = 0; i < bytes_per_access; ++i) {
290 aslog(2) << byte + i << " ";
291 }
292 aslog(2) << "\n";
293 }
294
295 if (verbose) {
296 aslog(2) << "banks accessed: ";
297 }
298 for (int i = 0; i < bytes_per_access; ++i) {
299 int64_t word = (byte + i) / 4;
300 int64_t bank = word % 32;
301 if (verbose) {
302 aslog(2) << bank << " ";
303 }
304 bytes_accessed.insert(byte + i);
305 bank_to_words_accessed[bank].insert(word);
306 }
307 if (verbose) {
308 aslog(2) << "\n\n";
309 }
310 }
311
312 void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const {
313 int num_transactions_per_request = 0;
314 for (const auto &bank : bank_to_words_accessed) {
315 num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size());
316 }
317
318 num_transactions_per_request += unknown_banks;
319
320 if (verbose) {
321 if (is_tail_warp) {
322 aslog(2) << "tail_";
323 }
324 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
325 }
326
327 int num_bytes_used_per_request = bytes_accessed.size();
328
329 num_bytes_used_per_request += unknown_banks * bytes_per_access;
330
331 if (verbose) {
332 if (is_tail_warp) {
333 aslog(2) << "tail_";
334 }
335 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
336 }
337
338 shared_mem_info.add_access_info(
339 num_requests,
340 num_transactions_per_request,
341 num_bytes_used_per_request);
342 }
343
344private:
345 int bytes_per_access;
346 size_t dimensions;
347 Strides strides;
348 bool verbose;
349 int unknown_banks = 0;
350 std::unordered_set<int64_t> bytes_accessed;
351 std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
352};
353
355 LocalAccessAccumulator(int bytes_per_access, bool verbose)
356 : bytes_per_access{bytes_per_access},
357 verbose{verbose} {
358 }
359
360 void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
361 if (!active) {
362 return;
363 }
364
365 ++thread_count;
366
367 if (verbose) {
368 aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
369 }
370 }
371
372 void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const {
373 int num_bytes_used_per_request = thread_count * bytes_per_access;
374 int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction);
375 int num_transactions_per_request = sectors_accessed;
376
377 if (verbose) {
378 if (is_tail_warp) {
379 aslog(2) << "tail_";
380 }
381 aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
382 }
383
384 if (verbose) {
385 if (is_tail_warp) {
386 aslog(2) << "tail_";
387 }
388 aslog(2) << "num_requests_per_block = " << num_requests << "\n";
389 }
390
391 local_mem_info.add_access_info(
392 num_requests,
393 num_transactions_per_request,
394 num_bytes_used_per_request);
395 }
396
397private:
398 int bytes_per_access;
399 bool verbose;
400 int thread_count = 0;
401 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
402};
403
404} // namespace Autoscheduler
405} // namespace Internal
406} // namespace Halide
407
408#endif // GPU_MEM_INFO_H
#define internal_assert(c)
Definition Error.h:218
MemInfoType< SharedMem > SharedMemInfo
Definition GPUMemInfo.h:111
MemInfo< typename MemTraits< T >::MemInfoType > MemInfoType
Definition GPUMemInfo.h:108
MemInfoType< LocalMem > LocalMemInfo
Definition GPUMemInfo.h:112
typename MemTraits< T >::Accumulator Accumulator
Definition GPUMemInfo.h:52
MemInfoType< GlobalMem > GlobalMemInfo
Definition GPUMemInfo.h:110
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
signed __INT64_TYPE__ int64_t
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition GPUMemInfo.h:176
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:183
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:221
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:372
LocalAccessAccumulator(int bytes_per_access, bool verbose)
Definition GPUMemInfo.h:355
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:360
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
Definition GPUMemInfo.h:62
void add(const MemInfo< T > &other)
Definition GPUMemInfo.h:79
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition GPUMemInfo.h:268
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition GPUMemInfo.h:261
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
Definition GPUMemInfo.h:312
bool valid(size_t loop_index) const
Definition GPUMemInfo.h:128
void add_valid(const std::vector< double > &strides)
Definition GPUMemInfo.h:120
Strides(const std::vector< int64_t > &storage_strides)
Definition GPUMemInfo.h:116
int64_t offset(size_t loop_index, int64_t point) const
Definition GPUMemInfo.h:132