4 #include <unordered_map>
5 #include <unordered_set>
19 namespace Autoscheduler {
// Forward declarations of the three access-accumulator structs, so that they
// can be named before their definitions.
22 struct GlobalAccessAccumulator;
24 struct SharedAccessAccumulator;
26 struct LocalAccessAccumulator;
// Transaction granularity in bytes (per the constant's name).
// NOTE(review): the three constants below presumably belong to the global,
// shared and local memory descriptions, in that order -- confirm against the
// enclosing declarations.
33 static constexpr
double bytes_per_transaction = 32;
// 128 bytes per transaction for this memory type.
40 static constexpr
double bytes_per_transaction = 128;
// 32 bytes per transaction for this memory type.
47 static constexpr
double bytes_per_transaction = 32;
60 return total_num_transactions;
// Records a batch of memory requests in the running totals.
//   num_requests                 - how many requests the batch contains
//   num_transactions_per_request - transactions needed by each request
//   num_bytes_used_per_request   - bytes used by each request
// The per-request figures are multiplied by num_requests and handed to
// update_totals().
63 void add_access_info(
double num_requests,
double num_transactions_per_request,
double num_bytes_used_per_request) {
// Batch-wide transaction count.
66 double total_transactions = num_requests * num_transactions_per_request;
// Batch-wide count of bytes used.
68 double total_bytes_used = num_requests * num_bytes_used_per_request;
// Diagnostic output of the derived totals and the raw inputs.
// NOTE(review): total_bytes is computed on a line not shown here -- presumably
// total_transactions scaled by bytes_per_transaction; confirm.
71 <<
"\ntotal_bytes_used = " << total_bytes_used
72 <<
"\ntotal_bytes = " << total_bytes
73 <<
"\ntotal_transactions = " << total_transactions
74 <<
"\nnum_transactions_per_request = " << num_transactions_per_request
75 <<
"\nnum_requests = " << num_requests;
// Fold this batch into the running totals.
77 update_totals(total_transactions, total_bytes_used, total_bytes);
81 total_num_transactions += other.total_num_transactions;
82 total_num_bytes_used += other.total_num_bytes_used;
83 total_num_bytes += other.total_num_bytes;
87 if (total_num_bytes == 0) {
91 double result = total_num_bytes_used / total_num_bytes;
// Adds the given amounts to the running totals.
// The statements shown accumulate num_bytes_used and num_bytes.
// NOTE(review): num_transactions is presumably added to
// total_num_transactions on a line not shown here -- confirm.
97 void update_totals(
double num_transactions,
double num_bytes_used,
double num_bytes) {
99 total_num_bytes_used += num_bytes_used;
100 total_num_bytes += num_bytes;
// Running totals, all initialised to zero. The visible code adds to them in
// update_totals() and when another instance's totals are merged in.
// total_num_bytes_used / total_num_bytes is evaluated as a ratio elsewhere,
// with a separate branch for total_num_bytes == 0.
103 double total_num_transactions = 0;
104 double total_num_bytes_used = 0;
105 double total_num_bytes = 0;
// Constructs the object from a list of storage strides, which is copied into
// the member of the same name. index_strides and is_valid start out empty and
// are filled through add().
117 Strides(
const std::vector<int64_t> &storage_strides)
118 : storage_strides{storage_strides} {
// Returns the validity flag stored for loop_index, i.e. the `e` value passed
// to the add() call that created that entry. No bounds check is performed, so
// loop_index must be smaller than the number of add() calls made so far.
129 bool valid(
size_t loop_index)
const {
130 return is_valid[loop_index];
135 internal_assert(index_strides[loop_index].size() == storage_strides.size());
138 for (
size_t i = 0; i < storage_strides.size(); ++i) {
139 result += (
int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
// Writes the stored strides to aslog(2).
// NOTE(review): `verbose` is not used on any line shown here; presumably it
// gates the output -- confirm.
144 void dump(
bool verbose =
false) {
// One message per storage stride: either "stride i: invalid" or the stride
// value. NOTE(review): the condition selecting between the two messages is
// on a line not shown here.
149 for (
size_t i = 0; i < storage_strides.size(); ++i) {
151 aslog(2) <<
"stride " << i <<
": invalid\n";
154 aslog(2) <<
"storage_stride " << i <<
": " << storage_strides[i] <<
"\n";
// Then every index stride, labelled with its outer index i and inner index j.
157 for (
size_t i = 0; i < index_strides.size(); ++i) {
158 for (
size_t j = 0; j < index_strides[i].size(); ++j) {
159 aslog(2) <<
"index_stride " << i <<
", storage_stride " << j <<
": " << index_strides[i][j] <<
" ";
// Appends one entry: `strides` becomes a new row of index_strides and `e`
// becomes the matching validity flag, so both vectors grow in lockstep and
// share the same index.
166 void add(
const std::vector<double> &strides,
bool e) {
167 index_strides.push_back(strides);
168 is_valid.push_back(e);
// Strides supplied at construction time.
171 std::vector<int64_t> storage_strides;
// One row per add() call. The offset computation asserts that the row it
// reads has exactly storage_strides.size() entries.
172 std::vector<std::vector<double>> index_strides;
// Validity flag for the row of index_strides with the same index.
173 std::vector<bool> is_valid;
178 : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
// Functor entry point: given a thread's id and its (x, y, z) coordinates,
// derives the byte address the thread accesses and records every byte of
// that access in sectors_accessed.
// NOTE(review): `active` and `last_thread` are not used on any line shown
// here.
181 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
// Debug trace of the thread and its coordinates.
187 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
// Coordinates indexed by dimension. NOTE(review): only three entries, so the
// loop below relies on dimensions <= 3.
190 int thread_ids[3] = {x, y, z};
192 for (
size_t i = 0; i < dimensions; ++i) {
// Dimensions whose stride is not valid take a separate path (not shown here).
193 if (!strides.valid(i)) {
// Add this dimension's contribution to the byte address.
197 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
201 aslog(2) <<
"byte accessed: " <<
byte <<
"\n";
206 aslog(2) <<
"sectors accessed: ";
// Visit each byte of the access individually.
208 for (
int i = 0; i < bytes_per_access; ++i) {
210 aslog(2) << sector <<
" ";
// Record byte address (byte + i) under its sector.
// NOTE(review): `sector` is computed on a line not shown here.
212 sectors_accessed[sector].insert(
byte + i);
220 int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
226 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
// Bytes used by one request: the number of distinct byte addresses recorded
// across all sectors.
229 int num_bytes_used_per_request = 0;
// Each map entry pairs a sector id with the set of byte addresses recorded
// for that sector; bind it by const reference to avoid copying the set.
230 for (
const auto &sector : sectors_accessed) {
231 num_bytes_used_per_request += sector.second.size();
// Every access counted in unknown_sectors is charged a full bytes_per_access.
234 num_bytes_used_per_request += unknown_sectors * bytes_per_access;
240 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
245 num_transactions_per_request,
246 num_bytes_used_per_request);
// Size in bytes of a single access; operator() visits this many consecutive
// byte addresses per thread.
250 int bytes_per_access;
// Count that is added to the transaction count and charged bytes_per_access
// bytes each. NOTE(review): presumably incremented when a stride is invalid,
// on a line not shown here -- confirm.
254 int unknown_sectors = 0;
// Maps a sector id to the set of byte addresses recorded for that sector.
255 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
260 : bytes_per_access{bytes_per_access}, dimensions{dimensions}, strides{strides}, verbose{verbose} {
// Functor entry point: given a thread's id and its (x, y, z) coordinates,
// derives the byte address the thread accesses, then records each accessed
// byte in bytes_accessed and each accessed word under its bank in
// bank_to_words_accessed.
// NOTE(review): `active` and `last_thread` are not used on any line shown
// here.
263 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
// Debug trace of the thread and its coordinates.
269 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
// Coordinates indexed by dimension. NOTE(review): only three entries, so the
// loop below relies on dimensions <= 3.
272 int thread_ids[3] = {x, y, z};
274 for (
size_t i = 0; i < dimensions; ++i) {
// Dimensions whose stride is not valid take a separate path (not shown here).
275 if (!strides.valid(i)) {
// Add this dimension's contribution to the byte address.
279 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
283 aslog(2) <<
"bytes accessed: ";
// Log every byte address covered by the access.
284 for (
int i = 0; i < bytes_per_access; ++i) {
285 aslog(2) <<
byte + i <<
" ";
291 aslog(2) <<
"banks accessed: ";
// Visit each byte of the access individually.
293 for (
int i = 0; i < bytes_per_access; ++i) {
297 aslog(2) << bank <<
" ";
// NOTE(review): `bank` and `word` are computed on lines not shown here. `bank`
// indexes a 32-element array, so it must lie in [0, 32).
299 bytes_accessed.insert(
byte + i);
300 bank_to_words_accessed[bank].insert(word);
308 int num_transactions_per_request = 0;
309 for (
const auto &bank : bank_to_words_accessed) {
310 num_transactions_per_request =
std::max(num_transactions_per_request, (
int)bank.size());
313 num_transactions_per_request += unknown_banks;
319 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
322 int num_bytes_used_per_request = bytes_accessed.size();
324 num_bytes_used_per_request += unknown_banks * bytes_per_access;
330 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
335 num_transactions_per_request,
336 num_bytes_used_per_request);
// Size in bytes of a single access; operator() visits this many consecutive
// byte addresses per thread.
340 int bytes_per_access;
// Count that is added to the transaction count and charged bytes_per_access
// bytes each. NOTE(review): presumably incremented when a stride is invalid,
// on a line not shown here -- confirm.
344 int unknown_banks = 0;
// Distinct byte addresses recorded so far; its size seeds the per-request
// byte count.
345 std::unordered_set<int64_t> bytes_accessed;
// One set of words per bank, 32 banks in total. The largest set size over all
// banks is used as the per-request transaction count.
346 std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
351 : bytes_per_access{bytes_per_access}, verbose{verbose} {
// Functor entry point taking a thread's id and its (x, y, z) coordinates.
// The only statement shown here is a debug trace of those values.
// NOTE(review): thread_count is presumably updated on a line not shown here,
// and `active` / `last_thread` are not used on any visible line -- confirm.
354 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
362 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
367 int num_bytes_used_per_request = thread_count * bytes_per_access;
369 int num_transactions_per_request = sectors_accessed;
375 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
382 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
387 num_transactions_per_request,
388 num_bytes_used_per_request);
// Size in bytes of a single access; multiplied by thread_count to obtain the
// bytes used per request.
392 int bytes_per_access;
// Number of threads counted so far, starting at zero.
394 int thread_count = 0;
// NOTE(review): the visible code assigns a value named `sectors_accessed` to
// an int (num_transactions_per_request), which implies a local variable of
// the same name shadows this member there. This map may therefore be unused
// -- confirm before relying on it.
395 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
402 #endif // GPU_MEM_INFO_H