4#include <unordered_map>
5#include <unordered_set>
59 return total_num_transactions;
62 void add_access_info(
double num_requests,
double num_transactions_per_request,
double num_bytes_used_per_request) {
65 double total_transactions = num_requests * num_transactions_per_request;
67 double total_bytes_used = num_requests * num_bytes_used_per_request;
70 <<
"\ntotal_bytes_used = " << total_bytes_used
71 <<
"\ntotal_bytes = " << total_bytes
72 <<
"\ntotal_transactions = " << total_transactions
73 <<
"\nnum_transactions_per_request = " << num_transactions_per_request
74 <<
"\nnum_requests = " << num_requests;
76 update_totals(total_transactions, total_bytes_used, total_bytes);
80 total_num_transactions += other.total_num_transactions;
81 total_num_bytes_used += other.total_num_bytes_used;
82 total_num_bytes += other.total_num_bytes;
86 if (total_num_bytes == 0) {
90 double result = total_num_bytes_used / total_num_bytes;
96 void update_totals(
double num_transactions,
double num_bytes_used,
double num_bytes) {
98 total_num_bytes_used += num_bytes_used;
99 total_num_bytes += num_bytes;
102 double total_num_transactions = 0;
103 double total_num_bytes_used = 0;
104 double total_num_bytes = 0;
116 explicit Strides(
const std::vector<int64_t> &storage_strides)
117 : storage_strides{storage_strides} {
128 bool valid(
size_t loop_index)
const {
129 return is_valid[loop_index];
134 internal_assert(index_strides[loop_index].size() == storage_strides.size());
137 for (
size_t i = 0; i < storage_strides.size(); ++i) {
138 result += (
int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
140 return std::abs(result);
143 void dump(
bool verbose =
false) {
148 for (
size_t i = 0; i < storage_strides.size(); ++i) {
150 aslog(2) <<
"stride " << i <<
": invalid\n";
153 aslog(2) <<
"storage_stride " << i <<
": " << storage_strides[i] <<
"\n";
156 for (
size_t i = 0; i < index_strides.size(); ++i) {
157 for (
size_t j = 0; j < index_strides[i].size(); ++j) {
158 aslog(2) <<
"index_stride " << i <<
", storage_stride " << j <<
": " << index_strides[i][j] <<
" ";
165 void add(
const std::vector<double> &strides,
bool e) {
166 index_strides.push_back(strides);
167 is_valid.push_back(e);
170 std::vector<int64_t> storage_strides;
171 std::vector<std::vector<double>> index_strides;
172 std::vector<bool> is_valid;
177 : bytes_per_access{bytes_per_access},
178 dimensions{dimensions},
183 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
189 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
192 int thread_ids[3] = {x, y, z};
194 for (
size_t i = 0; i < dimensions; ++i) {
195 if (!strides.valid(i)) {
199 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
203 aslog(2) <<
"byte accessed: " <<
byte <<
"\n";
208 aslog(2) <<
"sectors accessed: ";
210 for (
int i = 0; i < bytes_per_access; ++i) {
212 aslog(2) << sector <<
" ";
214 sectors_accessed[sector].insert(
byte + i);
222 int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
228 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
231 int num_bytes_used_per_request = 0;
232 for (
const auto &sector : sectors_accessed) {
233 num_bytes_used_per_request += sector.second.size();
236 num_bytes_used_per_request += unknown_sectors * bytes_per_access;
242 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
247 num_transactions_per_request,
248 num_bytes_used_per_request);
252 int bytes_per_access;
256 int unknown_sectors = 0;
257 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
262 : bytes_per_access{bytes_per_access},
263 dimensions{dimensions},
268 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
274 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
277 int thread_ids[3] = {x, y, z};
279 for (
size_t i = 0; i < dimensions; ++i) {
280 if (!strides.valid(i)) {
284 byte += bytes_per_access * strides.offset(i, thread_ids[i]);
288 aslog(2) <<
"bytes accessed: ";
289 for (
int i = 0; i < bytes_per_access; ++i) {
290 aslog(2) <<
byte + i <<
" ";
296 aslog(2) <<
"banks accessed: ";
298 for (
int i = 0; i < bytes_per_access; ++i) {
302 aslog(2) << bank <<
" ";
304 bytes_accessed.insert(
byte + i);
305 bank_to_words_accessed[bank].insert(word);
313 int num_transactions_per_request = 0;
314 for (
const auto &bank : bank_to_words_accessed) {
315 num_transactions_per_request = std::max(num_transactions_per_request, (
int)bank.size());
318 num_transactions_per_request += unknown_banks;
324 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
327 int num_bytes_used_per_request = bytes_accessed.size();
329 num_bytes_used_per_request += unknown_banks * bytes_per_access;
335 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
340 num_transactions_per_request,
341 num_bytes_used_per_request);
345 int bytes_per_access;
349 int unknown_banks = 0;
350 std::unordered_set<int64_t> bytes_accessed;
351 std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
356 : bytes_per_access{bytes_per_access},
360 void operator()(
int thread_id,
int x,
int y,
int z,
int active,
bool last_thread) {
368 aslog(2) <<
"thread_id: " << thread_id <<
" (" << x <<
", " << y <<
", " << z <<
")\n";
373 int num_bytes_used_per_request = thread_count * bytes_per_access;
375 int num_transactions_per_request = sectors_accessed;
381 aslog(2) <<
"num_transactions_per_request = " << num_transactions_per_request <<
"\n";
388 aslog(2) <<
"num_requests_per_block = " << num_requests <<
"\n";
393 num_transactions_per_request,
394 num_bytes_used_per_request);
398 int bytes_per_access;
400 int thread_count = 0;
401 std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
#define internal_assert(c)
MemInfoType< SharedMem > SharedMemInfo
MemInfo< typename MemTraits< T >::MemInfoType > MemInfoType
MemInfoType< LocalMem > LocalMemInfo
typename MemTraits< T >::Accumulator Accumulator
MemInfoType< GlobalMem > GlobalMemInfo
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
signed __INT64_TYPE__ int64_t
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
LocalAccessAccumulator(int bytes_per_access, bool verbose)
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
static constexpr double bytes_per_transaction
double efficiency() const
double num_transactions() const
void add(const MemInfo< T > &other)
static constexpr double bytes_per_transaction
GlobalAccessAccumulator Accumulator
static constexpr double bytes_per_transaction
LocalAccessAccumulator Accumulator
SharedAccessAccumulator Accumulator
static constexpr double bytes_per_transaction
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
bool valid(size_t loop_index) const
void dump(bool verbose=false)
void add_valid(const std::vector< double > &strides)
Strides(const std::vector< int64_t > &storage_strides)
int64_t offset(size_t loop_index, int64_t point) const