18namespace Autoscheduler {
20static constexpr int MAX_THREADS_PER_BLOCK = 1024;
41 ThreadInfo(
int vectorized_loop_index,
const std::vector<int64_t> &size,
const std::vector<FunctionDAG::Node::Loop> &loop,
const std::vector<int64_t> &max_thread_counts) {
42 init_threads_in_this_block(max_thread_counts);
44 std::size_t num_thread_loops = 0;
46 if (vectorized_loop_index != -1 && size[vectorized_loop_index] != 1) {
47 threads[num_thread_loops] = size[vectorized_loop_index];
51 loop_vars.push_back(loop[vectorized_loop_index].var);
54 for (std::size_t i = 0; i < size.size() && num_thread_loops < 3; i++) {
55 if (size[i] == 1 || (
int)i == vectorized_loop_index) {
59 if (
num_threads * size[i] > MAX_THREADS_PER_BLOCK) {
63 threads[num_thread_loops] = size[i];
83 count_num_active_warps_per_block();
108 template<
typename Fn>
123 bool last_thread = thread_id == 31;
124 fn(thread_id, x, y, z, active, last_thread);
135 template<
typename Fn>
140 for (; thread_id <= last_thread_id; ++thread_id) {
151 fn(thread_id, x, y, z, active, thread_id == last_thread_id);
155 template<
typename Fn>
162 fn(thread_id, is_last_thread);
175 return (
double)
num_threads / MAX_THREADS_PER_BLOCK;
196 void init_threads_in_this_block(
const std::vector<int64_t> &max_thread_counts) {
197 int num_thread_loops = 0;
198 for (
auto c : max_thread_counts) {
218 void count_num_active_warps_per_block() {
219 bool current_warp_is_active =
false;
220 int num_active_threads_in_cur_warp = 0;
221 int num_active_threads_in_first_warp = 0;
222 int num_threads_in_cur_warp = 0;
223 bool first_warp =
true;
226 current_warp_is_active |= is_active;
229 ++num_active_threads_in_cur_warp;
232 ++num_threads_in_cur_warp;
234 if ((thread_id + 1) % 32 == 0 || is_last_thread) {
235 if (current_warp_is_active) {
240 num_active_threads_in_first_warp = num_active_threads_in_cur_warp;
243 if (is_last_thread) {
245 has_tail_warp = num_active_threads_in_first_warp != num_active_threads_in_cur_warp;
252 current_warp_is_active =
false;
253 num_threads_in_cur_warp = 0;
254 num_active_threads_in_cur_warp = 0;
#define internal_assert(c)
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
signed __INT64_TYPE__ int64_t
std::vector< std::string > loop_vars
void for_each_thread_id_in_tail_warp(Fn &fn) const
int64_t num_active_threads
double block_occupancy() const
int num_threads_in_final_warp
double warp_lane_utilization() const
double idle_lane_wastage() const
void for_each_active_thread_id(const Fn &fn) const
ThreadInfo(int vectorized_loop_index, const std::vector< int64_t > &size, const std::vector< FunctionDAG::Node::Loop > &loop, const std::vector< int64_t > &max_thread_counts)
int num_active_warps_per_block
void for_each_thread_id(const Fn &fn) const
int threads_in_this_block[3]
int64_t num_threads_in_this_block
int final_warp_initial_thread_id
void for_each_thread_id_in_first_warp(Fn &fn) const
int num_regular_active_warps_per_block
std::vector< int > loop_indices
ThreadTileOption & operator=(const ThreadTileOption &)=delete
double max_idle_lane_wastage
bool operator<(const ThreadTileOption &other) const
IntrusivePtr< const LoopNest > loop_nest
ThreadTileOption()=default
ThreadTileOption & operator=(ThreadTileOption &&)=default
ThreadTileOption(ThreadTileOption &&)=default
ThreadTileOption(const ThreadTileOption &)=delete
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.