namespace Autoscheduler {

bool all(const vector<int> &v);

std::set<const FunctionDAG::Node *> store_at;
mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;

// Recovered fragments of the LoopNest header's source listing; the complete
// declarations for the surrounding members and methods appear in the list below.

// hash_combine(h, next): Boost-style hash mixing.
h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));

// funcs_realized_or_inlined(): the count is accumulated recursively over the children.
count += c->funcs_realized_or_inlined();

// set_working_set_at_task_feature(): recurse into each child c and record the
// working set for that child's stage.
c->set_working_set_at_task_feature(working_set, features);
features->get(c->stage).working_set_at_task = working_set;

// The root loop nest is the one with no associated FunctionDAG node.
return node == nullptr;

// set_bounds(): cache the bound b computed for function f.
return bounds.emplace(f, b);

// Filter: diagnostic output emitted when a candidate state is rejected.
std::cerr << "\nState filtered: \n";
std::cerr << "Reason: ";
std::cerr << std::forward<T>(x);
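The hash_combine fragment above is the standard Boost-style mixing step. A minimal self-contained sketch of how such a helper is typically applied follows; hash_of_sizes is a hypothetical illustration, not a function declared in this header.

#include <cstdint>
#include <vector>

// Mix 'next' into the running hash 'h' using the golden-ratio constant.
static void hash_combine(uint64_t &h, uint64_t next) {
    h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
}

// Hypothetical helper: fold a vector of loop extents into a single hash value.
static uint64_t hash_of_sizes(const std::vector<int64_t> &size) {
    uint64_t h = 0;
    for (int64_t s : size) {
        hash_combine(h, static_cast<uint64_t>(s));
    }
    return h;
}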
Data structure containing information about the current GPU loop nest hierarchy of blocks,...
Data structures that help track memory access information.
Data structure containing information about GPU threads for a particular location in the loop nest ...
A class representing a reference count to be used with IntrusivePtr.
A reference to a site in a Halide statement at the top of the body of a particular for loop.
A Halide variable, to be used when defining functions.
int64_t get_active_block_hardware_limit(const Anderson2021Params &params)
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
bool all(const vector< int > &v)
bool are_valid_thread_extents(const vector< int64_t > &counts)
bool in_range_zero_one(double x)
bool accessed_at_constant_indices(const std::vector< int > &unrolled, const FunctionDAG::Edge *e)
constexpr int64_t get_register_mem_alloc_limit()
double get_idle_lane_wastage_limit()
int64_t get_shared_memory_sm_limit(const Anderson2021Params &params)
PerfectHashMap< FunctionDAG::Node, T > NodeMap
int64_t get_active_warp_hardware_limit(const Anderson2021Params &params)
bool may_subtile(const Anderson2021Params &params)
double get_idle_lane_wastage_limit_env_var()
int get_unroll_limit(const Target &target)
int64_t get_shared_memory_limit(const Anderson2021Params &params)
std::string stringify(GPU_parallelism label)
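stringify turns a GPU_parallelism label into text (e.g., for dumps). A minimal sketch, assuming the labels are Block, Thread, Serial, Simd, Parallelized, and None; the enumerator names and returned strings here are illustrative, not taken from the header.

#include <string>

enum class GPU_parallelism { Block, Thread, Serial, Simd, Parallelized, None };

std::string stringify(GPU_parallelism label) {
    switch (label) {
    case GPU_parallelism::Block: return "block";
    case GPU_parallelism::Thread: return "thread";
    case GPU_parallelism::Serial: return "serial";
    case GPU_parallelism::Simd: return "simd";
    case GPU_parallelism::Parallelized: return "parallelized";
    case GPU_parallelism::None: return "none";
    }
    return "unknown";
}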
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Internal: Not visible externally, similar to 'static' linkage in C.
const LoopNest * loop_nest
Filter(const LoopNest *loop_nest)
Filter & operator<<(T &&x)
static bool enable_filter_printing()
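The Filter members above, together with the std::cerr fragments in the source excerpt, suggest the usage pattern sketched below: construct a Filter for the loop nest being rejected, then stream the reason. The stand-in LoopNestStub type, the environment-variable name, and the constructor body are assumptions for illustration.

#include <cstdlib>
#include <iostream>
#include <utility>

struct LoopNestStub {};  // stand-in for Autoscheduler::LoopNest

class Filter {
public:
    const LoopNestStub *loop_nest;

    explicit Filter(const LoopNestStub *loop_nest) : loop_nest(loop_nest) {
        if (enable_filter_printing()) {
            std::cerr << "\nState filtered: \n";
            // The real class presumably dumps the loop nest here.
            std::cerr << "Reason: ";
        }
    }

    template<typename T>
    Filter &operator<<(T &&x) {
        if (enable_filter_printing()) {
            std::cerr << std::forward<T>(x);
        }
        return *this;
    }

    static bool enable_filter_printing() {
        // Assumed: gated by an environment variable; the name here is made up.
        static const bool enabled = std::getenv("ENABLE_FILTER_PRINTING") != nullptr;
        return enabled;
    }
};

// Usage: explain why a candidate schedule state was discarded.
// Filter(&nest) << "shared memory allocation exceeds the limit\n";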
uint64_t hash_of_producers_stored_at_root
bool is_stored_in_registers() const
const LoopNest * innermost
bool is_stored_in_shared_mem() const
bool is_stored_in_local_mem() const
GPUMemoryType gpu_store_memory_type
std::vector< const LoopNest * > inlined_innermosts
bool is_stored_in_global_mem() const
bool is_constant_allocation
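The is_stored_in_* predicates and gpu_store_memory_type above record where each allocation lands on the GPU. A minimal sketch of the idea follows; the GPUMemoryType enumerators and the selection logic in get_gpu_memory_type_sketch are assumptions based only on the parameter names of get_gpu_memory_type(in_block, in_thread, is_inlined).

// Illustrative memory categories for a GPU allocation site.
enum class GPUMemoryType { Global, Shared, Local, Registers, Inlined };

struct SitesSketch {
    GPUMemoryType gpu_store_memory_type = GPUMemoryType::Global;

    bool is_stored_in_global_mem() const { return gpu_store_memory_type == GPUMemoryType::Global; }
    bool is_stored_in_shared_mem() const { return gpu_store_memory_type == GPUMemoryType::Shared; }
    bool is_stored_in_local_mem() const { return gpu_store_memory_type == GPUMemoryType::Local; }
    bool is_stored_in_registers() const { return gpu_store_memory_type == GPUMemoryType::Registers; }
};

// Plausible mapping from loop position to memory type: outside any GPU block ->
// global, inside a block but outside the thread loop -> shared, inside a thread ->
// local (possibly promoted to registers later), inlined functions -> inlined.
GPUMemoryType get_gpu_memory_type_sketch(bool in_block, bool in_thread, bool is_inlined) {
    if (is_inlined) return GPUMemoryType::Inlined;
    if (in_thread) return GPUMemoryType::Local;
    if (in_block) return GPUMemoryType::Shared;
    return GPUMemoryType::Global;
}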
vector< StageScheduleState * > ancestors
const FunctionDAG::Node * node
int vectorized_loop_index
vector< int64_t > gpu_thread_extents
bool all_innermost_unrolled
vector< FuncVar > ordered_vars
const FunctionDAG::Node::Stage * stage
std::ostringstream schedule_source
NodeMap< std::vector< std::pair< const LoopNest *, std::vector< const FunctionDAG::Edge * > > > > producers_to_be_staged
vector< pair< int, int > > collect_producers(const StageMap< Sites > &sites) const
bool is_gpu_thread(const Target &target) const
vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Anderson2021Params ¶ms, const Target &target, const SearchSpaceOptions &search_space_options, int v, bool in_realization, bool in_threads_loop, bool is_pre_pass, vector< int64_t > union_counts=vector< int64_t >()) const
int vectorized_loop_index
const LoopNest * get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const
int num_serial_loops(const FunctionDAG::Node::Stage *stage) const
int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector< const FunctionDAG::Edge * > &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const
bool has_constant_region_required(const FunctionDAG::Node *node) const
int num_serial_loops() const
std::map< uint64_t, StageMap< ScheduleFeatures > > features
int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const
const FunctionDAG::Node * node
void dump(T &stream, string prefix, const LoopNest *parent) const
int64_t product_of_self_and_descendants(int loop_index) const
GPU_parallelism gpu_label
bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const
void inline_func(const FunctionDAG::Node *f)
void generate_vec_dim_serial_tilings(vector< int > &serial_sizes) const
bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const
void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector< int64_t > &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose=false) const
int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const
int64_t product_of_descendants(int loop_index) const
bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const
bool is_gpu_block(const Target &target) const
bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const
bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const
bool accesses_input_buffer() const
std::vector< int64_t > size
Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector< const FunctionDAG::Edge * > &edge_chain) const
int64_t get_total_constant_local_mem_alloc_size() const
const Bound & get_bounds(const FunctionDAG::Node *f) const
std::pair< double, double > compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const
int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose=false) const
bool all_paths_to_leaves_have_thread_loop() const
GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const
void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const
vector< int64_t > get_union_thread_counts(const FunctionDAG::Node *f) const
void collect_nodes_that_should_be_inlined(const NodeMap< bool > &nodes_to_freeze, NodeMap< bool > &inlined_nodes) const
bool has_valid_thread_extents() const
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState > > &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site, const Target &target, std::vector< StageScheduleState * > &ancestors, const NodeMap< bool > &all_inlined) const
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates
int get_actual_vector_dim(const Bound &store_bounds) const
void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const
Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose=false) const
int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const
bool computes(const FunctionDAG::Node *f) const
double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const
std::vector< IntrusivePtr< const LoopNest > > children
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
void copy_from(const LoopNest &n)
bool promote_allocs_to_registers(const Target &target, StageMap< Sites > &sites) const
std::pair< int64_t, int64_t > get_block_and_serial_extents(const LoopNest *block) const
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
bool has_constant_region_computed(const FunctionDAG::Node *node) const
double compute_local_mem_stride(double stride, double bytes) const
MemInfoType< T > compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, const Anderson2021Params ¶ms, const Target &target, int v, vector< IntrusivePtr< const LoopNest > > &result, const vector< int64_t > &max_size)
IntrusivePtr< const LoopNest > parallelize_in_tiles(const vector< int64_t > &tiling, const LoopNest *parent, const Anderson2021Params ¶ms, const Target &target, bool inner_tiling, bool adjust_tiling, bool move_all_rvars_inward=true, const vector< int > &rvars_to_move_inward={}) const
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
void collect_all_inlined(NodeMap< bool > &all_inlined) const
std::vector< int > unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const
std::string to_string() const
bool is_gpu_serial(const Target &target) const
double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const
static void hash_combine(uint64_t &h, uint64_t next)
const LoopNest * find_pure_stage_loop_nest(const FunctionDAG::Node *node) const
void dump(std::ostream &os, string prefix, const LoopNest *parent) const
void get_stage_sizes(const FunctionDAG::Node *f, vector< vector< int64_t > > &stage_sizes, vector< vector< int > > &pure_dims, vector< int > &vectorized_indices) const
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const
int64_t max_inlined_calls() const
const FunctionDAG::Node::Stage * stage
bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const
void get_stages_computed_in_each_compute_root_loop(StageMap< StageMap< bool > > &descendants, const LoopNest *compute_root_loop_nest=nullptr) const
void compute_features(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, GPULoopInfo gpu_loop_info, bool use_memoized_features, const StageMap< int64_t > &total_shared_mem_alloc_sizes, int64_t *working_set, int64_t *working_set_local_constant, int64_t *working_set_local_dynamic, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
void get_sites(const Target &target, StageMap< Sites > &sites, StageMap< int64_t > &shared_mem_alloc_sizes, const LoopNest *task=nullptr, const LoopNest *parent=nullptr, const LoopNest *current_thread_loop=nullptr) const
bool calls(const FunctionDAG::Node *f) const
bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const
void copy_from_including_features(const LoopNest &n)
bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const
void update_producers_to_be_staged(StageScheduleState &state, const NodeMap< bool > &all_inlined) const
bool has_thread_loop_descendant() const
bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const
vector< IntrusivePtr< const LoopNest > > children
void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType< T > &mem_info, double serial_loop_extents, bool verbose=false) const
void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap< Sites > &sites, NodeMap< bool > &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const
void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const
void structural_hash(uint64_t &h, int depth) const
size_t funcs_realized_or_inlined() const
void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType< T > &mem_info, bool verbose=false) const
std::pair< const LoopNest *, const LoopNest * > find_innermost_and_parent() const
std::set< const FunctionDAG::Node * > store_at
std::pair< int64_t, bool > compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const
bool compute_here(const FunctionDAG::Node *f, bool tileable, int v, bool in_threads_loop, const Anderson2021Params ¶ms, const Target &target)
int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const
NodeMap< int64_t > inlined
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const
int vectorized_access_size(size_t loop_index, bool verbose=false) const
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
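As noted above, the reference count is stored inside the shared object itself rather than in a separate control block. A generic single-threaded sketch of the pattern follows; Halide's actual IntrusivePtr and RefCount differ in details (for instance, the real count is safe to update concurrently).

// The count lives in the object being shared.
struct RefCountedThing {
    mutable int ref_count = 0;
};

template<typename T>
class IntrusivePtrSketch {
    T *ptr = nullptr;

    void incref() {
        if (ptr) {
            ptr->ref_count++;
        }
    }
    void decref() {
        if (ptr && --ptr->ref_count == 0) {
            delete ptr;
        }
        ptr = nullptr;
    }

public:
    IntrusivePtrSketch() = default;
    explicit IntrusivePtrSketch(T *p) : ptr(p) { incref(); }
    IntrusivePtrSketch(const IntrusivePtrSketch &other) : ptr(other.ptr) { incref(); }
    IntrusivePtrSketch &operator=(const IntrusivePtrSketch &other) {
        if (this != &other) {
            decref();
            ptr = other.ptr;
            incref();
        }
        return *this;
    }
    ~IntrusivePtrSketch() { decref(); }

    T *get() const { return ptr; }
    T *operator->() const { return ptr; }
};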
A struct representing a target machine and os to generate code for.
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e. ...
A class that can represent Vars or RVars.