template<typename T>
using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;

template<typename T>
using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;

bool all(const vector<int> &v);

// Funcs whose storage is placed at this loop level.
std::set<const FunctionDAG::Node *> store_at;

// Memoized schedule features, keyed by the hash of the producers stored at root.
mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;

// hash_combine: fold 'next' into the running hash 'h'.
h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));

// funcs_realized_or_inlined: accumulate the count over child loop nests.
count += c->funcs_realized_or_inlined();

// set_working_set_at_task_feature: recurse into each child, then record the
// working set size in that child's stage features.
c->set_working_set_at_task_feature(working_set, features);
features->get(c->stage).working_set_at_task = working_set;

// is_root: the root loop nest is the one with no associated Func.
return node == nullptr;

// set_bounds: cache the computed bounds for Func f.
return bounds.emplace(f, b);

// Filter: diagnostic output emitted when a candidate state is rejected.
std::cerr << "\nState filtered: \n";
std::cerr << "Reason: ";
std::cerr << std::forward<T>(x);
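As a concrete illustration of how these per-stage maps are written to, here is a minimal sketch that mirrors the set_working_set_at_task_feature body above; the wrapper function name is hypothetical, and get() is assumed to return a mutable reference to the entry for the given stage (ScheduleFeatures and FunctionDAG are defined elsewhere in this autoscheduler).

// Hypothetical helper, for illustration only.
void record_working_set(const FunctionDAG::Node::Stage *stage,
                        int64_t working_set,
                        StageMap<ScheduleFeatures> *features) {
    // Look up the per-stage feature slot and record the working set size,
    // exactly as the snippet above does for each child loop nest.
    features->get(stage).working_set_at_task = working_set;
}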
Data structure containing information about the current GPU loop nest hierarchy of blocks,...
Data structures that help track memory access information.
Data structure containing information about GPU threads for a particular location in the loop nest an...
A class representing a reference count to be used with IntrusivePtr.
A reference to a site in a Halide statement at the top of the body of a particular for loop.
A Halide variable, to be used when defining functions.
MemInfoType< SharedMem > SharedMemInfo
int64_t get_active_block_hardware_limit(const Anderson2021Params &params)
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
bool all(const vector< int > &v)
IntrusivePtr< const BoundContents > Bound
bool are_valid_thread_extents(const vector< int64_t > &counts)
bool in_range_zero_one(double x)
bool accessed_at_constant_indices(const std::vector< int > &unrolled, const FunctionDAG::Edge *e)
constexpr int64_t get_register_mem_alloc_limit()
double get_idle_lane_wastage_limit()
int64_t get_shared_memory_sm_limit(const Anderson2021Params &params)
PerfectHashMap< FunctionDAG::Node, T > NodeMap
int64_t get_active_warp_hardware_limit(const Anderson2021Params &params)
bool may_subtile(const Anderson2021Params &params)
double get_idle_lane_wastage_limit_env_var()
MemInfo< typename MemTraits< T >::MemInfoType > MemInfoType
MemInfoType< LocalMem > LocalMemInfo
int get_unroll_limit(const Target &target)
int64_t get_shared_memory_limit(const Anderson2021Params &params)
std::string stringify(GPU_parallelism label)
MemInfoType< GlobalMem > GlobalMemInfo
RefCount & ref_count(const T *t) noexcept
Because in this header we don't yet know how client classes store their RefCount (and we don't want t...
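The ref_count overload above is the hook that lets IntrusivePtr find the count inside a client object. A minimal sketch of that pattern, assuming the usual pair of free functions IntrusivePtr relies on (the struct and member names here are illustrative, not part of this header):

struct MyNode {
    mutable RefCount ref_count_storage;  // the count lives inside the object itself
    // ... payload ...
};

// Free-function overloads that IntrusivePtr<MyNode> is assumed to call to
// locate the count and to destroy the object once it drops to zero.
RefCount &ref_count(const MyNode *n) noexcept {
    return n->ref_count_storage;
}
void destroy(const MyNode *n) {
    delete n;
}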
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
const LoopNest * loop_nest
Filter(const LoopNest *loop_nest)
Filter & operator<<(T &&x)
static bool enable_filter_printing()
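A hedged usage sketch of this helper, written as it would presumably appear inside a LoopNest member function when a candidate schedule is rejected; the condition and message are made up, and the constructor is assumed to emit the "State filtered:" / "Reason:" header shown above only when enable_filter_printing() returns true, with operator<< forwarding the message to std::cerr.

// Illustrative only: reject a state and say why.
if (total_alloc_size > get_shared_memory_limit(params)) {  // hypothetical check
    Filter(this) << "shared memory allocation exceeds the limit\n";
    return false;
}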
uint64_t hash_of_producers_stored_at_root
bool is_stored_in_registers() const
const LoopNest * innermost
bool is_stored_in_shared_mem() const
bool is_stored_in_local_mem() const
GPUMemoryType gpu_store_memory_type
std::vector< const LoopNest * > inlined_innermosts
bool is_stored_in_global_mem() const
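These accessors presumably reduce to comparisons of gpu_store_memory_type against the corresponding GPUMemoryType enumerator; a sketch under that assumption (the enumerator names are assumptions, not confirmed by this page):

bool is_stored_in_shared_mem() const {
    return gpu_store_memory_type == GPUMemoryType::Shared;    // assumed enumerator
}
bool is_stored_in_registers() const {
    return gpu_store_memory_type == GPUMemoryType::Registers; // assumed enumerator
}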
bool is_constant_allocation
vector< StageScheduleState * > ancestors
std::vector< FuncVar > vars
const FunctionDAG::Node * node
vector< int64_t > gpu_thread_extents
bool all_innermost_unrolled
vector< FuncVar > ordered_vars
const FunctionDAG::Node::Stage * stage
std::ostringstream schedule_source
NodeMap< std::vector< std::pair< const LoopNest *, std::vector< const FunctionDAG::Edge * > > > > producers_to_be_staged
vector< pair< int, int > > collect_producers(const StageMap< Sites > &sites) const
bool is_gpu_thread(const Target &target) const
vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Anderson2021Params ¶ms, const Target &target, const SearchSpaceOptions &search_space_options, int v, bool in_realization, bool in_threads_loop, bool is_pre_pass, vector< int64_t > union_counts=vector< int64_t >()) const
int vectorized_loop_index
const LoopNest * get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const
int num_serial_loops(const FunctionDAG::Node::Stage *stage) const
int64_t points_accessed_per_thread(const Anderson2021Params ¶ms, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector< const FunctionDAG::Edge * > &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const
bool has_constant_region_required(const FunctionDAG::Node *node) const
int num_serial_loops() const
std::map< uint64_t, StageMap< ScheduleFeatures > > features
int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const
const FunctionDAG::Node * node
void dump(T &stream, string prefix, const LoopNest *parent) const
int64_t product_of_self_and_descendants(int loop_index) const
GPU_parallelism gpu_label
bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const
void inline_func(const FunctionDAG::Node *f)
void generate_vec_dim_serial_tilings(vector< int > &serial_sizes) const
bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const
void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector< int64_t > &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose=false) const
int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const
int64_t product_of_descendants(int loop_index) const
bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const
bool is_gpu_block(const Target &target) const
bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const
bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const
bool accesses_input_buffer() const
std::vector< int64_t > size
Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector< const FunctionDAG::Edge * > &edge_chain) const
int64_t get_total_constant_local_mem_alloc_size() const
const Bound & get_bounds(const FunctionDAG::Node *f) const
std::pair< double, double > compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const
int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose=false) const
bool all_paths_to_leaves_have_thread_loop() const
GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const
void compute_warp_and_block_occupancy(const Anderson2021Params ¶ms, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const
vector< int64_t > get_union_thread_counts(const FunctionDAG::Node *f) const
void collect_nodes_that_should_be_inlined(const NodeMap< bool > &nodes_to_freeze, NodeMap< bool > &inlined_nodes) const
bool has_valid_thread_extents() const
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState > > &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site, const Target &target, std::vector< StageScheduleState * > &ancestors, const NodeMap< bool > &all_inlined) const
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates
int get_actual_vector_dim(const Bound &store_bounds) const
void compute_shared_mem_occupancy(const Anderson2021Params ¶ms, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const
Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose=false) const
int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const
bool computes(const FunctionDAG::Node *f) const
double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const
std::vector< IntrusivePtr< const LoopNest > > children
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
void copy_from(const LoopNest &n)
bool promote_allocs_to_registers(const Target &target, StageMap< Sites > &sites) const
std::pair< int64_t, int64_t > get_block_and_serial_extents(const LoopNest *block) const
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
bool has_constant_region_computed(const FunctionDAG::Node *node) const
double compute_local_mem_stride(double stride, double bytes) const
MemInfoType< T > compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, const Anderson2021Params ¶ms, const Target &target, int v, vector< IntrusivePtr< const LoopNest > > &result, const vector< int64_t > &max_size)
IntrusivePtr< const LoopNest > parallelize_in_tiles(const vector< int64_t > &tiling, const LoopNest *parent, const Anderson2021Params ¶ms, const Target &target, bool inner_tiling, bool adjust_tiling, bool move_all_rvars_inward=true, const vector< int > &rvars_to_move_inward={}) const
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
void collect_all_inlined(NodeMap< bool > &all_inlined) const
std::vector< int > unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const
std::string to_string() const
bool is_gpu_serial(const Target &target) const
double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const
static void hash_combine(uint64_t &h, uint64_t next)
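structural_hash and compute_hash_of_producers_stored_at_root presumably build their fingerprints by folding successive values through this combiner (h ^= next + 0x9e3779b9 + (h << 6) + (h >> 2), as shown earlier). A hedged illustration follows; which values get folded in, and in what order, is an assumption.

uint64_t h = 0;
hash_combine(h, depth);            // e.g. how deep this loop nest sits
hash_combine(h, size.size());      // then how many loop dimensions it has
hash_combine(h, children.size());  // then how many child loop nests hang off it
// Order matters: folding the same values in a different order produces a
// different hash, which is what makes the result usable as a structural key.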
const LoopNest * find_pure_stage_loop_nest(const FunctionDAG::Node *node) const
void get_stage_sizes(const FunctionDAG::Node *f, vector< vector< int64_t > > &stage_sizes, vector< vector< int > > &pure_dims, vector< int > &vectorized_indices) const
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const
int64_t max_inlined_calls() const
const FunctionDAG::Node::Stage * stage
bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const
void get_stages_computed_in_each_compute_root_loop(StageMap< StageMap< bool > > &descendants, const LoopNest *compute_root_loop_nest=nullptr) const
void compute_features(const FunctionDAG &dag, const Anderson2021Params ¶ms, const Target &target, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, GPULoopInfo gpu_loop_info, bool use_memoized_features, const StageMap< int64_t > &total_shared_mem_alloc_sizes, int64_t *working_set, int64_t *working_set_local_constant, int64_t *working_set_local_dynamic, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
void get_sites(const Target &target, StageMap< Sites > &sites, StageMap< int64_t > &shared_mem_alloc_sizes, const LoopNest *task=nullptr, const LoopNest *parent=nullptr, const LoopNest *current_thread_loop=nullptr) const
bool calls(const FunctionDAG::Node *f) const
bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const
void copy_from_including_features(const LoopNest &n)
bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const
void update_producers_to_be_staged(StageScheduleState &state, const NodeMap< bool > &all_inlined) const
bool has_thread_loop_descendant() const
bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const
void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType< T > &mem_info, double serial_loop_extents, bool verbose=false) const
void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap< Sites > &sites, NodeMap< bool > &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const
void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const
void structural_hash(uint64_t &h, int depth) const
size_t funcs_realized_or_inlined() const
void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType< T > &mem_info, bool verbose=false) const
std::pair< const LoopNest *, const LoopNest * > find_innermost_and_parent() const
std::set< const FunctionDAG::Node * > store_at
std::pair< int64_t, bool > compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const
bool compute_here(const FunctionDAG::Node *f, bool tileable, int v, bool in_threads_loop, const Anderson2021Params ¶ms, const Target &target)
int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const
NodeMap< int64_t > inlined
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const
int vectorized_access_size(size_t loop_index, bool verbose=false) const
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
A struct representing a target machine and OS to generate code for.
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled?
A class that can represent Vars or RVars.