12#include <unordered_set>
23using std::unordered_set;
49template<
typename PostCreateMutator>
51 const LoopNest *new_loop_nest_parent,
53 const PostCreateMutator &post_create_mutator) {
54 new_loop_nest->
copy_from(*existing_loop_nest);
56 for (std::size_t i = 0, N = new_loop_nest->
children.size(); i < N; ++i) {
58 new_loop_nest->
children[i] = new_child;
59 deep_copy_loop_nest(new_child, new_loop_nest, existing_loop_nest->children[i], post_create_mutator);
62 post_create_mutator(new_loop_nest);
65using LoopNestMap = map<const LoopNest *, pair<const LoopNest *, int>>;
67template<
typename PostCreateMutator>
69 const PostCreateMutator &post_create_mutator) {
105 template<
typename PostCreateMutator>
134 const Target &target)
const;
145 bool verbose =
false)
const;
150 std::ostream &out)
const;
168 const Target &target)
const;
171 const Target &target)
const;
178 bool verbose =
false);
192 const vector<VarOrRVar> &parallel_vars,
193 const vector<int64_t> &parallel_extents,
194 const vector<int> &constant_extents)
const;
198 const vector<VarOrRVar> &parallel_vars,
199 const vector<int64_t> &parallel_extents)
const;
203 std::unordered_set<std::string> &new_serial_vars,
204 std::ostringstream &staged_funcs_schedule_source)
const;
234 struct CompareStates {
236 return a->cost > b->cost;
240 std::vector<IntrusivePtr<State>> storage;
245 if (sz >= storage.size()) {
246 storage.resize(std::max(sz * 2, (
size_t)64));
248 internal_assert(sz < storage.size()) << sz <<
" " << storage.size() <<
"\n";
249 storage[sz] = std::move(s);
251 std::push_heap(storage.begin(), storage.begin() + sz, CompareStates{});
255 internal_assert(sz <= storage.size()) << sz <<
" " << storage.size() <<
"\n";
256 std::pop_heap(storage.begin(), storage.begin() + sz, CompareStates{});
258 return std::move(storage[sz]);
274 storage.swap(other.storage);
275 std::swap(sz, other.sz);
283 std::make_heap(storage.begin(), storage.begin() + sz, CompareStates{});
287 for (
size_t i = 0; i < sz; i++) {
#define internal_assert(c)
const IntrusivePtr< State > & top()
void emplace(IntrusivePtr< State > &&s)
IntrusivePtr< State > pop()
void swap(StateQueue &other) noexcept
IntrusivePtr< State > operator[](int idx) const
A class representing a reference count to be used with IntrusivePtr.
A single definition of a Func.
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
bool compute_root_and_inline_only()
int64_t get_stack_memory_limit()
bool use_adjusted_tilings()
constexpr int kLocalMemoryLimit
double get_stack_memory_adjustment_factor()
bool verify_memoized_features()
PerfectHashMap< FunctionDAG::Node, T > NodeMap
map< const LoopNest *, pair< const LoopNest *, int > > LoopNestMap
bool is_memoize_blocks_enabled()
void deep_copy_loop_nest(LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr< const LoopNest > &existing_loop_nest, const PostCreateMutator &post_create_mutator)
RefCount & ref_count(const T *t) noexcept
Because in this header we don't yet know how client classes store their RefCount (and we don't want t...
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
unsigned __INT64_TYPE__ uint64_t
signed __INT64_TYPE__ int64_t
std::vector< IntrusivePtr< const LoopNest > > children
void copy_from(const LoopNest &n)
void operator()(LoopNest *new_loop_nest) const
void operator()(LoopNest *new_loop_nest) const
void split_compute_root_loops(LoopNest *loop_nest) const
void add_outer_thread_loops(LoopNest *loop_nest) const
const Anderson2021Params & params
void save_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, std::ostream &out) const
bool exceeds_serial_extents_limit(const Target &target) const
int64_t get_shared_mem_alloc_size(const LoopNest *block, const LoopNest *loop) const
bool has_compute_root_loops_without_blocks() const
const LoopNest * deepest_valid_compute_location(const Anderson2021Params &params, const LoopNestMap &parent, const FunctionDAG::Node &node, const LoopNest *loop, const LoopNest *root, StageMap< int64_t > &total_shared_mem_alloc_sizes) const
void update_always_consider_inline_options(const FunctionDAG::Node *node)
int64_t total_loop_extents_of_ancestors(const LoopNestMap &parent, const LoopNest *loop) const
bool can_fuse_gpu(const vector< int64_t > &parallel_extents) const
bool should_always_consider_inline(const FunctionDAG::Node *node) const
bool contains_store_at_further_in_than_outermost() const
void operator=(const State &)=delete
void compute_loop_nest_parents(LoopNestMap &p, const LoopNest *here, int depth) const
State(const State &)=delete
bool has_dynamic_allocation_inside_thread() const
void fuse_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents, const vector< int > &constant_extents) const
void set_gpu_store_site(const LoopNestMap &parent, const LoopNest *loop, LoopNest::Sites &site) const
void print_compute_locations() const
bool mark_gpu_threads(LoopNest::StageScheduleState *state, Stage &stage, std::unordered_set< std::string > &new_serial_vars, std::ostringstream &staged_funcs_schedule_source) const
NodeMap< bool > always_consider_inline
IntrusivePtr< const State > parent
void operator=(State &&)=delete
uint64_t structural_hash(int depth) const
IntrusivePtr< const LoopNest > root
bool exceeds_shared_memory_limit(const Anderson2021Params &params, const Target &target) const
bool compute_featurization(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const
bool contains_store_at(const set< const FunctionDAG::Node * > &outermost_store_at, const IntrusivePtr< const LoopNest > &parent) const
void mark_gpu_blocks(LoopNest::StageScheduleState *state, Stage &stage, const vector< VarOrRVar > &parallel_vars, const vector< int64_t > &parallel_extents) const
IntrusivePtr< const LoopNest > get_root_for_features(const Anderson2021Params &params, const Target &target) const
bool calculate_cost(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, CostModel *cost_model, Statistics &stats, bool verbose=false)
LoopNest * create_feature_root(const PostCreateMutator &post_create_mutator) const
const LoopNest * deepest_common_ancestor(const LoopNestMap &parent, const LoopNest *a, const LoopNest *b) const
IntrusivePtr< State > make_child() const
bool exceeds_local_memory_limit(const Anderson2021Params &params, const Target &target) const
void add_to_always_consider_inline_options(const FunctionDAG::Node *node)
void apply_schedule(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target)
bool has_loop_nest_without_thread_loops() const
std::vector< double > cost_per_stage
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
A struct representing a target machine and os to generate code for.