docs/anderson2021_2_loop_nest_8h_source.html

/** This file defines the LoopNest, which is our

 * representation of a Halide schedule, and contains methods to

 * generate candidates for scheduling as well as extract a

 * featurization that can be used to cost each candidate. */


#ifndef LOOP_NEST_H

#define LOOP_NEST_H


#include "ASLog.h"

#include "CostModel.h"

#include "FunctionDAG.h"

#include "GPULoopInfo.h"

#include "GPUMemInfo.h"

#include "PerfectHashMap.h"

#include "SearchSpaceOptions.h"

#include "Statistics.h"

#include "ThreadInfo.h"

#include "Tiling.h"

#include <set>

#include <vector>


namespace Halide {

namespace Internal {

namespace Autoscheduler {


template<typename T>

using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;


template<typename T>

using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;


enum class GPU_parallelism {

    Block,

    Thread,

    Serial,

    Simd,

    Parallelized,

    None

};


std::string stringify(GPU_parallelism label);


// inlined => func is inlined so has no memory store location


enum class GPUMemoryType {

    Global,

    Shared,

    Local,

    Registers,

    Inlined

};


bool may_subtile(const Anderson2021Params &params);


int64_t get_shared_memory_limit(const Anderson2021Params &params);


int64_t get_shared_memory_sm_limit(const Anderson2021Params &params);


int64_t get_active_block_hardware_limit(const Anderson2021Params &params);


int64_t get_active_warp_hardware_limit(const Anderson2021Params &params);


constexpr int64_t get_register_mem_alloc_limit() {

    return 128;

}


int get_unroll_limit(const Target &target);


bool in_range_zero_one(double x);


bool are_valid_thread_extents(const vector<int64_t> &counts);


double get_idle_lane_wastage_limit_env_var();

double get_idle_lane_wastage_limit();


bool all(const vector<int> &v);

bool accessed_at_constant_indices(const std::vector<int> &unrolled, const FunctionDAG::Edge *e);


// We're going to do a tree search over possible schedules to find an

// optimal one. A tree search requires a state, and a function that

// gives you children of the state (with costs). The following struct

// represents the state, which is a partial schedule.

//

// A partial schedule is a tree. Each node is some portion of the for

// loop nest of some Func. If there are no children, it's the

// innermost set of loops. If there are children, it's a loop over

// tiles of that Func.


struct LoopNest {

    mutable RefCount ref_count;


    // The extents of this loop. Put another way, the number of tiles,

    // not the size of each tile.

    vector<int64_t> size;


    // The nodes inside the loop body

    vector<IntrusivePtr<const LoopNest>> children;


    // Funcs inlined into this inner loop, and the number of times

    // each is called. Only valid if children is empty.

    NodeMap<int64_t> inlined;


    // Funcs stored inside this loop

    std::set<const FunctionDAG::Node *> store_at;


    // The total bounds required of any given Func over all iterations

    // of this loop. In the paper, this is represented using the

    // little boxes to the left of the loop nest tree figures.

    mutable NodeMap<Bound> bounds;


    // The Func this loop nest belongs to

    const FunctionDAG::Node *node = nullptr;


    // The stage of the Func

    const FunctionDAG::Node::Stage *stage = nullptr;


    // Is this the innermost loop of this func (the SIMD loop)?

    bool innermost = false;


    // Are we permitted to tile this loop?

    bool tileable = false;


    // Is this the parallel outer loop?

    bool parallel = false;


    // What dimension is this Func vectorized over, in terms of the pure args of the Func?

    int vector_dim = -1;


    // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.

    int vectorized_loop_index = -1;


    // Apply gpu threads to this loop nest

    mutable GPU_parallelism gpu_label = GPU_parallelism::None;


    struct FeatureIntermediates {

        double inlined_calls;

        double num_vectors;

        double num_scalars;

        double vector_size;

        double innermost_pure_loop_extent;

        double outer_parallelism;

        double num_warps_per_block;

        double num_threads_per_block;

        double points_computed_per_thread;

    };


    mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates;

    mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features;


    bool is_gpu_serial(const Target &target) const {

        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Serial;

    }


    bool is_gpu_thread(const Target &target) const {

        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Thread;

    }


    bool is_gpu_block(const Target &target) const {

        return target.has_gpu_feature() && gpu_label == GPU_parallelism::Block;

    }


    bool is_scalar() const {

        return size.empty();

    }


    // given a newly inserted node f into this LoopNest, get union of thread counts in each dimension

    // across all siblings of f.

    vector<int64_t> get_union_thread_counts(const FunctionDAG::Node *f) const;


    // given a newly inserted node f into this LoopNest, gets the size of

    // all of f's stages and their pure_dim indices

    void get_stage_sizes(const FunctionDAG::Node *f,

                         vector<vector<int64_t>> &stage_sizes,

                         vector<vector<int>> &pure_dims,

                         vector<int> &vectorized_indices) const;


    // given the loop nest of a stage to parallelize at root, figure out if using odd tile sizes

    // for the vectorized dimension will allow the resulting thread tiles to be multiples of 32

    // if so, we will include these in the serial loop sizes

    void generate_vec_dim_serial_tilings(vector<int> &serial_sizes) const;


    // get the loop nests of a newly inserted node, f, that is marked GPU threads. Tiles

    // the newly inserted loop nests of f into a threads loop outside a serial loop.

    // V is the vectorized dimension of f. Adds loopnests created from each tiling option in result.

    bool add_gpu_thread_tilings(const FunctionDAG::Node *f,

                                const Anderson2021Params &params,

                                const Target &target,

                                int v,

                                vector<IntrusivePtr<const LoopNest>> &result,

                                const vector<int64_t> &max_size);


    void copy_from(const LoopNest &n);

    void copy_from_including_features(const LoopNest &n);


    static void hash_combine(uint64_t &h, uint64_t next) {

        // From boost

        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));

    }


    // Hash the loop structure and sizes up to a fixed depth. This is

    // used as the hash function for the coarse-to-fine beam search in

    // the paper.

    void structural_hash(uint64_t &h, int depth) const;


    // How many funcs are scheduled inside this loop level. Used in

    // the structural hash.


    size_t funcs_realized_or_inlined() const {

        size_t count = inlined.size() + store_at.size();

        for (const auto &c : children) {

            count += c->funcs_realized_or_inlined();

        }

        return count;

    }


    // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.


    struct Sites {

        const LoopNest *compute = nullptr;                 // Its containing compute_at site

        const LoopNest *store = nullptr;                   // Its containing store_at site

        const LoopNest *produce = nullptr;                 // Its own outermost node

        const LoopNest *innermost = nullptr;               // Its innermost node - usually a SIMD loop

        const LoopNest *task = nullptr;                    // The parallel for loop it belongs to

        const LoopNest *thread = nullptr;                  // Its containing gpu_thread loop

        GPUMemoryType gpu_store_memory_type;               // global, local, shared?

        int64_t allocation_size = 0;                       // Allocation size in bytes

        bool is_constant_allocation = false;               // Does the allocation have constant size?

        int64_t num_realizations = 0;                      // Number of times this stage is realized. Only valid for unscheduled producers

        bool inlined = false;                              // Is the Func inlined?

        std::vector<const LoopNest *> inlined_innermosts;  // Is the Func inlined?

        uint64_t hash_of_producers_stored_at_root;


        bool is_stored_in_global_mem() const {

            return gpu_store_memory_type == GPUMemoryType::Global;

        }


        bool is_stored_in_shared_mem() const {

            return gpu_store_memory_type == GPUMemoryType::Shared;

        }


        bool is_stored_in_local_mem() const {

            return gpu_store_memory_type == GPUMemoryType::Local;

        }


        bool is_stored_in_registers() const {

            return gpu_store_memory_type == GPUMemoryType::Registers;

        }


    };


    GPUMemoryType get_gpu_memory_type(bool in_block,

                                      bool in_thread,

                                      bool is_inlined = false) const;


    std::vector<int> unrolled_loops(const Target &target,

                                    const LoopNest *parent,

                                    const LoopNest *grandparent) const;


    void get_allocs_that_can_be_promoted_to_registers(const Target &target,

                                                      StageMap<Sites> &sites,

                                                      NodeMap<bool> &can_be_promoted_to_registers,

                                                      const LoopNest *grandparent,

                                                      const LoopNest *parent) const;


    bool promote_allocs_to_registers(const Target &target,

                                     StageMap<Sites> &sites) const;


    // Compute all the sites of interest for each pipeline stage

    void get_sites(const Target &target,

                   StageMap<Sites> &sites,

                   StageMap<int64_t> &shared_mem_alloc_sizes,

                   const LoopNest *task = nullptr,

                   const LoopNest *parent = nullptr,

                   const LoopNest *current_thread_loop = nullptr) const;


    // A helper for the working_set_at_task feature. Most features are

    // computed in the recursive pass 'compute_features' below, but

    // this one must be done in a second separate recursive pass.


    void set_working_set_at_task_feature(int64_t working_set,

                                         StageMap<ScheduleFeatures> *features) const {

        for (const auto &c : children) {

            c->set_working_set_at_task_feature(working_set, features);

            features->get(c->stage).working_set_at_task = working_set;

        }

    }


    bool exceeds_serial_extents_limit(const Target &target,

                                      const LoopNest *parent,

                                      bool in_threads_loop) const;


    bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const;


    bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const;


    const LoopNest *find_pure_stage_loop_nest(const FunctionDAG::Node *node) const;


    int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const;


    int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const;


    // Get the stride over "node's" storage for a unit increment in the vectorized loop's

    // index

    double storage_stride(const LoadJacobian &jac,

                          int innermost_storage_dim,

                          const FunctionDAG::Node *storage_node,

                          const Bound &store_bounds,

                          const LoopNest &root) const;


    Strides compute_strides(const LoadJacobian &jac,

                            int innermost_storage_dim,

                            const FunctionDAG::Node *storage_node,

                            const Bound &store_bounds,

                            const ThreadInfo *thread_info,

                            bool verbose = false) const;


    bool all_strides_exist(const LoadJacobian &jac,

                           const FunctionDAG::Node *storage_node,

                           const LoopNest &root) const;


    int get_actual_vector_dim(const Bound &store_bounds) const;


    void compute_gpu_store_features(const LoadJacobian &jac,

                                    int consumer_innermost_dim,

                                    const FunctionDAG::Node *node,

                                    const Bound &consumer_store_bounds,

                                    const GPULoopInfo &gpu_loop_info,

                                    const std::vector<int64_t> &inner_serial_loop_extents,

                                    const Sites &consumer_site,

                                    ScheduleFeatures &feat,

                                    const LoopNest *parent,

                                    const LoopNest &root,

                                    GlobalMemInfo &global_mem_loads,

                                    SharedMemInfo &shared_mem_loads,

                                    LocalMemInfo &local_mem_loads,

                                    bool verbose = false) const;


    bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac,

                                                const FunctionDAG::Node *accessed,

                                                int innermost_dim,

                                                int loop_index) const;


    bool can_vectorize_store_access(const LoadJacobian &jac,

                                    const FunctionDAG::Node *accessed,

                                    bool accessed_has_been_scheduled,

                                    int innermost_dim,

                                    int loop_index,

                                    const GPUMemoryType &mem_type) const;


    int vectorized_load_access_size(const LoadJacobian &jac,

                                    const FunctionDAG::Node *accessed,

                                    bool accessed_has_been_scheduled,

                                    int innermost_dim,

                                    const GPUMemoryType &mem_type,

                                    bool verbose = false) const;


    int vectorized_access_size(size_t loop_index,

                               bool verbose = false) const;


    template<typename T>

    void compute_num_mem_accesses_per_block(const LoadJacobian &jac,

                                            const FunctionDAG::Node *node,

                                            const Bound &store_bounds,

                                            const ThreadInfo *thread_info,

                                            int innermost_dim,

                                            double num_requests_per_warp,

                                            MemInfoType<T> &mem_info,

                                            bool verbose = false) const;


    std::pair<double, double> compute_local_mem_store_features(const LoadJacobian &jac,

                                                               int consumer_innermost_dim,

                                                               const FunctionDAG::Node *node,

                                                               const Bound &consumer_store_bounds,

                                                               const LoopNest &root,

                                                               double serial_loop_extents) const;


    template<typename T>

    MemInfoType<T> compute_mem_store_info(const LoadJacobian &jac,

                                          int consumer_innermost_dim,

                                          const FunctionDAG::Node *node,

                                          const Bound &consumer_store_bounds,

                                          const ThreadInfo *thread_info,

                                          double serial_loop_extents,

                                          bool verbose) const;


    template<typename T>

    void compute_mem_load_features(const LoadJacobian &jac,

                                   int producer_innermost_dim,

                                   const FunctionDAG::Node *node,

                                   const Bound &producer_store_bounds,

                                   bool producer_has_been_scheduled,

                                   const ThreadInfo *thread_info,

                                   MemInfoType<T> &mem_info,

                                   double serial_loop_extents,

                                   bool verbose = false) const;


    double compute_local_mem_stride(double stride,

                                    double bytes) const;


    // Assumes block, serial, thread or block, thread nesting

    const LoopNest *get_enclosing_block(const LoopNest *parent,

                                        const LoopNest *grandparent) const;


    std::pair<int64_t, int64_t> get_block_and_serial_extents(const LoopNest *block) const;


    bool all_paths_to_leaves_have_thread_loop() const;


    bool has_thread_loop_descendant() const;


    void compute_warp_features(ScheduleFeatures &features,

                               const GPULoopInfo &gpu_loop_info) const;


    // Assume that when a block is active, all its warps are active

    void compute_warp_and_block_occupancy(const Anderson2021Params &params,

                                          ScheduleFeatures &feat,

                                          const GPULoopInfo &gpu_loop_info) const;


    void compute_shared_mem_occupancy(const Anderson2021Params &params,

                                      const Target &target,

                                      int64_t total_shared_mem_alloc_size,

                                      ScheduleFeatures &feat) const;


    std::pair<const LoopNest *, const LoopNest *> find_innermost_and_parent() const;


    int64_t points_accessed_per_thread(const Anderson2021Params &params,

                                       const Target &target,

                                       const GPULoopInfo &gpu_loop_info,

                                       const std::vector<const FunctionDAG::Edge *> &edge_chain,

                                       const LoadJacobian &jac,

                                       const LoopNest *parent,

                                       const LoopNest *grandparent,

                                       int64_t n,

                                       const ScheduleFeatures &feat,

                                       const LoadJacobian &serial_jac,

                                       bool producer_has_been_scheduled,

                                       int producer_innermost_dim,

                                       const GPUMemoryType &mem_type,

                                       bool verbose) const;


    int64_t compute_licm_amortization(const LoopNest *innermost,

                                      const LoopNest *parent,

                                      const ScheduleFeatures &feat,

                                      const LoadJacobian &jac,

                                      int producer_dims) const;


    void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features,

                                         const StageMap<ScheduleFeatures> *features) const;


    vector<pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;


    uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;


    void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;


    void memoize_features(StageMap<ScheduleFeatures> &memoized_features,

                          const StageMap<ScheduleFeatures> *features) const;


    void compute_working_set_from_features(int64_t *working_set,

                                           const StageMap<ScheduleFeatures> *features) const;


    void recompute_inlined_features(const StageMap<Sites> &sites,

                                    StageMap<ScheduleFeatures> *features) const;


    std::pair<int64_t, bool> compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const;


    // Do a recursive walk over the loop nest computing features to feed the cost model.

    void compute_features(const FunctionDAG &dag,

                          const Anderson2021Params &params,

                          const Target &target,

                          const StageMap<Sites> &sites,

                          int64_t instances,

                          int64_t parallelism,

                          const LoopNest *parent,

                          const LoopNest *grandparent,

                          const LoopNest &root,

                          GPULoopInfo gpu_loop_info,

                          bool use_memoized_features,

                          const StageMap<int64_t> &total_shared_mem_alloc_sizes,

                          int64_t *working_set,

                          int64_t *working_set_local_constant,

                          int64_t *working_set_local_dynamic,

                          StageMap<ScheduleFeatures> *features,

                          Statistics &stats,

                          bool verbose = false) const;


    bool is_root() const {

        // The root is the sole node without a Func associated with

        // it.

        return node == nullptr;

    }


    // Set the region required of a Func at this site.


    const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {

        return bounds.emplace(f, b);

    }


    // Get the region required of a Func at this site, from which we

    // know what region would be computed if it were scheduled here,

    // and what its loop nest would be.

    const Bound &get_bounds(const FunctionDAG::Node *f) const;


    // Get the region required of a Func at this site (but only to satisfy the

    // consumers along the given edge chain), from which we know what region

    // would be computed if it were scheduled here and what its loop nest

    // would be.

    Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f,

                                      const vector<const FunctionDAG::Edge *> &edge_chain) const;


    void dump() const;


    std::string to_string() const;


    // Recursively print a loop nest representation to stderr

    template<typename T>

    void dump(T &stream, string prefix, const LoopNest *parent) const;


    // Does this loop nest access the given Func

    bool calls(const FunctionDAG::Node *f) const;


    // What is the maximum number of inlined calls to a Func that

    // occur within this loop. Used to prune states that would

    // generate too much code.

    int64_t max_inlined_calls() const;


    // Does this loop nest access an input buffer? Used to select

    // trail strategies when splitting loops. We don't want to read

    // out of bounds on inputs, even if we don't intend to use the

    // values read. It could create annoying assertion failures for

    // the user. It's OK to read out of range of the values computed

    // on internal Funcs though. Allocation bounds inference just pads

    // out the bounds so that it won't fault.

    bool accesses_input_buffer() const;


    // Does this loop nest contain a computation of the given Func.

    bool computes(const FunctionDAG::Node *f) const;


    // Above here most methods query the loop nest. Below we have

    // methods that mutate the loop nest.


    // Inline a Func into all consumers within this loop.

    void inline_func(const FunctionDAG::Node *f);


    // Compute a Func at this site.

    bool compute_here(const FunctionDAG::Node *f,

                      bool tileable,

                      int v,

                      bool in_threads_loop,

                      const Anderson2021Params &params,

                      const Target &target);


    // Parallelize this loop according to the given tiling.

    IntrusivePtr<const LoopNest> parallelize_in_tiles(const vector<int64_t> &tiling,

                                                      const LoopNest *parent,

                                                      const Anderson2021Params &params,

                                                      const Target &target,

                                                      bool inner_tiling,

                                                      bool adjust_tiling,

                                                      bool move_all_rvars_inward = true,

                                                      const vector<int> &rvars_to_move_inward = {}) const;


    int64_t get_total_local_mem_alloc_size(bool constant_allocs_only = false,

                                           bool in_threads_loop = false) const;

    int64_t get_total_constant_local_mem_alloc_size() const;


    // All store ats further in than the block level must be fixed

    // sized allocations. This method checks if f will require a dynamic

    // allocation

    bool requires_dynamic_allocation(const FunctionDAG::Node *f,

                                     const Target &target,

                                     bool in_threads_loop) const;


    // Return all possible ways to compute f in tiles somewhere within

    // this loop nest.

    // in_threads_loop tracks whether or not function is going to be placed inside a

    // loop marked gpu_threads, in which case f's loops cannot be gpu_threads

    vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,

                                                          const LoopNest *parent,

                                                          const Anderson2021Params &params,

                                                          const Target &target,

                                                          const SearchSpaceOptions &search_space_options,

                                                          int v,

                                                          bool in_realization,

                                                          bool in_threads_loop,

                                                          bool is_pre_pass,

                                                          vector<int64_t> union_counts = vector<int64_t>()) const;


    // Below here we have methods that apply a schedule to a Halide pipeline.


    // A model of the state of the loop nest of a Func while applying

    // Halide's scheduling directives.


    // Note that StageScheduleState is movable-but-not-copyable thanks to its ostringstream member.


    struct StageScheduleState {

        // How much parallelism do we need to exploit with this Func?

        double num_cores = 0;


        // Which storage dimension is vectorized? We need to reorder it innermost

        int vector_dim = -1;

        int vectorized_loop_index = -1;


        // The various Vars and RVars used for scheduling a Func.


        struct FuncVar {

            // The top-level var or rvar this was split off from

            VarOrRVar orig;


            // This var.

            VarOrRVar var;


            // Source code to access this Var/RVar. Used for printing

            // valid Halide source for this schedule.

            string accessor;


            // Our estimate of the extent of this var. This is exact

            // when constant_extent flag is true.

            int64_t extent = 0;


            // Which index in the symbolic loop nest does this var

            // belong to.

            size_t index = 0;


            // Some flags.

            bool innermost_pure_dim = false;

            bool outermost = false;

            bool parallel = false;

            bool exists = false;

            bool pure = false;

            bool constant_extent = false;


            bool vectorized = false;

            bool gpu_threads = false;


            FuncVar()

                : orig(Var()),

                  var(Var()) {

            }


        };


        const FunctionDAG::Node *node;

        const FunctionDAG::Node::Stage *stage;

        bool parallel = false;

        bool vectorized = false;

        bool all_innermost_unrolled = false;

        FuncVar vectorized_var;


        // In order from innermost to outermost. Each group of d is one tiling level.

        vector<FuncVar> vars;


        // In order from innermost to outermost. Each group of d is one tiling level.

        vector<FuncVar> ordered_vars;

        vector<int64_t> gpu_thread_extents;


        NodeMap<std::vector<std::pair<const LoopNest *, std::vector<const FunctionDAG::Edge *>>>>

            producers_to_be_staged;


        // From outermost in

        vector<StageScheduleState *> ancestors;


        std::ostringstream schedule_source;

    };


    bool has_constant_region_computed(const FunctionDAG::Node *node) const;

    bool has_constant_region_required(const FunctionDAG::Node *node) const;

    bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const;

    int num_serial_loops(const FunctionDAG::Node::Stage *stage) const;

    int num_serial_loops() const;

    bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const;


    void update_producers_to_be_staged(StageScheduleState &state,

                                       const NodeMap<bool> &all_inlined) const;

    bool region_computed_shrinks(const FunctionDAG::Node *f,

                                 const LoopNest *parent) const;


    // Apply the schedule represented by this loop nest to a Halide pipeline.

    void apply(LoopLevel here,

               StageMap<std::unique_ptr<StageScheduleState>> &state_map,

               double num_cores,

               int depth,

               const LoopNest *parent,

               const LoopNest *compute_site,

               const Target &target,

               std::vector<StageScheduleState *> &ancestors,

               const NodeMap<bool> &all_inlined) const;


    double max_idle_lane_wastage(const Target &target,

                                 GPULoopInfo gpu_loop_info) const;


    bool has_valid_thread_extents() const;


    void collect_nodes_that_should_be_inlined(const NodeMap<bool> &nodes_to_freeze,

                                              NodeMap<bool> &inlined_nodes) const;


    void collect_all_inlined(NodeMap<bool> &all_inlined) const;


    int64_t product_of_self_and_descendants(int loop_index) const;

    int64_t product_of_descendants(int loop_index) const;


    void get_stages_computed_in_each_compute_root_loop(StageMap<StageMap<bool>> &descendants,

                                                       const LoopNest *compute_root_loop_nest = nullptr) const;

};


struct Filter {

    const LoopNest *loop_nest;

    bool logging = false;


    explicit Filter(const LoopNest *loop_nest)

        : loop_nest{loop_nest},

          logging{enable_filter_printing()} {

        if (logging) {

            std::cerr << "\nState filtered: \n";

            loop_nest->dump();

            std::cerr << "Reason: ";

        }

    }


    template<typename T>


    Filter &operator<<(T &&x) {

        if (logging) {

            std::cerr << std::forward<T>(x);

        }

        return *this;

    }


    static bool enable_filter_printing();

};


}  // namespace Autoscheduler

}  // namespace Internal

}  // namespace Halide


#endif  // LOOP_NEST_H

ASLog.h

GPULoopInfo.h
Data structure containing information about the current GPU loop nest hierarchy of blocks,...

GPUMemInfo.h
Data structures that help track memory access information.

PerfectHashMap.h

SearchSpaceOptions.h

Statistics.h

ThreadInfo.h
Data structure containing information about GPU threads for a particular location in the loop nest an...

Tiling.h

CostModel.h

FunctionDAG.h

Halide::Internal::Autoscheduler::LoadJacobian
Definition FunctionDAG.h:134

Halide::Internal::RefCount
A class representing a reference count to be used with IntrusivePtr.
Definition IntrusivePtr.h:19

Halide::LoopLevel
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition Schedule.h:203

Halide::Var
A Halide variable, to be used when defining functions.
Definition Var.h:19

PerfectHashMap
Definition PerfectHashMap.h:38

Halide::Internal::Autoscheduler::get_active_block_hardware_limit
int64_t get_active_block_hardware_limit(const Anderson2021Params &params)

Halide::Internal::Autoscheduler::StageMap
PerfectHashMap< FunctionDAG::Node::Stage, T > StageMap
Definition LoopNest.h:24

Halide::Internal::Autoscheduler::all
bool all(const vector< int > &v)

Halide::Internal::Autoscheduler::are_valid_thread_extents
bool are_valid_thread_extents(const vector< int64_t > &counts)

Halide::Internal::Autoscheduler::in_range_zero_one
bool in_range_zero_one(double x)

Halide::Internal::Autoscheduler::accessed_at_constant_indices
bool accessed_at_constant_indices(const std::vector< int > &unrolled, const FunctionDAG::Edge *e)

Halide::Internal::Autoscheduler::get_register_mem_alloc_limit
constexpr int64_t get_register_mem_alloc_limit()
Definition LoopNest.h:62

Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit
double get_idle_lane_wastage_limit()

Halide::Internal::Autoscheduler::get_shared_memory_sm_limit
int64_t get_shared_memory_sm_limit(const Anderson2021Params &params)

Halide::Internal::Autoscheduler::NodeMap
PerfectHashMap< FunctionDAG::Node, T > NodeMap
Definition LoopNest.h:21

Halide::Internal::Autoscheduler::get_active_warp_hardware_limit
int64_t get_active_warp_hardware_limit(const Anderson2021Params &params)

Halide::Internal::Autoscheduler::may_subtile
bool may_subtile(const Anderson2021Params &params)

Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit_env_var
double get_idle_lane_wastage_limit_env_var()

Halide::Internal::Autoscheduler::get_unroll_limit
int get_unroll_limit(const Target &target)

Halide::Internal::Autoscheduler::GPUMemoryType
GPUMemoryType
Definition LoopNest.h:44

Halide::Internal::Autoscheduler::GPUMemoryType::Global
@ Global

Halide::Internal::Autoscheduler::GPUMemoryType::Local
@ Local

Halide::Internal::Autoscheduler::GPUMemoryType::Shared
@ Shared

Halide::Internal::Autoscheduler::GPUMemoryType::Registers
@ Registers

Halide::Internal::Autoscheduler::GPUMemoryType::Inlined
@ Inlined

Halide::Internal::Autoscheduler::get_shared_memory_limit
int64_t get_shared_memory_limit(const Anderson2021Params &params)

Halide::Internal::Autoscheduler::stringify
std::string stringify(GPU_parallelism label)

Halide::Internal::Autoscheduler::GPU_parallelism
GPU_parallelism
Definition LoopNest.h:32

Halide::Internal::Autoscheduler::GPU_parallelism::Parallelized
@ Parallelized

Halide::Internal::Autoscheduler::GPU_parallelism::Serial
@ Serial

Halide::Internal::Autoscheduler::GPU_parallelism::None
@ None

Halide::Internal::Autoscheduler::GPU_parallelism::Thread
@ Thread

Halide::Internal::Autoscheduler::GPU_parallelism::Simd
@ Simd

Halide::Internal::Autoscheduler::GPU_parallelism::Block
@ Block

Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition AbstractGenerator.h:19

Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.

uint64_t
unsigned __INT64_TYPE__ uint64_t
Definition runtime_internal.h:23

int64_t
signed __INT64_TYPE__ int64_t
Definition runtime_internal.h:22

Halide::Internal::Autoscheduler::Anderson2021Params
Definition CostModel.h:18

Halide::Internal::Autoscheduler::BoundContents
Definition FunctionDAG.h:305

Halide::Internal::Autoscheduler::Filter
Definition LoopNest.h:691

Halide::Internal::Autoscheduler::Filter::loop_nest
const LoopNest * loop_nest
Definition LoopNest.h:692

Halide::Internal::Autoscheduler::Filter::Filter
Filter(const LoopNest *loop_nest)
Definition LoopNest.h:695

Halide::Internal::Autoscheduler::Filter::operator<<
Filter & operator<<(T &&x)
Definition LoopNest.h:706

Halide::Internal::Autoscheduler::Filter::enable_filter_printing
static bool enable_filter_printing()

Halide::Internal::Autoscheduler::Filter::logging
bool logging
Definition LoopNest.h:693

Halide::Internal::Autoscheduler::FunctionDAG::Edge
Definition FunctionDAG.h:571

Halide::Internal::Autoscheduler::FunctionDAG::Node::Stage
Definition FunctionDAG.h:476

Halide::Internal::Autoscheduler::FunctionDAG::Node
Definition FunctionDAG.h:407

Halide::Internal::Autoscheduler::FunctionDAG
Definition FunctionDAG.h:396

Halide::Internal::Autoscheduler::GPULoopInfo
Definition GPULoopInfo.h:22

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates
Definition LoopNest.h:133

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::points_computed_per_thread
double points_computed_per_thread
Definition LoopNest.h:142

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::outer_parallelism
double outer_parallelism
Definition LoopNest.h:139

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_warps_per_block
double num_warps_per_block
Definition LoopNest.h:140

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_vectors
double num_vectors
Definition LoopNest.h:135

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::innermost_pure_loop_extent
double innermost_pure_loop_extent
Definition LoopNest.h:138

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_scalars
double num_scalars
Definition LoopNest.h:136

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::num_threads_per_block
double num_threads_per_block
Definition LoopNest.h:141

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::vector_size
double vector_size
Definition LoopNest.h:137

Halide::Internal::Autoscheduler::LoopNest::FeatureIntermediates::inlined_calls
double inlined_calls
Definition LoopNest.h:134

Halide::Internal::Autoscheduler::LoopNest::Sites
Definition LoopNest.h:214

Halide::Internal::Autoscheduler::LoopNest::Sites::hash_of_producers_stored_at_root
uint64_t hash_of_producers_stored_at_root
Definition LoopNest.h:109

Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_registers
bool is_stored_in_registers() const
Definition LoopNest.h:238

Halide::Internal::Autoscheduler::LoopNest::Sites::task
const LoopNest * task
Definition LoopNest.h:105

Halide::Internal::Autoscheduler::LoopNest::Sites::innermost
const LoopNest * innermost
Definition LoopNest.h:104

Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_shared_mem
bool is_stored_in_shared_mem() const
Definition LoopNest.h:232

Halide::Internal::Autoscheduler::LoopNest::Sites::thread
const LoopNest * thread
Definition LoopNest.h:220

Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_local_mem
bool is_stored_in_local_mem() const
Definition LoopNest.h:235

Halide::Internal::Autoscheduler::LoopNest::Sites::allocation_size
int64_t allocation_size
Definition LoopNest.h:222

Halide::Internal::Autoscheduler::LoopNest::Sites::store
const LoopNest * store
Definition LoopNest.h:102

Halide::Internal::Autoscheduler::LoopNest::Sites::gpu_store_memory_type
GPUMemoryType gpu_store_memory_type
Definition LoopNest.h:221

Halide::Internal::Autoscheduler::LoopNest::Sites::inlined_innermosts
std::vector< const LoopNest * > inlined_innermosts
Definition LoopNest.h:226

Halide::Internal::Autoscheduler::LoopNest::Sites::produce
const LoopNest * produce
Definition LoopNest.h:103

Halide::Internal::Autoscheduler::LoopNest::Sites::num_realizations
int64_t num_realizations
Definition LoopNest.h:224

Halide::Internal::Autoscheduler::LoopNest::Sites::inlined
bool inlined
Definition LoopNest.h:106

Halide::Internal::Autoscheduler::LoopNest::Sites::compute
const LoopNest * compute
Definition LoopNest.h:101

Halide::Internal::Autoscheduler::LoopNest::Sites::is_stored_in_global_mem
bool is_stored_in_global_mem() const
Definition LoopNest.h:229

Halide::Internal::Autoscheduler::LoopNest::Sites::is_constant_allocation
bool is_constant_allocation
Definition LoopNest.h:223

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar
Definition LoopNest.h:593

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::FuncVar
FuncVar()
Definition LoopNest.h:623

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::innermost_pure_dim
bool innermost_pure_dim
Definition LoopNest.h:238

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::index
size_t index
Definition LoopNest.h:235

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::pure
bool pure
Definition LoopNest.h:242

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::orig
VarOrRVar orig
Definition LoopNest.h:220

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::constant_extent
bool constant_extent
Definition LoopNest.h:243

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::parallel
bool parallel
Definition LoopNest.h:240

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::exists
bool exists
Definition LoopNest.h:241

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::outermost
bool outermost
Definition LoopNest.h:239

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::gpu_threads
bool gpu_threads
Definition LoopNest.h:621

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::var
VarOrRVar var
Definition LoopNest.h:223

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::extent
int64_t extent
Definition LoopNest.h:231

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::vectorized
bool vectorized
Definition LoopNest.h:620

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::FuncVar::accessor
string accessor
Definition LoopNest.h:227

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState
Definition LoopNest.h:584

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::num_cores
double num_cores
Definition LoopNest.h:211

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::ancestors
vector< StageScheduleState * > ancestors
Definition LoopNest.h:646

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::node
const FunctionDAG::Node * node
Definition LoopNest.h:628

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::parallel
bool parallel
Definition LoopNest.h:630

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized
bool vectorized
Definition LoopNest.h:631

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vars
vector< FuncVar > vars
Definition LoopNest.h:636

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vector_dim
int vector_dim
Definition LoopNest.h:214

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized_loop_index
int vectorized_loop_index
Definition LoopNest.h:215

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::gpu_thread_extents
vector< int64_t > gpu_thread_extents
Definition LoopNest.h:640

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::all_innermost_unrolled
bool all_innermost_unrolled
Definition LoopNest.h:632

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::ordered_vars
vector< FuncVar > ordered_vars
Definition LoopNest.h:639

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::stage
const FunctionDAG::Node::Stage * stage
Definition LoopNest.h:629

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::schedule_source
std::ostringstream schedule_source
Definition LoopNest.h:252

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::producers_to_be_staged
NodeMap< std::vector< std::pair< const LoopNest *, std::vector< const FunctionDAG::Edge * > > > > producers_to_be_staged
Definition LoopNest.h:643

Halide::Internal::Autoscheduler::LoopNest::StageScheduleState::vectorized_var
FuncVar vectorized_var
Definition LoopNest.h:633

Halide::Internal::Autoscheduler::LoopNest
Definition LoopNest.h:87

Halide::Internal::Autoscheduler::LoopNest::collect_producers
vector< pair< int, int > > collect_producers(const StageMap< Sites > &sites) const

Halide::Internal::Autoscheduler::LoopNest::is_gpu_thread
bool is_gpu_thread(const Target &target) const
Definition LoopNest.h:152

Halide::Internal::Autoscheduler::LoopNest::compute_in_tiles
vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const Anderson2021Params &params, const Target &target, const SearchSpaceOptions &search_space_options, int v, bool in_realization, bool in_threads_loop, bool is_pre_pass, vector< int64_t > union_counts=vector< int64_t >()) const

Halide::Internal::Autoscheduler::LoopNest::vectorized_loop_index
int vectorized_loop_index
Definition LoopNest.h:75

Halide::Internal::Autoscheduler::LoopNest::is_root
bool is_root() const
Definition LoopNest.h:477

Halide::Internal::Autoscheduler::LoopNest::get_enclosing_block
const LoopNest * get_enclosing_block(const LoopNest *parent, const LoopNest *grandparent) const

Halide::Internal::Autoscheduler::LoopNest::is_scalar
bool is_scalar() const
Definition LoopNest.h:160

Halide::Internal::Autoscheduler::LoopNest::num_serial_loops
int num_serial_loops(const FunctionDAG::Node::Stage *stage) const

Halide::Internal::Autoscheduler::LoopNest::points_accessed_per_thread
int64_t points_accessed_per_thread(const Anderson2021Params &params, const Target &target, const GPULoopInfo &gpu_loop_info, const std::vector< const FunctionDAG::Edge * > &edge_chain, const LoadJacobian &jac, const LoopNest *parent, const LoopNest *grandparent, int64_t n, const ScheduleFeatures &feat, const LoadJacobian &serial_jac, bool producer_has_been_scheduled, int producer_innermost_dim, const GPUMemoryType &mem_type, bool verbose) const

Halide::Internal::Autoscheduler::LoopNest::has_constant_region_required
bool has_constant_region_required(const FunctionDAG::Node *node) const

Halide::Internal::Autoscheduler::LoopNest::num_serial_loops
int num_serial_loops() const

Halide::Internal::Autoscheduler::LoopNest::features
std::map< uint64_t, StageMap< ScheduleFeatures > > features
Definition LoopNest.h:146

Halide::Internal::Autoscheduler::LoopNest::get_pure_stage_vectorized_loop_index
int get_pure_stage_vectorized_loop_index(const FunctionDAG::Node *node) const

Halide::Internal::Autoscheduler::LoopNest::node
const FunctionDAG::Node * node
Definition LoopNest.h:57

Halide::Internal::Autoscheduler::LoopNest::dump
void dump(T &stream, string prefix, const LoopNest *parent) const

Halide::Internal::Autoscheduler::LoopNest::product_of_self_and_descendants
int64_t product_of_self_and_descendants(int loop_index) const

Halide::Internal::Autoscheduler::LoopNest::gpu_label
GPU_parallelism gpu_label
Definition LoopNest.h:131

Halide::Internal::Autoscheduler::LoopNest::all_strides_exist
bool all_strides_exist(const LoadJacobian &jac, const FunctionDAG::Node *storage_node, const LoopNest &root) const

Halide::Internal::Autoscheduler::LoopNest::recompute_inlined_features
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const

Halide::Internal::Autoscheduler::LoopNest::inline_func
void inline_func(const FunctionDAG::Node *f)

Halide::Internal::Autoscheduler::LoopNest::generate_vec_dim_serial_tilings
void generate_vec_dim_serial_tilings(vector< int > &serial_sizes) const

Halide::Internal::Autoscheduler::LoopNest::region_computed_shrinks
bool region_computed_shrinks(const FunctionDAG::Node *f, const LoopNest *parent) const

Halide::Internal::Autoscheduler::LoopNest::compute_gpu_store_features
void compute_gpu_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const GPULoopInfo &gpu_loop_info, const std::vector< int64_t > &inner_serial_loop_extents, const Sites &consumer_site, ScheduleFeatures &feat, const LoopNest *parent, const LoopNest &root, GlobalMemInfo &global_mem_loads, SharedMemInfo &shared_mem_loads, LocalMemInfo &local_mem_loads, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::compute_licm_amortization
int64_t compute_licm_amortization(const LoopNest *innermost, const LoopNest *parent, const ScheduleFeatures &feat, const LoadJacobian &jac, int producer_dims) const

Halide::Internal::Autoscheduler::LoopNest::product_of_descendants
int64_t product_of_descendants(int loop_index) const

Halide::Internal::Autoscheduler::LoopNest::requires_dynamic_allocation
bool requires_dynamic_allocation(const FunctionDAG::Node *f, const Target &target, bool in_threads_loop) const

Halide::Internal::Autoscheduler::LoopNest::is_gpu_block
bool is_gpu_block(const Target &target) const
Definition LoopNest.h:156

Halide::Internal::Autoscheduler::LoopNest::node_has_dynamic_region_computed
bool node_has_dynamic_region_computed(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::exceeds_serial_extents_limit
bool exceeds_serial_extents_limit(const Target &target, const LoopNest *parent, bool in_threads_loop) const

Halide::Internal::Autoscheduler::LoopNest::collect_stages
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const

Halide::Internal::Autoscheduler::LoopNest::accesses_input_buffer
bool accesses_input_buffer() const

Halide::Internal::Autoscheduler::LoopNest::size
std::vector< int64_t > size
Definition LoopNest.h:39

Halide::Internal::Autoscheduler::LoopNest::get_bounds_along_edge_chain
Bound get_bounds_along_edge_chain(const FunctionDAG::Node *f, const vector< const FunctionDAG::Edge * > &edge_chain) const

Halide::Internal::Autoscheduler::LoopNest::get_total_constant_local_mem_alloc_size
int64_t get_total_constant_local_mem_alloc_size() const

Halide::Internal::Autoscheduler::LoopNest::get_bounds
const Bound & get_bounds(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::compute_local_mem_store_features
std::pair< double, double > compute_local_mem_store_features(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const LoopNest &root, double serial_loop_extents) const

Halide::Internal::Autoscheduler::LoopNest::vectorized_load_access_size
int vectorized_load_access_size(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, const GPUMemoryType &mem_type, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::all_paths_to_leaves_have_thread_loop
bool all_paths_to_leaves_have_thread_loop() const

Halide::Internal::Autoscheduler::LoopNest::get_gpu_memory_type
GPUMemoryType get_gpu_memory_type(bool in_block, bool in_thread, bool is_inlined=false) const

Halide::Internal::Autoscheduler::LoopNest::compute_warp_and_block_occupancy
void compute_warp_and_block_occupancy(const Anderson2021Params &params, ScheduleFeatures &feat, const GPULoopInfo &gpu_loop_info) const

Halide::Internal::Autoscheduler::LoopNest::get_union_thread_counts
vector< int64_t > get_union_thread_counts(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::vector_dim
int vector_dim
Definition LoopNest.h:72

Halide::Internal::Autoscheduler::LoopNest::collect_nodes_that_should_be_inlined
void collect_nodes_that_should_be_inlined(const NodeMap< bool > &nodes_to_freeze, NodeMap< bool > &inlined_nodes) const

Halide::Internal::Autoscheduler::LoopNest::has_valid_thread_extents
bool has_valid_thread_extents() const

Halide::Internal::Autoscheduler::LoopNest::apply
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState > > &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site, const Target &target, std::vector< StageScheduleState * > &ancestors, const NodeMap< bool > &all_inlined) const

Halide::Internal::Autoscheduler::LoopNest::feature_intermediates
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates
Definition LoopNest.h:145

Halide::Internal::Autoscheduler::LoopNest::get_actual_vector_dim
int get_actual_vector_dim(const Bound &store_bounds) const

Halide::Internal::Autoscheduler::LoopNest::compute_shared_mem_occupancy
void compute_shared_mem_occupancy(const Anderson2021Params &params, const Target &target, int64_t total_shared_mem_alloc_size, ScheduleFeatures &feat) const

Halide::Internal::Autoscheduler::LoopNest::compute_strides
Strides compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::get_vectorized_loop_index_from_pure_stage
int get_vectorized_loop_index_from_pure_stage(const LoopNest &root) const

Halide::Internal::Autoscheduler::LoopNest::computes
bool computes(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::storage_stride
double storage_stride(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const LoopNest &root) const

Halide::Internal::Autoscheduler::LoopNest::tileable
bool tileable
Definition LoopNest.h:66

Halide::Internal::Autoscheduler::LoopNest::children
std::vector< IntrusivePtr< const LoopNest > > children
Definition LoopNest.h:42

Halide::Internal::Autoscheduler::LoopNest::set_working_set_at_task_feature
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
Definition LoopNest.h:271

Halide::Internal::Autoscheduler::LoopNest::copy_from
void copy_from(const LoopNest &n)

Halide::Internal::Autoscheduler::LoopNest::promote_allocs_to_registers
bool promote_allocs_to_registers(const Target &target, StageMap< Sites > &sites) const

Halide::Internal::Autoscheduler::LoopNest::get_block_and_serial_extents
std::pair< int64_t, int64_t > get_block_and_serial_extents(const LoopNest *block) const

Halide::Internal::Autoscheduler::LoopNest::set_bounds
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
Definition LoopNest.h:484

Halide::Internal::Autoscheduler::LoopNest::has_dynamic_allocation_inside_thread
bool has_dynamic_allocation_inside_thread(bool in_thread_loop) const

Halide::Internal::Autoscheduler::LoopNest::memoize_points_computed_minimum
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const

Halide::Internal::Autoscheduler::LoopNest::has_constant_region_computed
bool has_constant_region_computed(const FunctionDAG::Node *node) const

Halide::Internal::Autoscheduler::LoopNest::compute_local_mem_stride
double compute_local_mem_stride(double stride, double bytes) const

Halide::Internal::Autoscheduler::LoopNest::compute_mem_store_info
MemInfoType< T > compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const

Halide::Internal::Autoscheduler::LoopNest::add_gpu_thread_tilings
bool add_gpu_thread_tilings(const FunctionDAG::Node *f, const Anderson2021Params &params, const Target &target, int v, vector< IntrusivePtr< const LoopNest > > &result, const vector< int64_t > &max_size)

Halide::Internal::Autoscheduler::LoopNest::parallelize_in_tiles
IntrusivePtr< const LoopNest > parallelize_in_tiles(const vector< int64_t > &tiling, const LoopNest *parent, const Anderson2021Params &params, const Target &target, bool inner_tiling, bool adjust_tiling, bool move_all_rvars_inward=true, const vector< int > &rvars_to_move_inward={}) const

Halide::Internal::Autoscheduler::LoopNest::memoize_features
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const

Halide::Internal::Autoscheduler::LoopNest::collect_all_inlined
void collect_all_inlined(NodeMap< bool > &all_inlined) const

Halide::Internal::Autoscheduler::LoopNest::unrolled_loops
std::vector< int > unrolled_loops(const Target &target, const LoopNest *parent, const LoopNest *grandparent) const

Halide::Internal::Autoscheduler::LoopNest::to_string
std::string to_string() const

Halide::Internal::Autoscheduler::LoopNest::is_gpu_serial
bool is_gpu_serial(const Target &target) const
Definition LoopNest.h:148

Halide::Internal::Autoscheduler::LoopNest::max_idle_lane_wastage
double max_idle_lane_wastage(const Target &target, GPULoopInfo gpu_loop_info) const

Halide::Internal::Autoscheduler::LoopNest::hash_combine
static void hash_combine(uint64_t &h, uint64_t next)
Definition LoopNest.h:193

Halide::Internal::Autoscheduler::LoopNest::find_pure_stage_loop_nest
const LoopNest * find_pure_stage_loop_nest(const FunctionDAG::Node *node) const

Halide::Internal::Autoscheduler::LoopNest::dump
void dump(std::ostream &os, string prefix, const LoopNest *parent) const

Halide::Internal::Autoscheduler::LoopNest::ref_count
RefCount ref_count
Definition LoopNest.h:35

Halide::Internal::Autoscheduler::LoopNest::bounds
NodeMap< Bound > bounds
Definition LoopNest.h:54

Halide::Internal::Autoscheduler::LoopNest::get_stage_sizes
void get_stage_sizes(const FunctionDAG::Node *f, vector< vector< int64_t > > &stage_sizes, vector< vector< int > > &pure_dims, vector< int > &vectorized_indices) const

Halide::Internal::Autoscheduler::LoopNest::compute_hash_of_producers_stored_at_root
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const

Halide::Internal::Autoscheduler::LoopNest::max_inlined_calls
int64_t max_inlined_calls() const

Halide::Internal::Autoscheduler::LoopNest::stage
const FunctionDAG::Node::Stage * stage
Definition LoopNest.h:60

Halide::Internal::Autoscheduler::LoopNest::innermost
bool innermost
Definition LoopNest.h:63

Halide::Internal::Autoscheduler::LoopNest::other_stage_has_same_producer
bool other_stage_has_same_producer(const FunctionDAG::Node *producer) const

Halide::Internal::Autoscheduler::LoopNest::get_stages_computed_in_each_compute_root_loop
void get_stages_computed_in_each_compute_root_loop(StageMap< StageMap< bool > > &descendants, const LoopNest *compute_root_loop_nest=nullptr) const

Halide::Internal::Autoscheduler::LoopNest::compute_features
void compute_features(const FunctionDAG &dag, const Anderson2021Params &params, const Target &target, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, GPULoopInfo gpu_loop_info, bool use_memoized_features, const StageMap< int64_t > &total_shared_mem_alloc_sizes, int64_t *working_set, int64_t *working_set_local_constant, int64_t *working_set_local_dynamic, StageMap< ScheduleFeatures > *features, Statistics &stats, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::get_sites
void get_sites(const Target &target, StageMap< Sites > &sites, StageMap< int64_t > &shared_mem_alloc_sizes, const LoopNest *task=nullptr, const LoopNest *parent=nullptr, const LoopNest *current_thread_loop=nullptr) const

Halide::Internal::Autoscheduler::LoopNest::parallel
bool parallel
Definition LoopNest.h:69

Halide::Internal::Autoscheduler::LoopNest::calls
bool calls(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::producer_computed_here_or_further_in
bool producer_computed_here_or_further_in(const FunctionDAG::Node *producer) const

Halide::Internal::Autoscheduler::LoopNest::copy_from_including_features
void copy_from_including_features(const LoopNest &n)

Halide::Internal::Autoscheduler::LoopNest::can_vectorize_access_for_innermost_dim
bool can_vectorize_access_for_innermost_dim(const LoadJacobian &jac, const FunctionDAG::Node *accessed, int innermost_dim, int loop_index) const

Halide::Internal::Autoscheduler::LoopNest::update_producers_to_be_staged
void update_producers_to_be_staged(StageScheduleState &state, const NodeMap< bool > &all_inlined) const

Halide::Internal::Autoscheduler::LoopNest::has_thread_loop_descendant
bool has_thread_loop_descendant() const

Halide::Internal::Autoscheduler::LoopNest::can_vectorize_store_access
bool can_vectorize_store_access(const LoadJacobian &jac, const FunctionDAG::Node *accessed, bool accessed_has_been_scheduled, int innermost_dim, int loop_index, const GPUMemoryType &mem_type) const

Halide::Internal::Autoscheduler::LoopNest::dump
void dump() const

Halide::Internal::Autoscheduler::LoopNest::children
vector< IntrusivePtr< const LoopNest > > children
Definition LoopNest.h:95

Halide::Internal::Autoscheduler::LoopNest::compute_mem_load_features
void compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType< T > &mem_info, double serial_loop_extents, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::get_allocs_that_can_be_promoted_to_registers
void get_allocs_that_can_be_promoted_to_registers(const Target &target, StageMap< Sites > &sites, NodeMap< bool > &can_be_promoted_to_registers, const LoopNest *grandparent, const LoopNest *parent) const

Halide::Internal::Autoscheduler::LoopNest::compute_warp_features
void compute_warp_features(ScheduleFeatures &features, const GPULoopInfo &gpu_loop_info) const

Halide::Internal::Autoscheduler::LoopNest::size
vector< int64_t > size
Definition LoopNest.h:92

Halide::Internal::Autoscheduler::LoopNest::structural_hash
void structural_hash(uint64_t &h, int depth) const

Halide::Internal::Autoscheduler::LoopNest::funcs_realized_or_inlined
size_t funcs_realized_or_inlined() const
Definition LoopNest.h:205

Halide::Internal::Autoscheduler::LoopNest::compute_num_mem_accesses_per_block
void compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType< T > &mem_info, bool verbose=false) const

Halide::Internal::Autoscheduler::LoopNest::find_innermost_and_parent
std::pair< const LoopNest *, const LoopNest * > find_innermost_and_parent() const

Halide::Internal::Autoscheduler::LoopNest::store_at
std::set< const FunctionDAG::Node * > store_at
Definition LoopNest.h:49

Halide::Internal::Autoscheduler::LoopNest::compute_alloc_size_of_node_here
std::pair< int64_t, bool > compute_alloc_size_of_node_here(const FunctionDAG::Node *f) const

Halide::Internal::Autoscheduler::LoopNest::compute_here
bool compute_here(const FunctionDAG::Node *f, bool tileable, int v, bool in_threads_loop, const Anderson2021Params &params, const Target &target)

Halide::Internal::Autoscheduler::LoopNest::get_total_local_mem_alloc_size
int64_t get_total_local_mem_alloc_size(bool constant_allocs_only=false, bool in_threads_loop=false) const

Halide::Internal::Autoscheduler::LoopNest::inlined
NodeMap< int64_t > inlined
Definition LoopNest.h:46

Halide::Internal::Autoscheduler::LoopNest::compute_working_set_from_features
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const

Halide::Internal::Autoscheduler::LoopNest::vectorized_access_size
int vectorized_access_size(size_t loop_index, bool verbose=false) const

Halide::Internal::Autoscheduler::MemInfo
Definition GPUMemInfo.h:56

Halide::Internal::Autoscheduler::SearchSpaceOptions
Definition SearchSpaceOptions.h:12

Halide::Internal::Autoscheduler::Statistics
Definition Statistics.h:65

Halide::Internal::Autoscheduler::Strides
Definition GPUMemInfo.h:115

Halide::Internal::Autoscheduler::ThreadInfo
Definition ThreadInfo.h:40

Halide::Internal::IntrusivePtr
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition IntrusivePtr.h:71

Halide::Internal::ScheduleFeatures
Definition Featurization.h:167

Halide::Target
A struct representing a target machine and os to generate code for.
Definition Target.h:19

Halide::Target::has_gpu_feature
bool has_gpu_feature() const
Is a fully feature GPU compute runtime enabled? I.e.

Halide::VarOrRVar
A class that can represent Vars or RVars.
Definition Func.h:29