Halide
Halide::Internal::Autoscheduler Namespace Reference

Classes

struct  Adams2019Params
 
struct  Anderson2021Params
 
struct  BoundContents
 
struct  Cache
 
struct  CachingOptions
 
class  ExprBranching
 
struct  Filter
 
struct  FunctionDAG
 
struct  GlobalAccessAccumulator
 
struct  GPULoopInfo
 
class  LoadJacobian
 
struct  LocalAccessAccumulator
 
struct  LoopNest
 
class  LoopNestParser
 
struct  MemInfo
 
struct  MemTraits
 
struct  MemTraits< GlobalMem >
 
struct  MemTraits< LocalMem >
 
struct  MemTraits< SharedMem >
 
struct  NoOpMutator
 
struct  OptionalRational
 
class  ParamParser
 
struct  ScopedStatistic
 
struct  ScopedTimer
 
struct  SearchSpace
 
struct  SearchSpaceOptions
 
struct  SharedAccessAccumulator
 
class  Span
 
struct  State
 
class  StateQueue
 
struct  Statistics
 
struct  Strides
 
struct  ThreadInfo
 
struct  ThreadTileOption
 
struct  Timer
 

Typedefs

typedef PerfectHashMap< FunctionDAG::Node::Stage, ScheduleFeatures > StageMapOfScheduleFeatures
 
using BlockCache = NodeMap< std::map< int, std::vector< IntrusivePtr< const LoopNest > >> >
 
using Bound = IntrusivePtr< const BoundContents >
 
template<typename T >
using NodeMap = PerfectHashMap< FunctionDAG::Node, T >
 
template<typename T >
using StageMap = PerfectHashMap< FunctionDAG::Node::Stage, T >
 
using Clock = std::chrono::high_resolution_clock
 
template<typename T >
using Accumulator = typename MemTraits< T >::Accumulator
 
template<typename T >
using MemInfoType = MemInfo< typename MemTraits< T >::MemInfoType >
 
using GlobalMemInfo = MemInfoType< GlobalMem >
 
using SharedMemInfo = MemInfoType< SharedMem >
 
using LocalMemInfo = MemInfoType< LocalMem >
 

Enumerations

enum  GPU_parallelism {
  GPU_parallelism::Block, GPU_parallelism::Thread, GPU_parallelism::Serial, GPU_parallelism::Simd,
  GPU_parallelism::Parallelized, GPU_parallelism::None
}
 
enum  GPUMemoryType {
  GPUMemoryType::Global, GPUMemoryType::Shared, GPUMemoryType::Local, GPUMemoryType::Registers,
  GPUMemoryType::Inlined
}
 

Functions

void find_and_apply_schedule (FunctionDAG &dag, const std::vector< Function > &outputs, const Adams2019Params &params, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features)
 
std::vector< std::vector< int64_t > > generate_tilings (const vector< int64_t > &s, int d, int factor, bool allow_splits)
 
const LoopNest * deepest_common_ancestor (const std::map< const LoopNest *, std::pair< const LoopNest *, int >> &parents, const LoopNest *a, const LoopNest *b)
 
void compute_loop_nest_parents (std::map< const LoopNest *, std::pair< const LoopNest *, int >> &parents, const LoopNest *here, int depth)
 
void find_and_apply_schedule (FunctionDAG &dag, const std::vector< Function > &outputs, const Anderson2021Params &params, const Target &target, CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features)
 
void sanitize_names (std::string &str)
 
std::string stringify (GPU_parallelism label)
 
bool may_subtile (const Anderson2021Params &params)
 
int64_t get_shared_memory_limit (const Anderson2021Params &params)
 
int64_t get_active_block_hardware_limit (const Anderson2021Params &params)
 
int64_t get_active_warp_hardware_limit (const Anderson2021Params &params)
 
constexpr int64_t get_register_mem_alloc_limit ()
 
int get_unroll_limit (const Target &target)
 
bool in_range_zero_one (double x)
 
bool are_valid_thread_extents (const vector< int64_t > &counts)
 
double get_idle_lane_wastage_limit_env_var ()
 
double get_idle_lane_wastage_limit ()
 
bool all (const vector< int > &v)
 
bool accessed_at_constant_indices (const std::vector< int > &unrolled, const FunctionDAG::Edge *e)
 
bool verify_memoized_features ()
 
bool is_memoize_blocks_enabled ()
 
double get_stack_memory_adjustment_factor ()
 
int64_t get_stack_memory_limit ()
 
bool use_adjusted_tilings ()
 
bool compute_root_and_inline_only ()
 
template<typename PostCreateMutator >
void deep_copy_loop_nest (LoopNest *new_loop_nest, const LoopNest *new_loop_nest_parent, const IntrusivePtr< const LoopNest > &existing_loop_nest, const PostCreateMutator &post_create_mutator)
 
template<typename PostCreateMutator >
LoopNest * deep_copy_loop_nest (const IntrusivePtr< const LoopNest > &loop_nest, const PostCreateMutator &post_create_mutator)
 
template<typename A , typename B >
void expect_eq (int line, const A &expected, const B &actual)
 
template<typename A , typename B >
void approx_eq (int line, const A &expected, const B &actual, float epsilon)
 
template<typename A >
void expect (int line, const A &expected)
 
bool all_ones (const std::vector< int64_t > &nums)
 
bool equal_to_existing_size (const std::vector< int64_t > &s, const std::vector< int64_t > &nums)
 
std::vector< std::vector< int64_t > > generate_serial_tilings (const std::vector< int64_t > &s, int d, int last_d, int vectorized_index, const std::vector< int > &vec_dim_serial_sizes, bool filter_small_outer_extents=false, bool allow_inner_ones=false)
 
std::vector< std::vector< int64_t > > generate_tilings (const std::vector< int64_t > &s, int d, int factor, bool allow_splits, const std::vector< int > &inner_sizes=std::vector< int >())
 
void lowered_dims (const std::vector< int64_t > &size, int vector_loop_i, std::vector< int64_t > &lowered_size)
 moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered More...
 
std::vector< std::vector< int64_t > > generate_gpu_tilings (const std::vector< std::vector< int64_t >> &stage_sizes, const std::vector< std::vector< int >> &pure_dims, const std::vector< int64_t > &max_s, int d, const std::vector< int > &vectorized_indices, bool serial_inner, bool is_compute_root_stage)
 

Variables

constexpr int kLocalMemoryLimit = 524288
 

Typedef Documentation

◆ StageMapOfScheduleFeatures

◆ BlockCache

using Halide::Internal::Autoscheduler::BlockCache = typedef NodeMap<std::map<int, std::vector<IntrusivePtr<const LoopNest> >> >

Definition at line 89 of file Cache.h.

◆ Bound

◆ NodeMap

Definition at line 21 of file LoopNest.h.

◆ StageMap

Definition at line 24 of file LoopNest.h.

◆ Clock

typedef std::chrono::high_resolution_clock Halide::Internal::Autoscheduler::Clock

Definition at line 15 of file Timer.h.

◆ Accumulator

template<typename T >
using Halide::Internal::Autoscheduler::Accumulator = typedef typename MemTraits<T>::Accumulator

Definition at line 53 of file GPUMemInfo.h.

◆ MemInfoType

template<typename T >
using Halide::Internal::Autoscheduler::MemInfoType = typedef MemInfo<typename MemTraits<T>::MemInfoType>

Definition at line 109 of file GPUMemInfo.h.

◆ GlobalMemInfo

Definition at line 111 of file GPUMemInfo.h.

◆ SharedMemInfo

Definition at line 112 of file GPUMemInfo.h.

◆ LocalMemInfo

Definition at line 113 of file GPUMemInfo.h.

Enumeration Type Documentation

◆ GPU_parallelism

Enumerator
Block 
Thread 
Serial 
Simd 
Parallelized 
None 

Definition at line 32 of file LoopNest.h.

◆ GPUMemoryType

Enumerator
Global 
Shared 
Local 
Registers 
Inlined 

Definition at line 42 of file LoopNest.h.

Function Documentation

◆ find_and_apply_schedule() [1/2]

void Halide::Internal::Autoscheduler::find_and_apply_schedule ( FunctionDAG &  dag,
const std::vector< Function > &  outputs,
const Adams2019Params &  params,
CostModel *  cost_model,
int  beam_size,
StageMapOfScheduleFeatures *  schedule_features 
)

◆ generate_tilings() [1/2]

std::vector<std::vector<int64_t> > Halide::Internal::Autoscheduler::generate_tilings ( const vector< int64_t > &  s,
int  d,
int  factor,
bool  allow_splits 
)

◆ deepest_common_ancestor()

const LoopNest* Halide::Internal::Autoscheduler::deepest_common_ancestor ( const std::map< const LoopNest *, std::pair< const LoopNest *, int >> &  parents,
const LoopNest *  a,
const LoopNest *  b 
)

◆ compute_loop_nest_parents()

void Halide::Internal::Autoscheduler::compute_loop_nest_parents ( std::map< const LoopNest *, std::pair< const LoopNest *, int >> &  parents,
const LoopNest *  here,
int  depth 
)

◆ find_and_apply_schedule() [2/2]

void Halide::Internal::Autoscheduler::find_and_apply_schedule ( FunctionDAG &  dag,
const std::vector< Function > &  outputs,
const Anderson2021Params &  params,
const Target &  target,
CostModel *  cost_model,
int  beam_size,
StageMapOfScheduleFeatures *  schedule_features 
)

◆ sanitize_names()

void Halide::Internal::Autoscheduler::sanitize_names ( std::string &  str)

◆ stringify()

std::string Halide::Internal::Autoscheduler::stringify ( GPU_parallelism  label)

◆ may_subtile()

bool Halide::Internal::Autoscheduler::may_subtile ( const Anderson2021Params &  params)

◆ get_shared_memory_limit()

int64_t Halide::Internal::Autoscheduler::get_shared_memory_limit ( const Anderson2021Params &  params)

◆ get_active_block_hardware_limit()

int64_t Halide::Internal::Autoscheduler::get_active_block_hardware_limit ( const Anderson2021Params &  params)

◆ get_active_warp_hardware_limit()

int64_t Halide::Internal::Autoscheduler::get_active_warp_hardware_limit ( const Anderson2021Params &  params)

◆ get_register_mem_alloc_limit()

constexpr int64_t Halide::Internal::Autoscheduler::get_register_mem_alloc_limit ( )
constexpr

Definition at line 56 of file LoopNest.h.

◆ get_unroll_limit()

int Halide::Internal::Autoscheduler::get_unroll_limit ( const Target &  target)

◆ in_range_zero_one()

bool Halide::Internal::Autoscheduler::in_range_zero_one ( double  x)

◆ are_valid_thread_extents()

bool Halide::Internal::Autoscheduler::are_valid_thread_extents ( const vector< int64_t > &  counts)

◆ get_idle_lane_wastage_limit_env_var()

double Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit_env_var ( )

◆ get_idle_lane_wastage_limit()

double Halide::Internal::Autoscheduler::get_idle_lane_wastage_limit ( )

◆ all()

bool Halide::Internal::Autoscheduler::all ( const vector< int > &  v)

◆ accessed_at_constant_indices()

bool Halide::Internal::Autoscheduler::accessed_at_constant_indices ( const std::vector< int > &  unrolled,
const FunctionDAG::Edge *  e 
)

◆ verify_memoized_features()

bool Halide::Internal::Autoscheduler::verify_memoized_features ( )

◆ is_memoize_blocks_enabled()

bool Halide::Internal::Autoscheduler::is_memoize_blocks_enabled ( )

◆ get_stack_memory_adjustment_factor()

double Halide::Internal::Autoscheduler::get_stack_memory_adjustment_factor ( )

◆ get_stack_memory_limit()

int64_t Halide::Internal::Autoscheduler::get_stack_memory_limit ( )

◆ use_adjusted_tilings()

bool Halide::Internal::Autoscheduler::use_adjusted_tilings ( )

◆ compute_root_and_inline_only()

bool Halide::Internal::Autoscheduler::compute_root_and_inline_only ( )

◆ deep_copy_loop_nest() [1/2]

template<typename PostCreateMutator >
void Halide::Internal::Autoscheduler::deep_copy_loop_nest ( LoopNest *  new_loop_nest,
const LoopNest *  new_loop_nest_parent,
const IntrusivePtr< const LoopNest > &  existing_loop_nest,
const PostCreateMutator &  post_create_mutator 
)

◆ deep_copy_loop_nest() [2/2]

template<typename PostCreateMutator >
LoopNest* Halide::Internal::Autoscheduler::deep_copy_loop_nest ( const IntrusivePtr< const LoopNest > &  loop_nest,
const PostCreateMutator &  post_create_mutator 
)

Definition at line 63 of file State.h.

References deep_copy_loop_nest().

◆ expect_eq()

template<typename A , typename B >
void Halide::Internal::Autoscheduler::expect_eq ( int  line,
const A &  expected,
const B &  actual 
)

Definition at line 16 of file test.h.

References user_assert.

◆ approx_eq()

template<typename A , typename B >
void Halide::Internal::Autoscheduler::approx_eq ( int  line,
const A &  expected,
const B &  actual,
float  epsilon 
)

Definition at line 24 of file test.h.

References Halide::abs(), and user_assert.

◆ expect()

template<typename A >
void Halide::Internal::Autoscheduler::expect ( int  line,
const A &  expected 
)

Definition at line 32 of file test.h.

References user_assert.

◆ all_ones()

bool Halide::Internal::Autoscheduler::all_ones ( const std::vector< int64_t > &  nums)

◆ equal_to_existing_size()

bool Halide::Internal::Autoscheduler::equal_to_existing_size ( const std::vector< int64_t > &  s,
const std::vector< int64_t > &  nums 
)

◆ generate_serial_tilings()

std::vector<std::vector<int64_t> > Halide::Internal::Autoscheduler::generate_serial_tilings ( const std::vector< int64_t > &  s,
int  d,
int  last_d,
int  vectorized_index,
const std::vector< int > &  vec_dim_serial_sizes,
bool  filter_small_outer_extents = false,
bool  allow_inner_ones = false 
)

◆ generate_tilings() [2/2]

std::vector<std::vector<int64_t> > Halide::Internal::Autoscheduler::generate_tilings ( const std::vector< int64_t > &  s,
int  d,
int  factor,
bool  allow_splits,
const std::vector< int > &  inner_sizes = std::vector< int >() 
)

◆ lowered_dims()

void Halide::Internal::Autoscheduler::lowered_dims ( const std::vector< int64_t > &  size,
int  vector_loop_i,
std::vector< int64_t > &  lowered_size 
)

moves vectorized dimension first and also removes dimensions with size 1 to reflect actual thread dimensions when loop nests are lowered

◆ generate_gpu_tilings()

std::vector<std::vector<int64_t> > Halide::Internal::Autoscheduler::generate_gpu_tilings ( const std::vector< std::vector< int64_t >> &  stage_sizes,
const std::vector< std::vector< int >> &  pure_dims,
const std::vector< int64_t > &  max_s,
int  d,
const std::vector< int > &  vectorized_indices,
bool  serial_inner,
bool  is_compute_root_stage 
)

Variable Documentation

◆ kLocalMemoryLimit

constexpr int Halide::Internal::Autoscheduler::kLocalMemoryLimit = 524288
constexpr

Definition at line 32 of file State.h.