Go to the documentation of this file.
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
20 #include <map>
21 #include <utility>
23 namespace Halide {
25 class OutputImageParam;
26 class ParamMap;
28 /** A class that can represent Vars or RVars. Used for reorder calls
29  * which can accept a mix of either. */
30 struct VarOrRVar {
 /** Build from a bare name. Both the `var` and the `rvar` member are
  * initialised from `n`; `r` is stored in `is_rvar` and decides which of
  * the two name() reports. */
31  VarOrRVar(const std::string &n, bool r)
32  : var(n), rvar(n), is_rvar(r) {
33  }
 /** Implicit conversion from a Var: stores `v` in `var` and clears `is_rvar`. */
34  VarOrRVar(const Var &v)
35  : var(v), is_rvar(false) {
36  }
 /** Implicit conversion from an RVar: stores `r` in `rvar` and sets `is_rvar`. */
37  VarOrRVar(const RVar &r)
38  : rvar(r), is_rvar(true) {
39  }
 /** Implicit conversion from an RDom: the RDom is first converted with
  * RVar(r), the result is stored in `rvar`, and `is_rvar` is set. */
40  VarOrRVar(const RDom &r)
41  : rvar(RVar(r)), is_rvar(true) {
42  }
 /** Templated conversion that stores its argument `u` in `var` and clears
  * `is_rvar`.
  * NOTE(review): the constructor's declarator line (listing line 44) is
  * not present in this listing, so the parameter type of `u` cannot be
  * confirmed from here. */
43  template<int N>
45  : var(u), is_rvar(false) {
46  }
 /** Name of whichever member is active: rvar.name() when `is_rvar` is
  * true, var.name() otherwise. */
48  const std::string &name() const {
49  if (is_rvar) {
50  return rvar.name();
51  } else {
52  return var.name();
53  }
54  }
 // NOTE(review): the declarations of the `var` and `rvar` members used
 // above (listing lines 55-57) are not present in this listing.
 /** True when this object stands for an RVar (use `rvar`), false when it
  * stands for a Var (use `var`). */
58  bool is_rvar;
59 };
61 class ImageParam;
63 namespace Internal {
64 class Function;
65 struct Split;
66 struct StorageDim;
67 } // namespace Internal
69 /** A single definition of a Func. May be a pure or update definition. */
70 class Stage {
71  /** Reference to the Function this stage (or definition) belongs to. */
72  Internal::Function function;
73  Internal::Definition definition;
74  /** Indicate which stage the definition belongs to (0 for initial
75  * definition, 1 for first update, etc.). */
76  size_t stage_index;
77  /** Pure Vars of the Function (from the init definition). */
78  std::vector<Var> dim_vars;
80  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82  void split(const std::string &old, const std::string &outer, const std::string &inner,
83  const Expr &factor, bool exact, TailStrategy tail);
84  void remove(const std::string &var);
85  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
87  const std::vector<Internal::StorageDim> &storage_dims() const {
88  return function.schedule().storage_dims();
89  }
91  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
93 public:
94  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
95  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96  internal_assert(definition.defined());
98  dim_vars.reserve(function.args().size());
99  for (const auto &arg : function.args()) {
100  dim_vars.emplace_back(arg);
101  }
102  internal_assert(definition.args().size() == dim_vars.size());
103  }
105  /** Return the current StageSchedule associated with this Stage. For
106  * introspection only: to modify schedule, use the Func interface. */
108  return definition.schedule();
109  }
111  /** Return a string describing the current var list taking into
112  * account all the splits, reorders, and tiles. */
113  std::string dump_argument_list() const;
115  /** Return the name of this stage, e.g. "f.update(2)" */
116  std::string name() const;
118  /** Calling rfactor() on an associative update definition of a Func will split
119  * the update into an intermediate which computes the partial results and
120  * replaces the current update definition with a new definition which merges
121  * the partial results. If called on a init/pure definition, this will
122  * throw an error. rfactor() will automatically infer the associative reduction
123  * operator and identity of the operator. If it can't prove the operation
124  * is associative or if it cannot find an identity for that operator, this
125  * will throw an error. In addition, commutativity of the operator is required
126  * if rfactor() is called on the inner dimension but excluding the outer
127  * dimensions.
128  *
129  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
130  * The rvars not listed in 'preserved' are removed from the original Func and
131  * are lifted to the intermediate Func. The remaining rvars (the ones in
132  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
133  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
134  * applied to the original Func's update definition. The loop order of the
135  * intermediate Func's update definition is the same as the original, although
136  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
137  * intermediate Func's init definition from innermost to outermost is the args'
138  * order of the original Func's init definition followed by the new pure Vars.
139  *
140  * The intermediate Func also inherits storage order from the original Func
141  * with the new pure Vars added to the outermost.
142  *
143  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
144  \code
145  f(x, y) = 0;
146  f(x, y) += g(r.x, r.y);
147  \endcode
148  * into a pipeline like this:
149  \code
150  f_intm(x, y, u) = 0;
151  f_intm(x, y, u) += g(r.x, u);
153  f(x, y) = 0;
154  f(x, y) += f_intm(x, y, r.y);
155  \endcode
156  *
157  * This has a variety of uses. You can use it to split computation of an associative reduction:
158  \code
159  f(x, y) = 10;
160  RDom r(0, 96);
161  f(x, y) = max(f(x, y), g(x, y, r.x));
162  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
163  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
164  \endcode
165  *
166  * This is equivalent to:
167  \code
168  parallel for u = 0 to 11:
169  for y:
170  for x:
171  f_intm(x, y, u) = -inf
172  parallel for x:
173  for y:
174  parallel for u = 0 to 11:
175  for rxi = 0 to 7:
176  f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
177  for y:
178  for x:
179  f(x, y) = 10
180  parallel for x:
181  for y:
182  for rxo = 0 to 11:
183  f(x, y) = max(f(x, y), f_intm(x, y, rxo))
184  \endcode
185  *
186  */
187  // @{
188  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
189  Func rfactor(const RVar &r, const Var &v);
190  // @}
192  /** Schedule the iteration over this stage to be fused with another
193  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
194  * be computed AFTER 's' in the innermost fused dimension. There should not
195  * be any dependencies between those two fused stages. If either of the
196  * stages being fused is a stage of an extern Func, this will throw an error.
197  *
198  * Note that the two stages that are fused together should have the same
199  * exact schedule from the outermost to the innermost fused dimension, and
200  * the stage we are calling compute_with on should not have specializations,
201  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
202  *
203  * Also, if a producer is desired to be computed at the fused loop level,
204  * the function passed to the compute_at() needs to be the "parent". Consider
205  * the following code:
206  \code
207  input(x, y) = x + y;
208  f(x, y) = input(x, y);
209  f(x, y) += 5;
210  g(x, y) = x - y;
211  g(x, y) += 10;
212  f.compute_with(g, y);
213  f.update().compute_with(g.update(), y);
214  \endcode
215  *
216  * To compute 'input' at the fused loop level at dimension y, we specify
217  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
218  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
219  * is computed). On the other hand, to compute 'input' at the innermost
220  * dimension of 'f', we specify input.compute_at(f, x) instead of
221  * input.compute_at(g, x) since the x dimension of 'f' is not fused
222  * (only the y dimension is).
223  *
224  * Given the constraints, this has a variety of uses. Consider the
225  * following code:
226  \code
227  f(x, y) = x + y;
228  g(x, y) = x - y;
229  h(x, y) = f(x, y) + g(x, y);
230  f.compute_root();
231  g.compute_root();
232  f.split(x, xo, xi, 8);
233  g.split(x, xo, xi, 8);
234  g.compute_with(f, xo);
235  \endcode
236  *
237  * This is equivalent to:
238  \code
239  for y:
240  for xo:
241  for xi:
242  f(8*xo + xi) = (8*xo + xi) + y
243  for xi:
244  g(8*xo + xi) = (8*xo + xi) - y
245  for y:
246  for x:
247  h(x, y) = f(x, y) + g(x, y)
248  \endcode
249  *
250  * The size of the dimensions of the stages computed_with do not have
251  * to match. Consider the following code where 'g' is half the size of 'f':
252  \code
253  Image<int> f_im(size, size), g_im(size/2, size/2);
254  input(x, y) = x + y;
255  f(x, y) = input(x, y);
256  g(x, y) = input(2*x, 2*y);
257  g.compute_with(f, y);
258  input.compute_at(f, y);
259  Pipeline({f, g}).realize({f_im, g_im});
260  \endcode
261  *
262  * This is equivalent to:
263  \code
264  for y = 0 to size-1:
265  for x = 0 to size-1:
266  input(x, y) = x + y;
267  for x = 0 to size-1:
268  f(x, y) = input(x, y)
269  for x = 0 to size/2-1:
270  if (y < size/2):
271  g(x, y) = input(2*x, 2*y)
272  \endcode
273  *
274  * 'align' specifies how the loop iteration of each dimension of the
275  * two stages being fused should be aligned in the fused loop nests
276  * (see LoopAlignStrategy for options). Consider the following loop nests:
277  \code
278  for z = f_min_z to f_max_z:
279  for y = f_min_y to f_max_y:
280  for x = f_min_x to f_max_x:
281  f(x, y, z) = x + y + z
282  for z = g_min_z to g_max_z:
283  for y = g_min_y to g_max_y:
284  for x = g_min_x to g_max_x:
285  g(x, y, z) = x - y - z
286  \endcode
287  *
288  * If no alignment strategy is specified, the following loop nest will be
289  * generated:
290  \code
291  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
292  for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
293  for x = f_min_x to f_max_x:
294  if (f_min_z <= z <= f_max_z):
295  if (f_min_y <= y <= f_max_y):
296  f(x, y, z) = x + y + z
297  for x = g_min_x to g_max_x:
298  if (g_min_z <= z <= g_max_z):
299  if (g_min_y <= y <= g_max_y):
300  g(x, y, z) = x - y - z
301  \endcode
302  *
303  * Instead, these alignment strategies:
304  \code
305  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
306  \endcode
307  * will produce the following loop nest:
308  \code
309  f_loop_min_z = f_min_z
310  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
311  for z = f_min_z to f_loop_max_z:
312  f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
313  f_loop_max_y = f_max_y
314  for y = f_loop_min_y to f_loop_max_y:
315  for x = f_min_x to f_max_x:
316  if (f_loop_min_z <= z <= f_loop_max_z):
317  if (f_loop_min_y <= y <= f_loop_max_y):
318  f(x, y, z) = x + y + z
319  for x = g_min_x to g_max_x:
320  g_shift_z = g_min_z - f_loop_min_z
321  g_shift_y = g_max_y - f_loop_max_y
322  if (g_min_z <= (z + g_shift_z) <= g_max_z):
323  if (g_min_y <= (y + g_shift_y) <= g_max_y):
324  g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
325  \endcode
326  *
327  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
328  * of 'g' at dimension z so that its starting value matches that of 'f'.
329  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
330  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
331  */
332  // @{
333  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
334  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
335  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
336  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
337  // @}
339  /** Scheduling calls that control how the domain of this stage is
340  * traversed. See the documentation for Func for the meanings. */
341  // @{
343  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
344  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
345  Stage &serial(const VarOrRVar &var);
346  Stage &parallel(const VarOrRVar &var);
347  Stage &vectorize(const VarOrRVar &var);
348  Stage &unroll(const VarOrRVar &var);
349  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
350  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
353  const VarOrRVar &xo, const VarOrRVar &yo,
354  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
356  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
357  const VarOrRVar &xi, const VarOrRVar &yi,
358  const Expr &xfactor, const Expr &yfactor,
360  Stage &tile(const std::vector<VarOrRVar> &previous,
361  const std::vector<VarOrRVar> &outers,
362  const std::vector<VarOrRVar> &inners,
363  const std::vector<Expr> &factors,
364  const std::vector<TailStrategy> &tails);
365  Stage &tile(const std::vector<VarOrRVar> &previous,
366  const std::vector<VarOrRVar> &outers,
367  const std::vector<VarOrRVar> &inners,
368  const std::vector<Expr> &factors,
370  Stage &tile(const std::vector<VarOrRVar> &previous,
371  const std::vector<VarOrRVar> &inners,
372  const std::vector<Expr> &factors,
374  Stage &reorder(const std::vector<VarOrRVar> &vars);
376  template<typename... Args>
377  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
378  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
379  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
380  return reorder(collected_args);
381  }
383  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
384  Stage specialize(const Expr &condition);
385  void specialize_fail(const std::string &message);
387  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
388  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
389  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
391  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
395  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
396  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
397  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
399  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
400  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
401  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
402  DeviceAPI device_api = DeviceAPI::Default_GPU);
403  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
404  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
405  DeviceAPI device_api = DeviceAPI::Default_GPU);
407  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
409  DeviceAPI device_api = DeviceAPI::Default_GPU);
411  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
413  DeviceAPI device_api = DeviceAPI::Default_GPU);
414  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
415  const VarOrRVar &bx, const VarOrRVar &by,
416  const VarOrRVar &tx, const VarOrRVar &ty,
417  const Expr &x_size, const Expr &y_size,
419  DeviceAPI device_api = DeviceAPI::Default_GPU);
421  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
422  const VarOrRVar &tx, const VarOrRVar &ty,
423  const Expr &x_size, const Expr &y_size,
425  DeviceAPI device_api = DeviceAPI::Default_GPU);
427  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
428  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
429  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
430  const Expr &x_size, const Expr &y_size, const Expr &z_size,
432  DeviceAPI device_api = DeviceAPI::Default_GPU);
433  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
434  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
435  const Expr &x_size, const Expr &y_size, const Expr &z_size,
437  DeviceAPI device_api = DeviceAPI::Default_GPU);
440  Stage &atomic(bool override_associativity_test = false);
442  Stage &hexagon(const VarOrRVar &x = Var::outermost());
444  Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
446  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
448  template<typename T>
449  Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
451  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
452  }
453  // @}
455  /** Attempt to get the source file and line where this stage was
456  * defined by parsing the process's own debug symbols. Returns an
457  * empty string if no debug symbols were found or the debug
458  * symbols were not understood. Works on OS X and Linux only. */
459  std::string source_location() const;
461  /** Assert that this stage has intentionally been given no schedule, and
462  * suppress the warning about unscheduled update definitions that would
463  * otherwise fire. This counts as a schedule, so calling this twice on the
464  * same Stage will fail the assertion. */
465  void unscheduled();
466 };
468 // For backwards compatibility, keep the ScheduleHandle name.
471 class FuncTupleElementRef;
473 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
474  * z are Vars or Exprs. It could be the left hand side of a definition or
475  * an update definition, or it could be a call to a function. We don't know
476  * until we see how this object gets used.
477  */
478 class FuncRef {
479  Internal::Function func;
480  int implicit_placeholder_pos;
481  int implicit_count;
482  std::vector<Expr> args;
483  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
485  /** Helper for function update by Tuple. If the function does not
486  * already have a pure definition, init_val will be used as RHS of
487  * each tuple element in the initial function definition. */
488  template<typename BinaryOp>
489  Stage func_ref_update(const Tuple &e, int init_val);
491  /** Helper for function update by Expr. If the function does not
492  * already have a pure definition, init_val will be used as RHS in
493  * the initial function definition. */
494  template<typename BinaryOp>
495  Stage func_ref_update(Expr e, int init_val);
497 public:
498  FuncRef(const Internal::Function &, const std::vector<Expr> &,
499  int placeholder_pos = -1, int count = 0);
500  FuncRef(Internal::Function, const std::vector<Var> &,
501  int placeholder_pos = -1, int count = 0);
503  /** Use this as the left-hand-side of a definition or an update definition
504  * (see \ref RDom).
505  */
506  Stage operator=(const Expr &);
508  /** Use this as the left-hand-side of a definition or an update definition
509  * for a Func with multiple outputs. */
510  Stage operator=(const Tuple &);
512  /** Define a stage that adds the given expression to this Func. If the
513  * expression refers to some RDom, this performs a sum reduction of the
514  * expression over the domain. If the function does not already have a
515  * pure definition, this sets it to zero.
516  */
517  // @{
519  Stage operator+=(const Tuple &);
520  Stage operator+=(const FuncRef &);
521  // @}
523  /** Define a stage that adds the negative of the given expression to this
524  * Func. If the expression refers to some RDom, this performs a sum reduction
525  * of the negative of the expression over the domain. If the function does
526  * not already have a pure definition, this sets it to zero.
527  */
528  // @{
530  Stage operator-=(const Tuple &);
531  Stage operator-=(const FuncRef &);
532  // @}
534  /** Define a stage that multiplies this Func by the given expression. If the
535  * expression refers to some RDom, this performs a product reduction of the
536  * expression over the domain. If the function does not already have a pure
537  * definition, this sets it to 1.
538  */
539  // @{
541  Stage operator*=(const Tuple &);
542  Stage operator*=(const FuncRef &);
543  // @}
545  /** Define a stage that divides this Func by the given expression.
546  * If the expression refers to some RDom, this performs a product
547  * reduction of the inverse of the expression over the domain. If the
548  * function does not already have a pure definition, this sets it to 1.
549  */
550  // @{
552  Stage operator/=(const Tuple &);
553  Stage operator/=(const FuncRef &);
554  // @}
556  /* Override the usual assignment operator, so that
557  * f(x, y) = g(x, y) defines f.
558  */
559  Stage operator=(const FuncRef &);
561  /** Use this as a call to the function, and not the left-hand-side
562  * of a definition. Only works for single-output Funcs. */
563  operator Expr() const;
565  /** When a FuncRef refers to a function that provides multiple
566  * outputs, you can access each output as an Expr using
567  * operator[].
568  */
569  FuncTupleElementRef operator[](int) const;
571  /** How many outputs does the function this refers to produce. */
572  size_t size() const;
574  /** What function is this calling? */
575  Internal::Function function() const {
576  return func;
577  }
578 };
580 /** Explicit overloads of min and max for FuncRef. These exist to
581  * disambiguate calls to min on FuncRefs when a user has pulled both
582  * Halide::min and std::min into their namespace. */
583 // @{
584 inline Expr min(const FuncRef &a, const FuncRef &b) {
585  return min(Expr(a), Expr(b));
586 }
587 inline Expr max(const FuncRef &a, const FuncRef &b) {
588  return max(Expr(a), Expr(b));
589 }
590 // @}
592 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
593  * z are Vars or Exprs. It could be the left hand side of an update
594  * definition, or it could be a call to a function. We don't know
595  * until we see how this object gets used.
596  */
598  FuncRef func_ref;
599  std::vector<Expr> args; // args to the function
600  int idx; // Index to function outputs
602  /** Helper function that generates a Tuple where element at 'idx' is set
603  * to 'e' and the rest are undef. */
604  Tuple values_with_undefs(const Expr &e) const;
606 public:
607  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
609  /** Use this as the left-hand-side of an update definition of Tuple
610  * component 'idx' of a Func (see \ref RDom). The function must
611  * already have an initial definition.
612  */
613  Stage operator=(const Expr &e);
615  /** Define a stage that adds the given expression to Tuple component 'idx'
616  * of this Func. The other Tuple components are unchanged. If the expression
617  * refers to some RDom, this performs a sum reduction of the expression over
618  * the domain. The function must already have an initial definition.
619  */
620  Stage operator+=(const Expr &e);
622  /** Define a stage that adds the negative of the given expression to Tuple
623  * component 'idx' of this Func. The other Tuple components are unchanged.
624  * If the expression refers to some RDom, this performs a sum reduction of
625  * the negative of the expression over the domain. The function must already
626  * have an initial definition.
627  */
628  Stage operator-=(const Expr &e);
630  /** Define a stage that multiplies Tuple component 'idx' of this Func by
631  * the given expression. The other Tuple components are unchanged. If the
632  * expression refers to some RDom, this performs a product reduction of
633  * the expression over the domain. The function must already have an
634  * initial definition.
635  */
636  Stage operator*=(const Expr &e);
638  /** Define a stage that divides Tuple component 'idx' of this Func by
639  * the given expression. The other Tuple components are unchanged.
640  * If the expression refers to some RDom, this performs a product
641  * reduction of the inverse of the expression over the domain. The function
642  * must already have an initial definition.
643  */
644  Stage operator/=(const Expr &e);
646  /* Override the usual assignment operator, so that
647  * f(x, y)[index] = g(x, y) defines f.
648  */
649  Stage operator=(const FuncRef &e);
651  /** Use this as a call to Tuple component 'idx' of a Func, and not the
652  * left-hand-side of a definition. */
653  operator Expr() const;
655  /** What function is this calling? */
656  Internal::Function function() const {
657  return func_ref.function();
658  }
660  /** Return index to the function outputs. */
661  int index() const {
662  return idx;
663  }
664 };
666 namespace Internal {
667 class IRMutator;
668 } // namespace Internal
670 /** Helper class for identifying purpose of an Expr passed to memoize.
671  */
672 class EvictionKey {
673 protected:
 // NOTE(review): the declaration of the `key` member initialised by the
 // constructor below (listing line 674) is not present in this listing.
 /** Func is a friend, so it can reach the non-public members of this class. */
675  friend class Func;
677 public:
 /** Wrap `expr` as an eviction key by storing it in `key`. The constructor
  * is explicit, so an Expr is never converted to an EvictionKey silently;
  * when no argument is given a default-constructed Expr is stored. */
678  explicit EvictionKey(const Expr &expr = Expr())
679  : key(expr) {
680  }
681 };
683 /** A halide function. This class represents one stage in a Halide
684  * pipeline, and is the unit by which we schedule things. By default
685  * they are aggressively inlined, so you are encouraged to make lots
686  * of little functions, rather than storing things in Exprs. */
687 class Func {
689  /** A handle on the internal halide function that this
690  * represents */
691  Internal::Function func;
693  /** When you make a reference to this function with fewer
694  * arguments than it has dimensions, the argument list is bulked
695  * up with 'implicit' vars with canonical names. This lets you
696  * pass around partially applied Halide functions. */
697  // @{
698  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
699  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
700  // @}
702  /** The imaging pipeline that outputs this Func alone. */
703  Pipeline pipeline_;
705  /** Get the imaging pipeline that outputs this Func alone,
706  * creating it (and freezing the Func) if necessary. */
707  Pipeline pipeline();
709  // Helper function for recursive reordering support
710  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
712  void invalidate_cache();
714 public:
715  /** Declare a new undefined function with the given name */
716  explicit Func(const std::string &name);
718  /** Declare a new undefined function with the given name.
719  * The function will be constrained to represent Exprs of required_type.
720  * If required_dims is not AnyDims, the function will be constrained to exactly
721  * that many dimensions. */
722  explicit Func(const Type &required_type, int required_dims, const std::string &name);
724  /** Declare a new undefined function with the given name.
725  * If required_types is not empty, the function will be constrained to represent
726  * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
727  * If required_dims is not AnyDims, the function will be constrained to exactly
728  * that many dimensions. */
729  explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
731  /** Declare a new undefined function with an
732  * automatically-generated unique name */
733  Func();
735  /** Declare a new function with an automatically-generated unique
736  * name, and define it to return the given expression (which may
737  * not contain free variables). */
738  explicit Func(const Expr &e);
740  /** Construct a new Func to wrap an existing, already-defined
741  * Function object. */
742  explicit Func(Internal::Function f);
744  /** Construct a new Func to wrap a Buffer. */
745  template<typename T, int Dims>
747  : Func() {
748  (*this)(_) = im(_);
749  }
751  /** Evaluate this function over some rectangular domain and return
752  * the resulting buffer or buffers. Performs compilation if the
753  * Func has not previously been realized and compile_jit has not
754  * been called. If the final stage of the pipeline is on the GPU,
755  * data is copied back to the host before being returned. The
756  * returned Realization should probably be instantly converted to
757  * a Buffer class of the appropriate type. That is, do this:
758  *
759  \code
760  f(x) = sin(x);
761  Buffer<float> im = f.realize(...);
762  \endcode
763  *
764  * If your Func has multiple values, because you defined it using
765  * a Tuple, then casting the result of a realize call to a buffer
766  * or image will produce a run-time error. Instead you should do the
767  * following:
768  *
769  \code
770  f(x) = Tuple(x, sin(x));
771  Realization r = f.realize(...);
772  Buffer<int> im0 = r[0];
773  Buffer<float> im1 = r[1];
774  \endcode
775  *
776  * In Halide formal arguments of a computation are specified using
777  * Param<T> and ImageParam objects in the expressions defining the
778  * computation. The param_map argument to realize allows
779  * specifying a set of per-call parameters to be used for a
780  * specific computation. This method is thread-safe where the
781  * globals used by Param<T> and ImageParam are not. Any parameters
782  * that are not in the param_map are taken from the global values,
783  * so those can continue to be used if they are not changing
784  * per-thread.
785  *
786  * One can explicitly construct a ParamMap and
787  * use its set method to insert Parameter to scalar or Buffer
788  * value mappings. (NOTE: ParamMap is deprecated in Halide 16 and
789  * will be removed in Halide 17. Callers requiring threadsafe JIT
790  * calls should migrate to use compile_to_callable() instead.)
791  *
792  \code
793  Param<int32_t> p(42);
794  ImageParam img(Int(32), 1);
795  f(x) = img(x) + p;
797  Buffer<int32_t> arg_img(10, 10);
798  <fill in arg_img...>
799  ParamMap params;
800  params.set(p, 17);
801  params.set(img, arg_img);
803  Target t = get_jit_target_from_environment();
804  Buffer<int32_t> result = f.realize({10, 10}, t, params);
805  \endcode
806  *
807  * Alternatively, an initializer list can be used
808  * directly in the realize call to pass this information:
809  *
810  \code
811  Param<int32_t> p(42);
812  ImageParam img(Int(32), 1);
813  f(x) = img(x) + p;
815  Buffer<int32_t> arg_img(10, 10);
816  <fill in arg_img...>
818  Target t = get_jit_target_from_environment();
819  Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
820  \endcode
821  *
822  * If the Func cannot be realized into a buffer of the given size
823  * due to scheduling constraints on scattering update definitions,
824  * it will be realized into a larger buffer of the minimum size
825  * possible, and a cropped view at the requested size will be
826  * returned. It is thus not safe to assume the returned buffers
827  * are contiguous in memory. This behavior can be disabled with
828  * the NoBoundsQuery target flag, in which case an error about
829  * writing out of bounds on the output buffer will trigger
830  * instead.
831  *
832  */
833  Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
834  const ParamMap &param_map = ParamMap::empty_map());
836  /** Same as above, but takes a custom user-provided context to be
837  * passed to runtime functions. This can be used to pass state to
838  * runtime overrides in a thread-safe manner. A nullptr context is
839  * legal, and is equivalent to calling the variant of realize
840  * that does not take a context. */
841  Realization realize(JITUserContext *context,
842  std::vector<int32_t> sizes = {},
843  const Target &target = Target(),
844  const ParamMap &param_map = ParamMap::empty_map());
846  /** Evaluate this function into an existing allocated buffer or
847  * buffers. If the buffer is also one of the arguments to the
848  * function, strange things may happen, as the pipeline isn't
849  * necessarily safe to run in-place. If you pass multiple buffers,
850  * they must have matching sizes. This form of realize does *not*
851  * automatically copy data back from the GPU. */
852  void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
853  const ParamMap &param_map = ParamMap::empty_map());
855  /** Same as above, but takes a custom user-provided context to be
856  * passed to runtime functions. This can be used to pass state to
857  * runtime overrides in a thread-safe manner. A nullptr context is
858  * legal, and is equivalent to calling the variant of realize
859  * that does not take a context. */
860  void realize(JITUserContext *context,
861  Pipeline::RealizationArg outputs,
862  const Target &target = Target(),
863  const ParamMap &param_map = ParamMap::empty_map());
865  /** For a given size of output, or a given output buffer,
866  * determine the bounds required of all unbound ImageParams
867  * referenced. Communicates the result by allocating new buffers
868  * of the appropriate size and binding them to the unbound
869  * ImageParams.
870  *
871  * See the documentation for Func::realize regarding the
872  * ParamMap. There is one difference in that input Buffer<>
873  * arguments that are being inferred are specified as a pointer to
874  * the Buffer<> in the ParamMap. E.g.
875  *
876  \code
877  Param<int32_t> p(42);
878  ImageParam img(Int(32), 1);
879  f(x) = img(x) + p;
881  Target t = get_jit_target_from_environment();
882  Buffer<> in;
883  f.infer_input_bounds({10, 10}, t, { { img, &in } });
884  \endcode
885  * On return, in will be an allocated buffer of the correct size
886  * to evaluate f over a 10x10 region.
887  */
888  // @{
889  void infer_input_bounds(const std::vector<int32_t> &sizes,
890  const Target &target = get_jit_target_from_environment(),
891  const ParamMap &param_map = ParamMap::empty_map());
892  void infer_input_bounds(Pipeline::RealizationArg outputs,
893  const Target &target = get_jit_target_from_environment(),
894  const ParamMap &param_map = ParamMap::empty_map());
895  // @}
897  /** Versions of infer_input_bounds that take a custom user context
898  * to pass to runtime functions. */
899  // @{
900  void infer_input_bounds(JITUserContext *context,
901  const std::vector<int32_t> &sizes,
902  const Target &target = get_jit_target_from_environment(),
903  const ParamMap &param_map = ParamMap::empty_map());
904  void infer_input_bounds(JITUserContext *context,
905  Pipeline::RealizationArg outputs,
906  const Target &target = get_jit_target_from_environment(),
907  const ParamMap &param_map = ParamMap::empty_map());
908  // @}
909  /** Statically compile this function to llvm bitcode, with the
910  * given filename (which should probably end in .bc), type
911  * signature, and C function name (which defaults to the same name
912  * as this halide function) */
913  //@{
914  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
915  const Target &target = get_target_from_environment());
916  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
917  const Target &target = get_target_from_environment());
918  // @}
920  /** Statically compile this function to llvm assembly, with the
921  * given filename (which should probably end in .ll), type
922  * signature, and C function name (which defaults to the same name
923  * as this halide function) */
924  //@{
925  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
926  const Target &target = get_target_from_environment());
927  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
928  const Target &target = get_target_from_environment());
929  // @}
931  /** Statically compile this function to an object file, with the
932  * given filename (which should probably end in .o or .obj), type
933  * signature, and C function name (which defaults to the same name
934  * as this halide function). You probably don't want to use this
935  * directly; call compile_to_static_library or compile_to_file instead. */
936  //@{
937  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
938  const Target &target = get_target_from_environment());
939  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
940  const Target &target = get_target_from_environment());
941  // @}
943  /** Emit a header file with the given filename for this
944  * function. The header will define a function with the type
945  * signature given by the second argument, and a name given by the
946  * third. The name defaults to the same name as this halide
947  * function. You don't actually have to have defined this function
948  * yet to call this. You probably don't want to use this directly;
949  * call compile_to_static_library or compile_to_file instead. */
950  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
951  const Target &target = get_target_from_environment());
953  /** Statically compile this function to text assembly equivalent
954  * to the object file generated by compile_to_object. This is
955  * useful for checking what Halide is producing without having to
956  * disassemble anything, or if you need to feed the assembly into
957  * some custom toolchain to produce an object file (e.g. iOS) */
958  //@{
959  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
960  const Target &target = get_target_from_environment());
961  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
962  const Target &target = get_target_from_environment());
963  // @}
965  /** Statically compile this function to C source code. This is
966  * useful for providing fallback code paths that will compile on
967  * many platforms. Vectorization will fail, and parallelization
968  * will produce serial code. */
969  void compile_to_c(const std::string &filename,
970  const std::vector<Argument> &,
971  const std::string &fn_name = "",
972  const Target &target = get_target_from_environment());
974  /** Write out an internal representation of lowered code. Useful
975  * for analyzing and debugging scheduling. Can emit html or plain
976  * text. */
977  void compile_to_lowered_stmt(const std::string &filename,
978  const std::vector<Argument> &args,
979  StmtOutputFormat fmt = Text,
980  const Target &target = get_target_from_environment());
982  /** Write out the loop nests specified by the schedule for this
983  * Function. Helpful for understanding what a schedule is
984  * doing. */
985  void print_loop_nest();
987  /** Compile to object file and header pair, with the given
988  * arguments. The name defaults to the same name as this halide
989  * function.
990  */
991  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
992  const std::string &fn_name = "",
993  const Target &target = get_target_from_environment());
995  /** Compile to static-library file and header pair, with the given
996  * arguments. The name defaults to the same name as this halide
997  * function.
998  */
999  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
1000  const std::string &fn_name = "",
1001  const Target &target = get_target_from_environment());
1003  /** Compile to static-library file and header pair once for each target;
1004  * each resulting function will be considered (in order) via halide_can_use_target_features()
1005  * at runtime, with the first appropriate match being selected for subsequent use.
1006  * This is typically useful for specializations that may vary unpredictably by machine
1007  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
1008  * All targets must have identical arch-os-bits.
1009  */
1010  void compile_to_multitarget_static_library(const std::string &filename_prefix,
1011  const std::vector<Argument> &args,
1012  const std::vector<Target> &targets);
1014  /** Like compile_to_multitarget_static_library(), except that the object files
1015  * are all output as object files (rather than bundled into a static library).
1016  *
1017  * `suffixes` is an optional list of strings to use as the suffix for each object
1018  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1019  * will be used for each suffix.)
1020  *
1021  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1022  * will be generated with the filename `${filename_prefix}_wrapper.o`
1023  *
1024  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1025  * will be generated with the filename `${filename_prefix}_runtime.o`
1026  */
1027  void compile_to_multitarget_object_files(const std::string &filename_prefix,
1028  const std::vector<Argument> &args,
1029  const std::vector<Target> &targets,
1030  const std::vector<std::string> &suffixes);
1032  /** Store an internal representation of lowered code as a self
1033  * contained Module suitable for further compilation. */
1034  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1035  const Target &target = get_target_from_environment());
1037  /** Compile and generate multiple target files with single call.
1038  * Deduces target files based on filenames specified in
1039  * output_files map.
1040  */
1041  void compile_to(const std::map<OutputFileType, std::string> &output_files,
1042  const std::vector<Argument> &args,
1043  const std::string &fn_name,
1044  const Target &target = get_target_from_environment());
1046  /** Eagerly jit compile the function to machine code. This
1047  * normally happens on the first call to realize. If you're
1048  * running your halide pipeline inside time-sensitive code and
1049  * wish to avoid including the time taken to compile a pipeline,
1050  * then you can call this ahead of time. Default is to use the Target
1051  * returned from Halide::get_jit_target_from_environment()
1052  */
1053  void compile_jit(const Target &target = get_jit_target_from_environment());
1055  /** Get a struct containing the currently set custom functions
1056  * used by JIT. This can be mutated. Changes will take effect the
1057  * next time this Func is realized. */
1058  JITHandlers &jit_handlers();
1060  /** Eagerly jit compile the function to machine code and return a callable
1061  * struct that behaves like a function pointer. The calling convention
1062  * will exactly match that of an AOT-compiled version of this Func
1063  * with the same Argument list.
1064  */
1065  Callable compile_to_callable(const std::vector<Argument> &args,
1066  const Target &target = get_jit_target_from_environment());
1068  /** Add a custom pass to be used during lowering. It is run after
1069  * all other lowering passes. Can be used to verify properties of
1070  * the lowered Stmt, instrument it with extra code, or otherwise
1071  * modify it. The Func takes ownership of the pass, and will call
1072  * delete on it when the Func goes out of scope. So don't pass a
1073  * stack object, or share pass instances between multiple
1074  * Funcs. */
1075  template<typename T>
1077  // Template instantiate a custom deleter for this type, then
1078  // wrap in a lambda. The custom deleter lives in user code, so
1079  // that deletion is on the same heap as construction (I hate Windows).
1080  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1081  }
1083  /** Add a custom pass to be used during lowering, with the
1084  * function that will be called to delete it also passed in. Set
1085  * it to nullptr if you wish to retain ownership of the object. */
1086  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1088  /** Remove all previously-set custom lowering passes */
1091  /** Get the custom lowering passes. */
1092  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1094  /** When this function is compiled, include code that dumps its
1095  * values to a file after it is realized, for the purpose of
1096  * debugging.
1097  *
1098  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1099  * is in TIFF format and can be read by standard tools. Otherwise, the
1100  * file format is as follows:
1101  *
1102  * All data is in the byte-order of the target platform. First, a
1103  * 20-byte header containing four 32-bit ints, giving the extents
1104  * of the first four dimensions. Dimensions beyond four are
1105  * folded into the fourth. Then, a fifth 32-bit int giving the
1106  * data type of the function. The typecodes are given by: float =
1107  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1108  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1109  * data follows the header, as a densely packed array of the given
1110  * size and the given type. If given the extension .tmp, this file
1111  * format can be natively read by the program ImageStack. */
1112  void debug_to_file(const std::string &filename);
1114  /** The name of this function, either given during construction,
1115  * or automatically generated. */
1116  const std::string &name() const;
1118  /** Get the pure arguments. */
1119  std::vector<Var> args() const;
1121  /** The right-hand-side value of the pure definition of this
1122  * function. Causes an error if there's no pure definition, or if
1123  * the function is defined to return multiple values. */
1124  Expr value() const;
1126  /** The values returned by this function. An error if the function
1127  * has not been defined. Returns a Tuple with one element for
1128  * functions defined to return a single value. */
1129  Tuple values() const;
1131  /** Does this function have at least a pure definition. */
1132  bool defined() const;
1134  /** Get the left-hand-side of the update definition. An empty
1135  * vector if there's no update definition. If there are
1136  * multiple update definitions for this function, use the
1137  * argument to select which one you want. */
1138  const std::vector<Expr> &update_args(int idx = 0) const;
1140  /** Get the right-hand-side of an update definition. An error if
1141  * there's no update definition. If there are multiple
1142  * update definitions for this function, use the argument to
1143  * select which one you want. */
1144  Expr update_value(int idx = 0) const;
1146  /** Get the right-hand-side of an update definition for
1147  * functions that return multiple values. An error if there's no
1148  * update definition. Returns a Tuple with one element for
1149  * functions that return a single value. */
1150  Tuple update_values(int idx = 0) const;
1152  /** Get the RVars of the reduction domain for an update definition, if there is
1153  * one. */
1154  std::vector<RVar> rvars(int idx = 0) const;
1156  /** Does this function have at least one update definition? */
1157  bool has_update_definition() const;
1159  /** How many update definitions does this function have? */
1160  int num_update_definitions() const;
1162  /** Is this function an external stage? That is, was it defined
1163  * using define_extern? */
1164  bool is_extern() const;
1166  /** Add an extern definition for this Func. This lets you define a
1167  * Func that represents an external pipeline stage. You can, for
1168  * example, use it to wrap a call to an extern library such as
1169  * fftw. */
1170  // @{
1171  void define_extern(const std::string &function_name,
1172  const std::vector<ExternFuncArgument> &params, Type t,
1173  int dimensionality,
1175  DeviceAPI device_api = DeviceAPI::Host) {
1176  define_extern(function_name, params, t,
1177  Internal::make_argument_list(dimensionality), mangling,
1178  device_api);
1179  }
1181  void define_extern(const std::string &function_name,
1182  const std::vector<ExternFuncArgument> &params,
1183  const std::vector<Type> &types, int dimensionality,
1184  NameMangling mangling) {
 // Multi-type overload with an explicit mangling and no device_api parameter:
 // expands `dimensionality` via Internal::make_argument_list() and forwards
 // to another define_extern overload without passing a device API.
1185  define_extern(function_name, params, types,
1186  Internal::make_argument_list(dimensionality), mangling);
1187  }
1189  void define_extern(const std::string &function_name,
1190  const std::vector<ExternFuncArgument> &params,
1191  const std::vector<Type> &types, int dimensionality,
1193  DeviceAPI device_api = DeviceAPI::Host) {
1194  define_extern(function_name, params, types,
1195  Internal::make_argument_list(dimensionality), mangling,
1196  device_api);
1197  }
1199  void define_extern(const std::string &function_name,
1200  const std::vector<ExternFuncArgument> &params, Type t,
1201  const std::vector<Var> &arguments,
1203  DeviceAPI device_api = DeviceAPI::Host) {
1204  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1205  mangling, device_api);
1206  }
1208  void define_extern(const std::string &function_name,
1209  const std::vector<ExternFuncArgument> &params,
1210  const std::vector<Type> &types,
1211  const std::vector<Var> &arguments,
1213  DeviceAPI device_api = DeviceAPI::Host);
1214  // @}
1216  /** Get the type(s) of the outputs of this Func.
1217  *
1218  * It is not legal to call type() unless the Func has non-Tuple elements.
1219  *
1220  * If the Func isn't yet defined, and was not specified with required types,
1221  * a runtime error will occur.
1222  *
1223  * If the Func isn't yet defined, but *was* specified with required types,
1224  * the requirements will be returned. */
1225  // @{
1226  const Type &type() const;
1227  const std::vector<Type> &types() const;
1228  // @}
1230  /** Get the number of outputs of this Func. Corresponds to the
1231  * size of the Tuple this Func was defined to return.
1232  * If the Func isn't yet defined, but was specified with required types,
1233  * the number of outputs specified in the requirements will be returned. */
1234  int outputs() const;
1236  /** Get the name of the extern function called for an extern
1237  * definition. */
1238  const std::string &extern_function_name() const;
1240  /** The dimensionality (number of arguments) of this function.
1241  * If the Func isn't yet defined, but was specified with required dimensionality,
1242  * the dimensionality specified in the requirements will be returned. */
1243  int dimensions() const;
1245  /** Construct either the left-hand-side of a definition, or a call
1246  * to a function that happens to only contain vars as
1247  * arguments. If the function has already been defined, and fewer
1248  * arguments are given than the function has dimensions, then
1249  * enough implicit vars are added to the end of the argument list
1250  * to make up the difference (see \ref Var::implicit) */
1251  // @{
1252  FuncRef operator()(std::vector<Var>) const;
1254  template<typename... Args>
1256  operator()(Args &&...args) const {
1257  std::vector<Var> collected_args{std::forward<Args>(args)...};
1258  return this->operator()(collected_args);
1259  }
1260  // @}
1262  /** Either calls to the function, or the left-hand-side of
1263  * an update definition (see \ref RDom). If the function has
1264  * already been defined, and fewer arguments are given than the
1265  * function has dimensions, then enough implicit vars are added to
1266  * the end of the argument list to make up the difference. (see
1267  * \ref Var::implicit)*/
1268  // @{
1269  FuncRef operator()(std::vector<Expr>) const;
1271  template<typename... Args>
1273  operator()(const Expr &x, Args &&...args) const {
1274  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1275  return (*this)(collected_args);
1276  }
1277  // @}
1279  /** Creates and returns a new identity Func that wraps this Func. During
1280  * compilation, Halide replaces all calls to this Func done by 'f'
1281  * with calls to the wrapper. If this Func is already wrapped for
1282  * use in 'f', will return the existing wrapper.
1283  *
1284  * For example, g.in(f) would rewrite a pipeline like this:
1285  \code
1286  g(x, y) = ...
1287  f(x, y) = ... g(x, y) ...
1288  \endcode
1289  * into a pipeline like this:
1290  \code
1291  g(x, y) = ...
1292  g_wrap(x, y) = g(x, y)
1293  f(x, y) = ... g_wrap(x, y)
1294  \endcode
1295  *
1296  * This has a variety of uses. You can use it to schedule this
1297  * Func differently in the different places it is used:
1298  \code
1299  g(x, y) = ...
1300  f1(x, y) = ... g(x, y) ...
1301  f2(x, y) = ... g(x, y) ...
1302  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1303  g.in(f2).compute_at(f2, x).unroll(x);
1304  \endcode
1305  *
1306  * You can also use it to stage loads from this Func via some
1307  * intermediate buffer (perhaps on the stack as in
1308  * test/performance/block_transpose.cpp, or in shared GPU memory
1309  * as in test/performance/wrap.cpp). In this case we compute the
1310  * wrapper at tiles of the consuming Funcs like so:
1311  \code
1312  g.compute_root()...
1313  g.in(f).compute_at(f, tiles)...
1314  \endcode
1315  *
1316  * Func::in() can also be used to compute pieces of a Func into a
1317  * smaller scratch buffer (perhaps on the GPU) and then copy them
1318  * into a larger output buffer one tile at a time. See
1319  * apps/interpolate/interpolate.cpp for an example of this. In
1320  * this case we compute the Func at tiles of its own wrapper:
1321  \code
1322  f.in(g).compute_root().gpu_tile(...)...
1323  f.compute_at(f.in(g), tiles)...
1324  \endcode
1325  *
1326  * A similar use of Func::in() is wrapping Funcs with multiple update
1327  * stages in a pure wrapper. The following code:
1328  \code
1329  f(x, y) = x + y;
1330  f(x, y) += 5;
1331  g(x, y) = f(x, y);
1332  f.compute_root();
1333  \endcode
1334  *
1335  * Is equivalent to:
1336  \code
1337  for y:
1338  for x:
1339  f(x, y) = x + y;
1340  for y:
1341  for x:
1342  f(x, y) += 5
1343  for y:
1344  for x:
1345  g(x, y) = f(x, y)
1346  \endcode
1347  * using Func::in(), we can write:
1348  \code
1349  f(x, y) = x + y;
1350  f(x, y) += 5;
1351  g(x, y) = f(x, y);
1352  f.in(g).compute_root();
1353  \endcode
1354  * which instead produces:
1355  \code
1356  for y:
1357  for x:
1358  f(x, y) = x + y;
1359  f(x, y) += 5
1360  f_wrap(x, y) = f(x, y)
1361  for y:
1362  for x:
1363  g(x, y) = f_wrap(x, y)
1364  \endcode
1365  */
1366  Func in(const Func &f);
1368  /** Create and return an identity wrapper shared by all the Funcs in
1369  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1370  * this will throw an error. */
1371  Func in(const std::vector<Func> &fs);
1373  /** Create and return a global identity wrapper, which wraps all calls to
1374  * this Func by any other Func. If a global wrapper already exists,
1375  * returns it. The global identity wrapper is only used by callers for
1376  * which no custom wrapper has been specified.
1377  */
1378  Func in();
1380  /** Similar to \ref Func::in; however, instead of replacing the call to
1381  * this Func with an identity Func that refers to it, this replaces the
1382  * call with a clone of this Func.
1383  *
1384  * For example, f.clone_in(g) would rewrite a pipeline like this:
1385  \code
1386  f(x, y) = x + y;
1387  g(x, y) = f(x, y) + 2;
1388  h(x, y) = f(x, y) - 3;
1389  \endcode
1390  * into a pipeline like this:
1391  \code
1392  f(x, y) = x + y;
1393  f_clone(x, y) = x + y;
1394  g(x, y) = f_clone(x, y) + 2;
1395  h(x, y) = f(x, y) - 3;
1396  \endcode
1397  *
1398  */
1399  //@{
1400  Func clone_in(const Func &f);
1401  Func clone_in(const std::vector<Func> &fs);
1402  //@}
1404  /** Declare that this function should be implemented by a call to
1405  * halide_buffer_copy with the given target device API. Asserts
1406  * that the Func has a pure definition which is a simple call to a
1407  * single input, and no update definitions. The wrapper Funcs
1408  * returned by in() are suitable candidates. Consumes all pure
1409  * variables, and rewrites the Func to have an extern definition
1410  * that calls halide_buffer_copy. */
1413  /** Declare that this function should be implemented by a call to
1414  * halide_buffer_copy with a NULL target device API. Equivalent to
1415  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1416  * pure definition which is a simple call to a single input, and
1417  * no update definitions. The wrapper Funcs returned by in() are
1418  * suitable candidates. Consumes all pure variables, and rewrites
1419  * the Func to have an extern definition that calls
1420  * halide_buffer_copy.
1421  *
1422  * Note that if the source Func is already valid in host memory,
1423  * this compiles to code that does the minimum number of calls to
1424  * memcpy.
1425  */
1426  Func copy_to_host();
1428  /** Split a dimension into inner and outer subdimensions with the
1429  * given names, where the inner dimension iterates from 0 to
1430  * factor-1. The inner and outer subdimensions can then be dealt
1431  * with using the other scheduling calls. It's ok to reuse the old
1432  * variable name as either the inner or outer variable. The final
1433  * argument specifies how the tail should be handled if the split
1434  * factor does not provably divide the extent. */
1435  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1437  /** Join two dimensions into a single fused dimension. The fused
1438  * dimension covers the product of the extents of the inner and
1439  * outer dimensions given. */
1440  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1442  /** Mark a dimension to be traversed serially. This is the default. */
1443  Func &serial(const VarOrRVar &var);
1445  /** Mark a dimension to be traversed in parallel */
1446  Func &parallel(const VarOrRVar &var);
1448  /** Split a dimension by the given task_size, and then parallelize the
1449  * outer dimension. This creates parallel tasks that have size
1450  * task_size. After this call, var refers to the outer dimension of
1451  * the split. The inner dimension has a new anonymous name. If you
1452  * wish to mutate it, or schedule with respect to it, do the split
1453  * manually. */
1454  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
1456  /** Mark a dimension to be computed all-at-once as a single
1457  * vector. The dimension should have constant extent -
1458  * e.g. because it is the inner dimension following a split by a
1459  * constant factor. For most uses of vectorize you want the two
1460  * argument form. The variable to be vectorized should be the
1461  * innermost one. */
1462  Func &vectorize(const VarOrRVar &var);
1464  /** Mark a dimension to be completely unrolled. The dimension
1465  * should have constant extent - e.g. because it is the inner
1466  * dimension following a split by a constant factor. For most uses
1467  * of unroll you want the two-argument form. */
1468  Func &unroll(const VarOrRVar &var);
1470  /** Split a dimension by the given factor, then vectorize the
1471  * inner dimension. This is how you vectorize a loop of unknown
1472  * size. The variable to be vectorized should be the innermost
1473  * one. After this call, var refers to the outer dimension of the
1474  * split. 'factor' must be an integer. */
1475  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1477  /** Split a dimension by the given factor, then unroll the inner
1478  * dimension. This is how you unroll a loop of unknown size by
1479  * some constant factor. After this call, var refers to the outer
1480  * dimension of the split. 'factor' must be an integer. */
1481  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1483  /** Statically declare that the range over which a function should
1484  * be evaluated is given by the second and third arguments. This
1485  * can let Halide perform some optimizations. E.g. if you know
1486  * there are going to be 4 color channels, you can completely
1487  * vectorize the color channel dimension without the overhead of
1488  * splitting it up. If bounds inference decides that it requires
1489  * more of this function than the bounds you have stated, a
1490  * runtime error will occur when you try to run your pipeline. */
1491  Func &bound(const Var &var, Expr min, Expr extent);
1493  /** Statically declare the range over which the function will be
1494  * evaluated in the general case. This provides a basis for the auto
1495  * scheduler to make trade-offs and scheduling decisions. The auto
1496  * generated schedules might break when the sizes of the dimensions are
1497  * very different from the estimates specified. These estimates are used
1498  * only by the auto scheduler if the function is a pipeline output. */
1499  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1501  /** Set (min, extent) estimates for all dimensions in the Func
1502  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1503  * repeatedly, but slightly terser. The size of the estimates vector
1504  * must match the dimensionality of the Func. */
1505  Func &set_estimates(const Region &estimates);
1507  /** Expand the region computed so that the min coordinate is
1508  * congruent to 'remainder' modulo 'modulus', and the extent is a
1509  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1510  * the min and extent realized to be even, and calling
1511  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1512  * to be even. The region computed always contains the region that
1513  * would have been computed without this directive, so no
1514  * assertions are injected.
1515  */
1516  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1518  /** Expand the region computed so that the extent is a
1519  * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1520  * the extent realized to be even. The region computed always contains the
1521  * region that would have been computed without this directive, so no
1522  * assertions are injected. (This is essentially equivalent to align_bounds(),
1523  * but always leaving the min untouched.)
1524  */
1525  Func &align_extent(const Var &var, Expr modulus);
1527  /** Bound the extent of a Func's realization, but not its
1528  * min. This means the dimension can be unrolled or vectorized
1529  * even when its min is not fixed (for example because it is
1530  * compute_at tiles of another Func). This can also be useful for
1531  * forcing a function's allocation to be a fixed size, which often
1532  * means it can go on the stack. */
1533  Func &bound_extent(const Var &var, Expr extent);
1535  /** Split two dimensions at once by the given factors, and then
1536  * reorder the resulting dimensions to be xi, yi, xo, yo from
1537  * innermost outwards. This gives a tiled traversal. */
1538  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1539  const VarOrRVar &xo, const VarOrRVar &yo,
1540  const VarOrRVar &xi, const VarOrRVar &yi,
1541  const Expr &xfactor, const Expr &yfactor,
1544  /** A shorter form of tile, which reuses the old variable names as
1545  * the new outer dimensions */
1546  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1547  const VarOrRVar &xi, const VarOrRVar &yi,
1548  const Expr &xfactor, const Expr &yfactor,
1551  /** A more general form of tile, which defines tiles of any dimensionality. */
1552  Func &tile(const std::vector<VarOrRVar> &previous,
1553  const std::vector<VarOrRVar> &outers,
1554  const std::vector<VarOrRVar> &inners,
1555  const std::vector<Expr> &factors,
1556  const std::vector<TailStrategy> &tails);
1558  /** The generalized tile, with a single tail strategy to apply to all vars. */
1559  Func &tile(const std::vector<VarOrRVar> &previous,
1560  const std::vector<VarOrRVar> &outers,
1561  const std::vector<VarOrRVar> &inners,
1562  const std::vector<Expr> &factors,
1565  /** Generalized tiling, reusing the previous names as the outer names. */
1566  Func &tile(const std::vector<VarOrRVar> &previous,
1567  const std::vector<VarOrRVar> &inners,
1568  const std::vector<Expr> &factors,
1571  /** Reorder variables to have the given nesting order, from
1572  * innermost out */
1573  Func &reorder(const std::vector<VarOrRVar> &vars);
1575  template<typename... Args>
1577  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1578  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};  // gather x, y and any extra arguments into one vector
1579  return reorder(collected_args);  // delegate to the std::vector<VarOrRVar> overload
1580  }
1582  /** Rename a dimension. Equivalent to split with an inner size of one. */
1583  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1585  /** Specify that race conditions are permitted for this Func,
1586  * which enables parallelizing over RVars even when Halide cannot
1587  * prove that it is safe to do so. Use this with great caution,
1588  * and only if you can prove to yourself that this is safe, as it
1589  * may result in a non-deterministic routine that returns
1590  * different values at different times or on different machines. */
1593  /** Issue atomic updates for this Func. This allows parallelization
1594  * on associative RVars. The function throws a compile error when
1595  * Halide fails to prove associativity. Use override_associativity_test
1596  * to disable the associativity test if you believe the function is
1597  * associative or the order of reduction variable execution does not
1598  * matter.
1599  * Halide compiles this into hardware atomic operations whenever possible,
1600  * and falls back to a mutex lock per storage element if it is impossible
1601  * to atomically update.
1602  * There are three possible outcomes of the compiled code:
1603  * atomic add, compare-and-swap loop, and mutex lock.
1604  * For example:
1605  *
1606  * hist(x) = 0;
1607  * hist(im(r)) += 1;
1608  * hist.compute_root();
1609  * hist.update().atomic().parallel();
1610  *
1611  * will be compiled to atomic add operations.
1612  *
1613  * hist(x) = 0;
1614  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1615  * hist.compute_root();
1616  * hist.update().atomic().parallel();
1617  *
1618  * will be compiled to compare-and-swap loops.
1619  *
1620  * arg_max() = {0, im(0)};
1621  * Expr old_index = arg_max()[0];
1622  * Expr old_max = arg_max()[1];
1623  * Expr new_index = select(old_max < im(r), r, old_index);
1624  * Expr new_max = max(im(r), old_max);
1625  * arg_max() = {new_index, new_max};
1626  * arg_max.compute_root();
1627  * arg_max.update().atomic().parallel();
1628  *
1629  * will be compiled to updates guarded by a mutex lock,
1630  * since it is impossible to atomically update two different locations.
1631  *
1632  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1633  * Compiling to other backends results in a compile error.
1634  * If an operation is compiled into a mutex lock, and is vectorized or is
1635  * compiled to CUDA or OpenCL, it also results in a compile error,
1636  * since per-element mutex lock on vectorized operation leads to a
1637  * deadlock.
1638  * Vectorization of predicated RVars (through rdom.where()) on CPU
1639  * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1640  * 8-bit and 16-bit atomics on GPU are also not supported. */
1641  Func &atomic(bool override_associativity_test = false);
1643  /** Specialize a Func. This creates a special-case version of the
1644  * Func where the given condition is true. The most effective
1645  * conditions are those of the form param == value, and boolean
1646  * Params. Consider a simple example:
1647  \code
1648  f(x) = x + select(cond, 0, 1);
1649  f.compute_root();
1650  \endcode
1651  * This is equivalent to:
1652  \code
1653  for (int x = 0; x < width; x++) {
1654  f[x] = x + (cond ? 0 : 1);
1655  }
1656  \endcode
1657  * Adding the scheduling directive:
1658  \code
1659  f.specialize(cond)
1660  \endcode
1661  * makes it equivalent to:
1662  \code
1663  if (cond) {
1664  for (int x = 0; x < width; x++) {
1665  f[x] = x;
1666  }
1667  } else {
1668  for (int x = 0; x < width; x++) {
1669  f[x] = x + 1;
1670  }
1671  }
1672  \endcode
1673  * Note that the inner loops have been simplified. In the first
1674  * path Halide knows that cond is true, and in the second path
1675  * Halide knows that it is false.
1676  *
1677  * The specialized version gets its own schedule, which inherits
1678  * every directive made about the parent Func's schedule so far
1679  * except for its specializations. This method returns a handle to
1680  * the new schedule. If you wish to retrieve the specialized
1681  * sub-schedule again later, you can call this method with the
1682  * same condition. Consider the following example of scheduling
1683  * the specialized version:
1684  *
1685  \code
1686  f(x) = x;
1687  f.compute_root();
1688  f.specialize(width > 1).unroll(x, 2);
1689  \endcode
1690  * Assuming for simplicity that width is even, this is equivalent to:
1691  \code
1692  if (width > 1) {
1693  for (int x = 0; x < width/2; x++) {
1694  f[2*x] = 2*x;
1695  f[2*x + 1] = 2*x + 1;
1696  }
1697  } else {
1698  for (int x = 0; x < width; x++) {
1699  f[x] = x;
1700  }
1701  }
1702  \endcode
1703  * For this case, it may be better to schedule the un-specialized
1704  * case instead:
1705  \code
1706  f(x) = x;
1707  f.compute_root();
1708  f.specialize(width == 1); // Creates a copy of the schedule so far.
1709  f.unroll(x, 2); // Only applies to the unspecialized case.
1710  \endcode
1711  * This is equivalent to:
1712  \code
1713  if (width == 1) {
1714  f[0] = 0;
1715  } else {
1716  for (int x = 0; x < width/2; x++) {
1717  f[2*x] = 2*x;
1718  f[2*x + 1] = 2*x + 1;
1719  }
1720  }
1721  \endcode
1722  * This can be a good way to write a pipeline that splits,
1723  * vectorizes, or tiles, but can still handle small inputs.
1724  *
1725  * If a Func has several specializations, the first matching one
1726  * will be used, so the order in which you define specializations
1727  * is significant. For example:
1728  *
1729  \code
1730  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1731  f.specialize(cond1);
1732  f.specialize(cond2);
1733  \endcode
1734  * is equivalent to:
1735  \code
1736  if (cond1) {
1737  for (int x = 0; x < width; x++) {
1738  f[x] = x + a - (cond2 ? c : d);
1739  }
1740  } else if (cond2) {
1741  for (int x = 0; x < width; x++) {
1742  f[x] = x + b - c;
1743  }
1744  } else {
1745  for (int x = 0; x < width; x++) {
1746  f[x] = x + b - d;
1747  }
1748  }
1749  \endcode
1750  *
1751  * Specializations may in turn be specialized, which creates a
1752  * nested if statement in the generated code.
1753  *
1754  \code
1755  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1756  f.specialize(cond1).specialize(cond2);
1757  \endcode
1758  * This is equivalent to:
1759  \code
1760  if (cond1) {
1761  if (cond2) {
1762  for (int x = 0; x < width; x++) {
1763  f[x] = x + a - c;
1764  }
1765  } else {
1766  for (int x = 0; x < width; x++) {
1767  f[x] = x + a - d;
1768  }
1769  }
1770  } else {
1771  for (int x = 0; x < width; x++) {
1772  f[x] = x + b - (cond2 ? c : d);
1773  }
1774  }
1775  \endcode
1776  * To create a 4-way if statement that simplifies away all of the
1777  * ternary operators above, you could say:
1778  \code
1779  f.specialize(cond1).specialize(cond2);
1780  f.specialize(cond2);
1781  \endcode
1782  * or
1783  \code
1784  f.specialize(cond1 && cond2);
1785  f.specialize(cond1);
1786  f.specialize(cond2);
1787  \endcode
1788  *
1789  * Any prior Func which is compute_at some variable of this Func
1790  * gets separately included in all paths of the generated if
1791  * statement. The Var in the compute_at call must exist in all
1792  * paths, but it may have been generated via a different path of
1793  * splits, fuses, and renames. This can be used somewhat
1794  * creatively. Consider the following code:
1795  \code
1796  g(x, y) = 8*x;
1797  f(x, y) = g(x, y) + 1;
1798  f.compute_root().specialize(cond);
1799  Var g_loop;
1800  f.specialize(cond).rename(y, g_loop);
1801  f.rename(x, g_loop);
1802  g.compute_at(f, g_loop);
1803  \endcode
1804  * When cond is true, this is equivalent to g.compute_at(f,y).
1805  * When it is false, this is equivalent to g.compute_at(f,x).
1806  */
1807  Stage specialize(const Expr &condition);
1809  /** Add a specialization to a Func that always terminates execution
1810  * with a call to halide_error(). By itself, this is of limited use,
1811  * but can be useful to terminate chains of specialize() calls where
1812  * no "default" case is expected (thus avoiding unnecessary code generation).
1813  *
1814  * For instance, say we want to optimize a pipeline to process images
1815  * in planar and interleaved format; we might typically do something like:
1816  \code
1817  ImageParam im(UInt(8), 3);
1818  Func f = do_something_with(im);
1819  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1820  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1821  \endcode
1822  * This code will vectorize along rows for the planar case, and across pixel
1823  * components for the interleaved case... but there is an implicit "else"
1824  * for the unhandled cases, which generates unoptimized code. If we never
1825  * anticipate passing any other sort of images to this, we can streamline
1826  * our code by adding specialize_fail():
1827  \code
1828  ImageParam im(UInt(8), 3);
1829  Func f = do_something_with(im);
1830  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1831  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1832  f.specialize_fail("Unhandled image format");
1833  \endcode
1834  * Conceptually, this produces code like:
1835  \code
1836  if (im.dim(0).stride() == 1) {
1837  do_something_planar();
1838  } else if (im.dim(2).stride() == 1) {
1839  do_something_interleaved();
1840  } else {
1841  halide_error("Unhandled image format");
1842  }
1843  \endcode
1844  *
1845  * Note that calling specialize_fail() terminates the specialization chain
1846  * for a given Func; you cannot create new specializations for the Func
1847  * afterwards (though you can retrieve handles to previous specializations).
1848  */
1849  void specialize_fail(const std::string &message);
1851  /** Tell Halide that the following dimensions correspond to GPU
1852  * thread indices. This is useful if you compute a producer
1853  * function within the block indices of a consumer function, and
1854  * want to control how that function's dimensions map to GPU
1855  * threads. If the selected target is not an appropriate GPU, this
1856  * just marks those dimensions as parallel. */
1857  // @{
1858  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1859  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1860  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1861  // @}
1863  /** The given dimension corresponds to the lanes in a GPU
1864  * warp. GPU warp lanes are distinguished from GPU threads by the
1865  * fact that all warp lanes run together in lockstep, which
1866  * permits lightweight communication of data from one lane to
1867  * another. */
1868  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1870  /** Tell Halide to run this stage using a single gpu thread and
1871  * block. This is not an efficient use of your GPU, but it can be
1872  * useful to avoid copy-back for intermediate update stages that
1873  * touch a very small part of your Func. */
1876  /** Tell Halide that the following dimensions correspond to GPU
1877  * block indices. This is useful for scheduling stages that will
1878  * run serially within each GPU block. If the selected target is
1879  * not ptx, this just marks those dimensions as parallel. */
1880  // @{
1881  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1882  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1883  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884  // @}
1886  /** Tell Halide that the following dimensions correspond to GPU
1887  * block indices and thread indices. If the selected target is not
1888  * ptx, these just mark the given dimensions as parallel. The
1889  * dimensions are consumed by this call, so do all other
1890  * unrolling, reordering, etc first. */
1891  // @{
1892  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1893  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1894  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1895  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1896  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897  // @}
1899  /** Short-hand for tiling a domain and mapping the tile indices
1900  * to GPU block indices and the coordinates within each tile to
1901  * GPU thread indices. Consumes the variables given, so do all
1902  * other scheduling first. */
1903  // @{
1904  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1906  DeviceAPI device_api = DeviceAPI::Default_GPU);
1908  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1910  DeviceAPI device_api = DeviceAPI::Default_GPU);
1911  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1912  const VarOrRVar &bx, const VarOrRVar &by,
1913  const VarOrRVar &tx, const VarOrRVar &ty,
1914  const Expr &x_size, const Expr &y_size,
1916  DeviceAPI device_api = DeviceAPI::Default_GPU);
1918  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1919  const VarOrRVar &tx, const VarOrRVar &ty,
1920  const Expr &x_size, const Expr &y_size,
1922  DeviceAPI device_api = DeviceAPI::Default_GPU);
1924  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1925  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1926  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1927  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1929  DeviceAPI device_api = DeviceAPI::Default_GPU);
1930  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1931  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1932  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1934  DeviceAPI device_api = DeviceAPI::Default_GPU);
1935  // @}
1937  /** Schedule for execution on Hexagon. When a loop is marked with
1938  * Hexagon, that loop is executed on a Hexagon DSP. */
1939  Func &hexagon(const VarOrRVar &x = Var::outermost());
1941  /** Prefetch data written to or read from a Func or an ImageParam by a
1942  * subsequent loop iteration, at an optionally specified iteration offset. You may specify
1943  * different vars for the location of the prefetch() instruction
1944  * vs. the location that is being prefetched:
1945  *
1946  * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1947  * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1948  * (in conjunction with 'offset')
1949  *
1950  * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
1951  * Note that the value for 'offset' applies only to 'from', not 'at'.
1952  *
1953  * The final argument specifies how prefetch of region outside bounds
1954  * should be handled.
1955  *
1956  * For example, consider this pipeline:
1957  \code
1958  Func f, g, h;
1959  Var x, y, z;
1960  f(x, y) = x + y;
1961  g(x, y) = 2 * f(x, y);
1962  h(x, y) = 3 * f(x, y);
1963  \endcode
1964  *
1965  * The following schedule:
1966  \code
1967  f.compute_root();
1968  g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
1969  h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
1970  \endcode
1971  *
1972  * will inject a prefetch call at the innermost loop of 'g' and 'h' and generate
1973  * the following loop nest:
1974  \code
1975  for y = ...
1976  for x = ...
1977  f(x, y) = x + y
1978  for y = ..
1979  for x = ...
1980  prefetch(&f[x + 2, y], 1, 16);
1981  g(x, y) = 2 * f(x, y)
1982  for y = ..
1983  for x = ...
1984  prefetch(&f[x, y + 2], 1, 16);
1985  h(x, y) = 3 * f(x, y)
1986  \endcode
1987  *
1988  * Note that the 'from' nesting level need not be adjacent to 'at':
1989  \code
1990  Func f, g;
1991  Var x, y, z, w;
1992  f(x, y, z, w) = x + y + z + w;
1993  g(x, y, z, w) = 2 * f(x, y, z, w);
1994  \endcode
1995  *
1996  * The following schedule:
1997  \code
1998  f.compute_root();
1999  g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2000  \endcode
2001  *
2002  * will produce code that prefetches a tile of data:
2003  \code
2004  for w = ...
2005  for z = ...
2006  for y = ...
2007  for x = ...
2008  f(x, y, z, w) = x + y + z + w
2009  for w = ...
2010  for z = ...
2011  for y = ...
2012  for x0 = ...
2013  prefetch(&f[x0, y, z, w + 2], 1, 16);
2014  for x = ...
2015  g(x, y, z, w) = 2 * f(x, y, z, w)
2016  \endcode
2017  *
2018  * Note that calling prefetch() with the same var for both 'at' and 'from'
2019  * is equivalent to calling prefetch() with that var.
2020  */
2021  // @{
2022  Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2024  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2026  template<typename T>
2027  Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2029  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2030  }
2031  // @}
2033  /** Specify how the storage for the function is laid out. These
2034  * calls let you specify the nesting order of the dimensions. For
2035  * example, foo.reorder_storage(y, x) tells Halide to use
2036  * column-major storage for any realizations of foo, without
2037  * changing how you refer to foo in the code. You may want to do
2038  * this if you intend to vectorize across y. When representing
2039  * color images, foo.reorder_storage(c, x, y) specifies packed
2040  * storage (red, green, and blue values adjacent in memory), and
2041  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2042  * red, green, and blue images one after the other in memory).
2043  *
2044  * If you leave out some dimensions, those remain in the same
2045  * positions in the nesting order while the specified variables
2046  * are reordered around them. */
2047  // @{
2048  Func &reorder_storage(const std::vector<Var> &dims);
2050  Func &reorder_storage(const Var &x, const Var &y);
2051  template<typename... Args>
2053  reorder_storage(const Var &x, const Var &y, Args &&...args) {
2054  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};  // gather x, y and any extra arguments into one vector
2055  return reorder_storage(collected_args);  // delegate to the std::vector<Var> overload
2056  }
2057  // @}
2059  /** Pad the storage extent of a particular dimension of
2060  * realizations of this function up to be a multiple of the
2061  * specified alignment. This guarantees that the strides for the
2062  * dimensions stored outside of dim will be multiples of the
2063  * specified alignment, where the strides and alignment are
2064  * measured in numbers of elements.
2065  *
2066  * For example, to guarantee that a function foo(x, y, c)
2067  * representing an image has scanlines starting on offsets
2068  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2069  Func &align_storage(const Var &dim, const Expr &alignment);
2071  /** Store realizations of this function in a circular buffer of a
2072  * given extent. This is more efficient when the extent of the
2073  * circular buffer is a power of 2. If the fold factor is too
2074  * small, or the dimension is not accessed monotonically, the
2075  * pipeline will generate an error at runtime.
2076  *
2077  * The fold_forward option indicates that the new values of the
2078  * producer are accessed by the consumer in a monotonically
2079  * increasing order. Folding storage of producers is also
2080  * supported if the new values are accessed in a monotonically
2081  * decreasing order by setting fold_forward to false.
2082  *
2083  * For example, consider the pipeline:
2084  \code
2085  Func f, g;
2086  Var x, y;
2087  g(x, y) = x*y;
2088  f(x, y) = g(x, y) + g(x, y+1);
2089  \endcode
2090  *
2091  * If we schedule f like so:
2092  *
2093  \code
2094  g.compute_at(f, y).store_root().fold_storage(y, 2);
2095  \endcode
2096  *
2097  * Then g will be computed at each row of f and stored in a buffer
2098  * with an extent in y of 2, alternately storing each computed row
2099  * of g in row y=0 or y=1.
2100  */
2101  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2103  /** Compute this function as needed for each unique value of the
2104  * given var for the given calling function f.
2105  *
2106  * For example, consider the simple pipeline:
2107  \code
2108  Func f, g;
2109  Var x, y;
2110  g(x, y) = x*y;
2111  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2112  \endcode
2113  *
2114  * If we schedule f like so:
2115  *
2116  \code
2117  g.compute_at(f, x);
2118  \endcode
2119  *
2120  * Then the C code equivalent to this pipeline will look like this
2121  *
2122  \code
2124  int f[height][width];
2125  for (int y = 0; y < height; y++) {
2126  for (int x = 0; x < width; x++) {
2127  int g[2][2];
2128  g[0][0] = x*y;
2129  g[0][1] = (x+1)*y;
2130  g[1][0] = x*(y+1);
2131  g[1][1] = (x+1)*(y+1);
2132  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2133  }
2134  }
2136  \endcode
2137  *
2138  * The allocation and computation of g is within f's loop over x,
2139  * and enough of g is computed to satisfy all that f will need for
2140  * that iteration. This has excellent locality - values of g are
2141  * used as soon as they are computed, but it does redundant
2142  * work. Each value of g ends up getting computed four times. If
2143  * we instead schedule f like so:
2144  *
2145  \code
2146  g.compute_at(f, y);
2147  \endcode
2148  *
2149  * The equivalent C code is:
2150  *
2151  \code
2152  int f[height][width];
2153  for (int y = 0; y < height; y++) {
2154  int g[2][width+1];
2155  for (int x = 0; x < width+1; x++) {
2156  g[0][x] = x*y;
2157  g[1][x] = x*(y+1);
2158  }
2159  for (int x = 0; x < width; x++) {
2160  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2161  }
2162  }
2163  \endcode
2164  *
2165  * The allocation and computation of g is within f's loop over y,
2166  * and enough of g is computed to satisfy all that f will need for
2167  * that iteration. This does less redundant work (each point in g
2168  * ends up being evaluated twice), but the locality is not quite
2169  * as good, and we have to allocate more temporary memory to store
2170  * g.
2171  */
2172  Func &compute_at(const Func &f, const Var &var);
2174  /** Schedule a function to be computed within the iteration over
2175  * some dimension of an update domain. Produces equivalent code
2176  * to the version of compute_at that takes a Var. */
2177  Func &compute_at(const Func &f, const RVar &var);
2179  /** Schedule a function to be computed within the iteration over
2180  * a given LoopLevel. */
2181  Func &compute_at(LoopLevel loop_level);
2183  /** Schedule the iteration over the initial definition of this function
2184  * to be fused with another stage 's' from outermost loop to a
2185  * given LoopLevel. */
2186  // @{
2187  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2189  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2192  /** Compute all of this function once ahead of time. Reusing
2193  * the example in \ref Func::compute_at :
2194  *
2195  \code
2196  Func f, g;
2197  Var x, y;
2198  g(x, y) = x*y;
2199  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2201  g.compute_root();
2202  \endcode
2203  *
2204  * is equivalent to
2205  *
2206  \code
2207  int f[height][width];
2208  int g[height+1][width+1];
2209  for (int y = 0; y < height+1; y++) {
2210  for (int x = 0; x < width+1; x++) {
2211  g[y][x] = x*y;
2212  }
2213  }
2214  for (int y = 0; y < height; y++) {
2215  for (int x = 0; x < width; x++) {
2216  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2217  }
2218  }
2219  \endcode
2220  *
2221  * g is computed once ahead of time, and enough is computed to
2222  * satisfy all uses of it. This does no redundant work (each point
2223  * in g is evaluated once), but has poor locality (values of g are
2224  * probably not still in cache when they are used by f), and
2225  * allocates lots of temporary memory to store g.
2226  */
2227  Func &compute_root();
2229  /** Use the halide_memoization_cache_... interface to store a
2230  * computed version of this function across invocations of the
2231  * Func.
2232  *
2233  * If an eviction_key is provided, it must be constructed with
2234  * Expr of integer or handle type. The key Expr will be promoted
2235  * to a uint64_t and can be used with halide_memoization_cache_evict
2236  * to remove memoized entries using this eviction key from the
2237  * cache. Memoized computations that do not provide an eviction
2238  * key will never be evicted by this mechanism.
2239  */
2240  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2242  /** Produce this Func asynchronously in a separate
2243  * thread. Consumers will be run by the task system when the
2244  * production is complete. If this Func's store level is different
2245  * to its compute level, consumers will be run concurrently,
2246  * blocking as necessary to prevent reading ahead of what the
2247  * producer has computed. If storage is folded, then the producer
2248  * will additionally not be permitted to run too far ahead of the
2249  * consumer, to avoid clobbering data that has not yet been
2250  * used.
2251  *
2252  * Take special care when combining this with custom thread pool
2253  * implementations, as avoiding deadlock with producer-consumer
2254  * parallelism requires a much more sophisticated parallel runtime
2255  * than with data parallelism alone. It is strongly recommended
2256  * you just use Halide's default thread pool, which guarantees no
2257  * deadlock and a bound on the number of threads launched.
2258  */
2259  Func &async();
2261  /** Bound the extent of a Func's storage, but not the extent of its
2262  * compute. This can be useful for forcing a function's allocation
2263  * to be a fixed size, which often means it can go on the stack.
2264  * If bounds inference decides that it requires more storage for
2265  * this function than the allocation size you have stated, a runtime
2266  * error will occur when you try to run the pipeline. */
2267  Func &bound_storage(const Var &dim, const Expr &bound);
2269  /** Allocate storage for this function within f's loop over
2270  * var. Scheduling storage is optional, and can be used to
2271  * separate the loop level at which storage occurs from the loop
2272  * level at which computation occurs to trade off between locality
2273  * and redundant work. This can open the door for two types of
2274  * optimization.
2275  *
2276  * Consider again the pipeline from \ref Func::compute_at :
2277  \code
2278  Func f, g;
2279  Var x, y;
2280  g(x, y) = x*y;
2281  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2282  \endcode
2283  *
2284  * If we schedule it like so:
2285  *
2286  \code
2287  g.compute_at(f, x).store_at(f, y);
2288  \endcode
2289  *
2290  * Then the computation of g takes place within the loop over x,
2291  * but the storage takes place within the loop over y:
2292  *
2293  \code
2294  int f[height][width];
2295  for (int y = 0; y < height; y++) {
2296  int g[2][width+1];
2297  for (int x = 0; x < width; x++) {
2298  g[0][x] = x*y;
2299  g[0][x+1] = (x+1)*y;
2300  g[1][x] = x*(y+1);
2301  g[1][x+1] = (x+1)*(y+1);
2302  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2303  }
2304  }
2305  \endcode
2306  *
2307  * Provided the for loop over x is serial, Halide then
2308  * automatically performs the following sliding window
2309  * optimization:
2310  *
2311  \code
2312  int f[height][width];
2313  for (int y = 0; y < height; y++) {
2314  int g[2][width+1];
2315  for (int x = 0; x < width; x++) {
2316  if (x == 0) {
2317  g[0][x] = x*y;
2318  g[1][x] = x*(y+1);
2319  }
2320  g[0][x+1] = (x+1)*y;
2321  g[1][x+1] = (x+1)*(y+1);
2322  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2323  }
2324  }
2325  \endcode
2326  *
2327  * Two of the assignments to g only need to be done when x is
2328  * zero. The rest of the time, those sites have already been
2329  * filled in by a previous iteration. This version has the
2330  * locality of compute_at(f, x), but allocates more memory and
2331  * does much less redundant work.
2332  *
2333  * Halide then further optimizes this pipeline like so:
2334  *
2335  \code
2336  int f[height][width];
2337  for (int y = 0; y < height; y++) {
2338  int g[2][2];
2339  for (int x = 0; x < width; x++) {
2340  if (x == 0) {
2341  g[0][0] = x*y;
2342  g[1][0] = x*(y+1);
2343  }
2344  g[0][(x+1)%2] = (x+1)*y;
2345  g[1][(x+1)%2] = (x+1)*(y+1);
2346  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2347  }
2348  }
2349  \endcode
2350  *
2351  * Halide has detected that it's possible to use a circular buffer
2352  * to represent g, and has reduced all accesses to g modulo 2 in
2353  * the x dimension. This optimization only triggers if the for
2354  * loop over x is serial, and if Halide can statically determine
2355  * some power of two large enough to cover the range needed. For
2356  * powers of two, the modulo operator compiles to more efficient
2357  * bit-masking. This optimization reduces memory usage, and also
2358  * improves locality by reusing recently-accessed memory instead
2359  * of pulling new memory into cache.
2360  *
2361  */
2362  Func &store_at(const Func &f, const Var &var);
2364  /** Equivalent to the version of store_at that takes a Var, but
2365  * schedules storage within the loop over a dimension of a
2366  * reduction domain */
2367  Func &store_at(const Func &f, const RVar &var);
2369  /** Equivalent to the version of store_at that takes a Var, but
2370  * schedules storage at a given LoopLevel. */
2371  Func &store_at(LoopLevel loop_level);
2373  /** Equivalent to \ref Func::store_at, but schedules storage
2374  * outside the outermost loop. */
2375  Func &store_root();
2377  /** Aggressively inline all uses of this function. This is the
2378  * default schedule, so you're unlikely to need to call this. For
2379  * a Func with an update definition, that means it gets computed
2380  * as close to the innermost loop as possible.
2381  *
2382  * Consider once more the pipeline from \ref Func::compute_at :
2383  *
2384  \code
2385  Func f, g;
2386  Var x, y;
2387  g(x, y) = x*y;
2388  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2389  \endcode
2390  *
2391  * Leaving g as inline, this compiles to code equivalent to the following C:
2392  *
2393  \code
2394  int f[height][width];
2395  for (int y = 0; y < height; y++) {
2396  for (int x = 0; x < width; x++) {
2397  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2398  }
2399  }
2400  \endcode
2401  */
2402  Func &compute_inline();
2404  /** Get a handle on an update step for the purposes of scheduling
2405  * it. */
2406  Stage update(int idx = 0);
2408  /** Set the type of memory this Func should be stored in. Controls
2409  * whether allocations go on the stack or the heap on the CPU, and
2410  * in global vs shared vs local on the GPU. See the documentation
2411  * on MemoryType for more detail. */
2412  Func &store_in(MemoryType memory_type);
2414  /** Trace all loads from this Func by emitting calls to
2415  * halide_trace. If the Func is inlined, this has no
2416  * effect. */
2417  Func &trace_loads();
2419  /** Trace all stores to the buffer backing this Func by emitting
2420  * calls to halide_trace. If the Func is inlined, this call
2421  * has no effect. */
2422  Func &trace_stores();
2424  /** Trace all realizations of this Func by emitting calls to
2425  * halide_trace. */
2428  /** Add a string of arbitrary text that will be passed thru to trace
2429  * inspection code if the Func is realized in trace mode. (Funcs that are
2430  * inlined won't have their tags emitted.) Ignored entirely if
2431  * tracing is not enabled for the Func (or globally).
2432  */
2433  Func &add_trace_tag(const std::string &trace_tag);
2435  /** Get a handle on the internal halide function that this Func
2436  * represents. Useful if you want to do introspection on Halide
2437  * functions */
2438  Internal::Function function() const {
2439  return func;
2440  }
2442  /** You can cast a Func to its pure stage for the purposes of
2443  * scheduling it. */
2444  operator Stage() const;
2446  /** Get a handle on the output buffer for this Func. Only relevant
2447  * if this is the output Func in a pipeline. Useful for making
2448  * static promises about strides, mins, and extents. */
2449  // @{
2451  std::vector<OutputImageParam> output_buffers() const;
2452  // @}
2454  /** Use a Func as an argument to an external stage. */
2455  operator ExternFuncArgument() const;
2457  /** Infer the arguments to the Func, sorted into a canonical order:
2458  * all buffers (sorted alphabetically by name), followed by all non-buffers
2459  * (sorted alphabetically by name).
2460  This lets you write things like:
2461  \code
2462  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2463  \endcode
2464  */
2465  std::vector<Argument> infer_arguments() const;
2467  /** Get the source location of the pure definition of this
2468  * Func. See Stage::source_location() */
2469  std::string source_location() const;
2471  /** Return the current StageSchedule associated with this initial
2472  * Stage of this Func. For introspection only: to modify schedule,
2473  * use the Func interface. */
2475  return Stage(*this).get_schedule();
2476  }
2477 };
2479 namespace Internal {
2481 template<typename Last>
2482 inline void check_types(const Tuple &t, int idx) {
2483  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2484  user_assert(t[idx].type() == type_of<T>())
2485  << "Can't evaluate expression "
2486  << t[idx] << " of type " << t[idx].type()
2487  << " as a scalar of type " << type_of<T>() << "\n";
2488 }
2490 template<typename First, typename Second, typename... Rest>
2491 inline void check_types(const Tuple &t, int idx) {
2492  check_types<First>(t, idx);
2493  check_types<Second, Rest...>(t, idx + 1);
2494 }
2496 template<typename Last>
2497 inline void assign_results(Realization &r, int idx, Last last) {
2498  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2499  *last = Buffer<T>(r[idx])();
2500 }
2502 template<typename First, typename Second, typename... Rest>
2503 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2504  assign_results<First>(r, idx, first);
2505  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2506 }
2508 } // namespace Internal
2510 /** JIT-Compile and run enough code to evaluate a Halide
2511  * expression. This can be thought of as a scalar version of
2512  * \ref Func::realize */
2513 template<typename T>
2515  user_assert(e.type() == type_of<T>())
2516  << "Can't evaluate expression "
2517  << e << " of type " << e.type()
2518  << " as a scalar of type " << type_of<T>() << "\n";
2519  Func f;
2520  f() = e;
2521  Buffer<T, 0> im = f.realize(ctx);
2522  return im();
2523 }
2525 /** evaluate with a default user context */
2526 template<typename T>
2528  return evaluate<T>(nullptr, e);
2529 }
2531 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2532 template<typename First, typename... Rest>
2533 HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2534  Internal::check_types<First, Rest...>(t, 0);
2536  Func f;
2537  f() = t;
2538  Realization r = f.realize(ctx);
2539  Internal::assign_results(r, 0, first, rest...);
2540 }
2542 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2543 template<typename First, typename... Rest>
2544 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2545  evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest...>(rest...));
2546 }
2548 namespace Internal {
2550 inline void schedule_scalar(Func f) {
2552  if (t.has_gpu_feature()) {
2553  f.gpu_single_thread();
2554  }
2555  if (t.has_feature(Target::HVX)) {
2556  f.hexagon();
2557  }
2558 }
2560 } // namespace Internal
2562 /** JIT-Compile and run enough code to evaluate a Halide
2563  * expression. This can be thought of as a scalar version of
2564  * \ref Func::realize. Can use GPU if jit target from environment
2565  * specifies one.
2566  */
2567 template<typename T>
2569  user_assert(e.type() == type_of<T>())
2570  << "Can't evaluate expression "
2571  << e << " of type " << e.type()
2572  << " as a scalar of type " << type_of<T>() << "\n";
2573  Func f;
2574  f() = e;
2576  Buffer<T, 0> im = f.realize();
2577  return im();
2578 }
2580 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2581  * use GPU if jit target from environment specifies one. */
2582 // @{
2583 template<typename First, typename... Rest>
2584 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2585  Internal::check_types<First, Rest...>(t, 0);
2587  Func f;
2588  f() = t;
2590  Realization r = f.realize();
2591  Internal::assign_results(r, 0, first, rest...);
2592 }
2593 // @}
2595 } // namespace Halide
2597 #endif
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Stage & parallel(const VarOrRVar &var)
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage's...
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Definition: Util.h:232
Expr key
Definition: Func.h:674
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
const std::string & name() const
Definition: Func.h:48
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
bool has_feature(Feature f) const
Declare a new undefined function with an automatically-generated unique name.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage specialize(const Expr &condition)
Specialize a Func.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
#define internal_assert(c)
Definition: Errors.h:19
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:344
Stage ScheduleHandle
Definition: Func.h:469
A Halide variable, to be used when defining functions.
Definition: Var.h:19
std::string name() const
Return the name of this stage, e.g.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2514
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
@ Default_GPU
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:746
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:28
@ Host
Used to denote for loops that run on the same device as the containing code.
Var var
Definition: Func.h:56
Tuple values() const
The values returned by this function.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus',...
Stage & serial(const VarOrRVar &var)
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
RVar rvar
Definition: Func.h:57
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:584
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:73
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
An enum describing a type of loop traversal.
Definition: Expr.h:400
size_t size() const
How many outputs does the function this refers to produce.
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
const Type & type() const
Get the type(s) of the outputs of this Func.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not extent of its compute.
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1273
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1577
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:110
Internal::Function function() const
What function is this calling?
Definition: Func.h:575
#define user_assert(c)
Definition: test.h:10
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1076
@ GuardWithIf
Guard the prefetch with if-guards that ignore the prefetch if any of the prefetched region ever goes...
Target get_target_from_environment()
Return the target that Halide will use.
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
@ Default
Match whatever is specified in the Target.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
@ Text
Definition: Pipeline.h:74
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:449
A Function definition which can either represent a init or an update definition.
Definition: Definition.h:38
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
A context to be passed to Pipeline::realize.
Definition: JITModule.h:136
Types in the halide type system.
Definition: Type.h:276
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1181
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:378
A class representing a Halide pipeline.
Definition: Pipeline.h:108
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AbstractGenerator.h:19
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Expr value() const
The right-hand-side value of the pure definition of this function.
An argument to an extern-defined Func.
Definition: ExternFuncArgument.h:17
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
@ Internal
Not visible externally, similar to 'static' linkage in C.
std::string source_location() const
Get the source location of the pure definition of this Func.
Definition: Target.h:121
int outputs() const
Get the number of outputs of this Func.
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1171
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Argument.h:16
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:94
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Stage & unroll(const VarOrRVar &var)
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
VarOrRVar(const RDom &r)
Definition: Func.h:40
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
bool is_rvar
Definition: Func.h:58
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
bool defined() const
Definition objects are nullable.
Func & async()
Produce this Func asynchronously in a separate thread.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:321
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
A handle on the output buffer of a pipeline.
Definition: OutputImageParam.h:19
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1256
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
Stage & vectorize(const VarOrRVar &var)
std::vector< OutputImageParam > output_buffers() const
Stage specialize(const Expr &condition)
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
@ Auto
For pure definitions use ShiftInwards.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_...
An enum to specify calling convention for extern stages.
Definition: Function.h:25
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
A halide function.
Definition: Func.h:687
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
Func & compute_root()
Compute all of this function once ahead of time.
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:678
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
Definition: Util.h:45
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:39
Func & compute_inline()
Aggressively inline all uses of this function.
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2497
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
const std::vector< Type > & types() const
bool defined() const
Does this function have at least a pure definition.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
int dimensions() const
The dimensionality (number of arguments) of this function.
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2568
VarOrRVar(const RVar &r)
Definition: Func.h:37
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2474
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
const std::string & name() const
The name of this reduction variable.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
bool has_update_definition() const
Does this function have at least one update definition?
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
std::vector< Var > args() const
Get the pure arguments.
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
VarOrRVar(const Var &v)
Definition: Func.h:34
Different ways to handle accesses outside the original extents in a prefetch.
Definition: PrefetchDirective.h:16
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Callable compile_to_callable(const std::vector< Argument > &args, const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code and return a callable struct that behaves like a fun...
Stage & allow_race_conditions()
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
A fragment of Halide syntax.
Definition: Expr.h:257
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:672
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1199
Definition: Var.h:169
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
A class that can represent Vars or RVars.
Definition: Func.h:30
int index() const
Return index to the function outputs.
Definition: Func.h:661
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:597
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:347
void specialize_fail(const std::string &message)
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2053
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2027
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1189
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:587
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:646
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values.
A single definition of a Func.
Definition: Func.h:70
void check_types(const Tuple &t, int idx)
Definition: Func.h:2482
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:478
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
int num_update_definitions() const
How many update definitions does this function have?
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage & atomic(bool override_associativity_test=false)
An enum describing a type of device API.
Definition: DeviceAPI.h:15
void schedule_scalar(Func f)
Definition: Func.h:2550
const std::string & name() const
Get the name of a Var.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:110
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:107
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31