Halide
Func.h
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
19 
20 #include <map>
21 #include <utility>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 class ParamMap;
27 
28 /** A class that can represent Vars or RVars. Used for reorder calls
29  * which can accept a mix of either. */
30 struct VarOrRVar {
31  VarOrRVar(const std::string &n, bool r)
32  : var(n), rvar(n), is_rvar(r) {
33  }
34  VarOrRVar(const Var &v)
35  : var(v), is_rvar(false) {
36  }
37  VarOrRVar(const RVar &r)
38  : rvar(r), is_rvar(true) {
39  }
40  VarOrRVar(const RDom &r)
41  : rvar(RVar(r)), is_rvar(true) {
42  }
43  template<int N>
44  VarOrRVar(const ImplicitVar<N> &u)
45  : var(u), is_rvar(false) {
46  }
47 
48  const std::string &name() const {
49  if (is_rvar)
50  return rvar.name();
51  else
52  return var.name();
53  }
54 
55  Var var;
56  RVar rvar;
57  bool is_rvar;
58 };
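// A minimal sketch of mixing Vars and RVars, assuming a Func f with pure Vars x, y and an
// update over an RDom r; each argument converts implicitly to VarOrRVar:
//   f.update(0).reorder(r.x, x, y);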
59 
60 class ImageParam;
61 
62 namespace Internal {
63 class Function;
64 struct Split;
65 struct StorageDim;
66 } // namespace Internal
67 
68 /** A single definition of a Func. May be a pure or update definition. */
69 class Stage {
70  /** Reference to the Function this stage (or definition) belongs to. */
71  Internal::Function function;
72  Internal::Definition definition;
73  /** Indicate which stage the definition belongs to (0 for initial
74  * definition, 1 for first update, etc.). */
75  size_t stage_index;
76  /** Pure Vars of the Function (from the init definition). */
77  std::vector<Var> dim_vars;
78 
79  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
80  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
81  void split(const std::string &old, const std::string &outer, const std::string &inner,
82  const Expr &factor, bool exact, TailStrategy tail);
83  void remove(const std::string &var);
84  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
85 
86  const std::vector<Internal::StorageDim> &storage_dims() const {
87  return function.schedule().storage_dims();
88  }
89 
90  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
91 
92 public:
93  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
94  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
95  internal_assert(definition.defined());
96  definition.schedule().touched() = true;
97 
98  dim_vars.reserve(function.args().size());
99  for (const auto &arg : function.args()) {
100  dim_vars.emplace_back(arg);
101  }
102  internal_assert(definition.args().size() == dim_vars.size());
103  }
104 
105  /** Return the current StageSchedule associated with this Stage. For
106  * introspection only: to modify schedule, use the Func interface. */
107  const Internal::StageSchedule &get_schedule() const {
108  return definition.schedule();
109  }
110 
111  /** Return a string describing the current var list taking into
112  * account all the splits, reorders, and tiles. */
113  std::string dump_argument_list() const;
114 
115  /** Return the name of this stage, e.g. "f.update(2)" */
116  std::string name() const;
117 
118  /** Calling rfactor() on an associative update definition of a Func will split
119  * the update into an intermediate which computes the partial results and
120  * replaces the current update definition with a new definition which merges
121  * the partial results. If called on an init/pure definition, this will
122  * throw an error. rfactor() will automatically infer the associative reduction
123  * operator and identity of the operator. If it can't prove the operation
124  * is associative or if it cannot find an identity for that operator, this
125  * will throw an error. In addition, if rfactor() is called on an inner
126  * reduction dimension while outer reduction dimensions are left in place,
127  * the operator must also be commutative.
128  *
129  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
130  * The rvars not listed in 'preserved' are removed from the original Func and
131  * are lifted to the intermediate Func. The remaining rvars (the ones in
132  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
133  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
134  * applied to the original Func's update definition. The loop order of the
135  * intermediate Func's update definition is the same as the original, although
136  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
137  * intermediate Func's init definition from innermost to outermost is the args'
138  * order of the original Func's init definition followed by the new pure Vars.
139  *
140  * The intermediate Func also inherits storage order from the original Func
141  * with the new pure Vars added to the outermost.
142  *
143  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
144  \code
145  f(x, y) = 0;
146  f(x, y) += g(r.x, r.y);
147  \endcode
148  * into a pipeline like this:
149  \code
150  f_intm(x, y, u) = 0;
151  f_intm(x, y, u) += g(r.x, u);
152 
153  f(x, y) = 0;
154  f(x, y) += f_intm(x, y, r.y);
155  \endcode
156  *
157  * This has a variety of uses. You can use it to split computation of an associative reduction:
158  \code
159  f(x, y) = 10;
160  RDom r(0, 96);
161  f(x, y) = max(f(x, y), g(x, y, r.x));
162  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
163  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
164  \endcode
165  *
166  * This is equivalent to:
167  \code
168  parallel for u = 0 to 11:
169  for y:
170  for x:
171  f_intm(x, y, u) = -inf
172  parallel for x:
173  for y:
174  parallel for u = 0 to 11:
175  for rxi = 0 to 7:
176  f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
177  for y:
178  for x:
179  f(x, y) = 10
180  parallel for x:
181  for y:
182  for rxo = 0 to 11:
183  f(x, y) = max(f(x, y), f_intm(x, y, rxo))
184  \endcode
185  *
186  */
187  // @{
188  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
189  Func rfactor(const RVar &r, const Var &v);
190  // @}
191 
192  /** Schedule the iteration over this stage to be fused with another
193  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
194  * be computed AFTER 's' in the innermost fused dimension. There should not
195  * be any dependencies between those two fused stages. If either of the
196  * stages being fused is a stage of an extern Func, this will throw an error.
197  *
198  * Note that the two stages that are fused together should have the same
199  * exact schedule from the outermost to the innermost fused dimension, and
200  * the stage we are calling compute_with on should not have specializations,
201  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
202  *
203  * Also, if a producer is desired to be computed at the fused loop level,
204  * the function passed to the compute_at() needs to be the "parent". Consider
205  * the following code:
206  \code
207  input(x, y) = x + y;
208  f(x, y) = input(x, y);
209  f(x, y) += 5;
210  g(x, y) = x - y;
211  g(x, y) += 10;
212  f.compute_with(g, y);
213  f.update().compute_with(g.update(), y);
214  \endcode
215  *
216  * To compute 'input' at the fused loop level at dimension y, we specify
217  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
218  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
219  * is computed). On the other hand, to compute 'input' at the innermost
220  * dimension of 'f', we specify input.compute_at(f, x) instead of
221  * input.compute_at(g, x) since the x dimension of 'f' is not fused
222  * (only the y dimension is).
223  *
224  * Given the constraints, this has a variety of uses. Consider the
225  * following code:
226  \code
227  f(x, y) = x + y;
228  g(x, y) = x - y;
229  h(x, y) = f(x, y) + g(x, y);
230  f.compute_root();
231  g.compute_root();
232  f.split(x, xo, xi, 8);
233  g.split(x, xo, xi, 8);
234  g.compute_with(f, xo);
235  \endcode
236  *
237  * This is equivalent to:
238  \code
239  for y:
240  for xo:
241  for xi:
242  f(8*xo + xi, y) = (8*xo + xi) + y
243  for xi:
244  g(8*xo + xi, y) = (8*xo + xi) - y
245  for y:
246  for x:
247  h(x, y) = f(x, y) + g(x, y)
248  \endcode
249  *
250  * The extents of the dimensions of the stages fused via compute_with do not
251  * have to match. Consider the following code where 'g' is half the size of 'f':
252  \code
253  Buffer<int> f_im(size, size), g_im(size/2, size/2);
254  input(x, y) = x + y;
255  f(x, y) = input(x, y);
256  g(x, y) = input(2*x, 2*y);
257  g.compute_with(f, y);
258  input.compute_at(f, y);
259  Pipeline({f, g}).realize({f_im, g_im});
260  \endcode
261  *
262  * This is equivalent to:
263  \code
264  for y = 0 to size-1:
265  for x = 0 to size-1:
266  input(x, y) = x + y;
267  for x = 0 to size-1:
268  f(x, y) = input(x, y)
269  for x = 0 to size/2-1:
270  if (y < size/2):
271  g(x, y) = input(2*x, 2*y)
272  \endcode
273  *
274  * 'align' specifies how the loop iteration of each dimension of the
275  * two stages being fused should be aligned in the fused loop nests
276  * (see LoopAlignStrategy for options). Consider the following loop nests:
277  \code
278  for z = f_min_z to f_max_z:
279  for y = f_min_y to f_max_y:
280  for x = f_min_x to f_max_x:
281  f(x, y, z) = x + y + z
282  for z = g_min_z to g_max_z:
283  for y = g_min_y to g_max_y:
284  for x = g_min_x to g_max_x:
285  g(x, y, z) = x - y - z
286  \endcode
287  *
288  * If no alignment strategy is specified, the following loop nest will be
289  * generated:
290  \code
291  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
292  for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
293  for x = f_min_x to f_max_x:
294  if (f_min_z <= z <= f_max_z):
295  if (f_min_y <= y <= f_max_y):
296  f(x, y, z) = x + y + z
297  for x = g_min_x to g_max_x:
298  if (g_min_z <= z <= g_max_z):
299  if (g_min_y <= y <= g_max_y):
300  g(x, y, z) = x - y - z
301  \endcode
302  *
303  * Instead, these alignment strategies:
304  \code
305  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
306  \endcode
307  * will produce the following loop nest:
308  \code
309  f_loop_min_z = f_min_z
310  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
311  for z = f_min_z to f_loop_max_z:
312  f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
313  f_loop_max_y = f_max_y
314  for y = f_loop_min_y to f_loop_max_y:
315  for x = f_min_x to f_max_x:
316  if (f_loop_min_z <= z <= f_loop_max_z):
317  if (f_loop_min_y <= y <= f_loop_max_y):
318  f(x, y, z) = x + y + z
319  for x = g_min_x to g_max_x:
320  g_shift_z = g_min_z - f_loop_min_z
321  g_shift_y = g_max_y - f_loop_max_y
322  if (g_min_z <= (z + g_shift_z) <= g_max_z):
323  if (g_min_y <= (y + g_shift_y) <= g_max_y):
324  g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
325  \endcode
326  *
327  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
328  * of 'g' at dimension z so that its starting value matches that of 'f'.
329  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
330  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
331  */
332  // @{
333  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
334  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
335  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
336  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
337  // @}
338 
339  /** Scheduling calls that control how the domain of this stage is
340  * traversed. See the documentation for Func for the meanings. */
341  // @{
342 
343  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
344  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
345  Stage &serial(const VarOrRVar &var);
346  Stage &parallel(const VarOrRVar &var);
347  Stage &vectorize(const VarOrRVar &var);
348  Stage &unroll(const VarOrRVar &var);
349  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
350  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
353  const VarOrRVar &xo, const VarOrRVar &yo,
354  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
355  TailStrategy tail = TailStrategy::Auto);
356  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
357  const VarOrRVar &xi, const VarOrRVar &yi,
358  const Expr &xfactor, const Expr &yfactor,
359  TailStrategy tail = TailStrategy::Auto);
360  Stage &tile(const std::vector<VarOrRVar> &previous,
361  const std::vector<VarOrRVar> &outers,
362  const std::vector<VarOrRVar> &inners,
363  const std::vector<Expr> &factors,
364  const std::vector<TailStrategy> &tails);
365  Stage &tile(const std::vector<VarOrRVar> &previous,
366  const std::vector<VarOrRVar> &outers,
367  const std::vector<VarOrRVar> &inners,
368  const std::vector<Expr> &factors,
369  TailStrategy tail = TailStrategy::Auto);
370  Stage &tile(const std::vector<VarOrRVar> &previous,
371  const std::vector<VarOrRVar> &inners,
372  const std::vector<Expr> &factors,
373  TailStrategy tail = TailStrategy::Auto);
374  Stage &reorder(const std::vector<VarOrRVar> &vars);
375 
376  template<typename... Args>
377  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
378  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
379  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
380  return reorder(collected_args);
381  }
382 
383  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
384  Stage specialize(const Expr &condition);
385  void specialize_fail(const std::string &message);
386 
387  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
388  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
389  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
390 
391  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
392 
393  Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
394 
395  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
396  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
397  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
398 
399  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
400  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
401  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
402  DeviceAPI device_api = DeviceAPI::Default_GPU);
403  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
404  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
405  DeviceAPI device_api = DeviceAPI::Default_GPU);
406 
407  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
408  TailStrategy tail = TailStrategy::Auto,
409  DeviceAPI device_api = DeviceAPI::Default_GPU);
410 
411  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
412  TailStrategy tail = TailStrategy::Auto,
413  DeviceAPI device_api = DeviceAPI::Default_GPU);
414  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
415  const VarOrRVar &bx, const VarOrRVar &by,
416  const VarOrRVar &tx, const VarOrRVar &ty,
417  const Expr &x_size, const Expr &y_size,
418  TailStrategy tail = TailStrategy::Auto,
419  DeviceAPI device_api = DeviceAPI::Default_GPU);
420 
421  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
422  const VarOrRVar &tx, const VarOrRVar &ty,
423  const Expr &x_size, const Expr &y_size,
424  TailStrategy tail = TailStrategy::Auto,
425  DeviceAPI device_api = DeviceAPI::Default_GPU);
426 
427  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
428  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
429  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
430  const Expr &x_size, const Expr &y_size, const Expr &z_size,
431  TailStrategy tail = TailStrategy::Auto,
432  DeviceAPI device_api = DeviceAPI::Default_GPU);
433  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
434  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
435  const Expr &x_size, const Expr &y_size, const Expr &z_size,
436  TailStrategy tail = TailStrategy::Auto,
437  DeviceAPI device_api = DeviceAPI::Default_GPU);
438 
439  Stage &allow_race_conditions();
440  Stage &atomic(bool override_associativity_test = false);
441 
442  Stage &hexagon(const VarOrRVar &x = Var::outermost());
443  Stage &prefetch(const Func &f, const VarOrRVar &var, Expr offset = 1,
444  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
445  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &var, Expr offset = 1,
446  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
447  template<typename T>
448  Stage &prefetch(const T &image, VarOrRVar var, Expr offset = 1,
449  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
450  return prefetch(image.parameter(), var, offset, strategy);
451  }
452  // @}
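  // An illustrative schedule for an update stage using the calls above, assuming a Func f
  // with Vars x, y, an RDom r, and fresh RVars rxo, rxi:
  //   f.update(0)
  //    .split(r.x, rxo, rxi, 4)
  //    .reorder(rxi, x, rxo, y)
  //    .vectorize(x, 8)
  //    .parallel(y);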
453 
454  /** Attempt to get the source file and line where this stage was
455  * defined by parsing the process's own debug symbols. Returns an
456  * empty string if no debug symbols were found or the debug
457  * symbols were not understood. Works on OS X and Linux only. */
458  std::string source_location() const;
459 };
460 
461 // For backwards compatibility, keep the ScheduleHandle name.
462 typedef Stage ScheduleHandle;
463 
464 class FuncTupleElementRef;
465 
466 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
467  * z are Vars or Exprs. It could be the left hand side of a definition or
468  * an update definition, or it could be a call to a function. We don't know
469  * until we see how this object gets used.
470  */
471 class FuncRef {
472  Internal::Function func;
473  int implicit_placeholder_pos;
474  int implicit_count;
475  std::vector<Expr> args;
476  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
477 
478  /** Helper for function update by Tuple. If the function does not
479  * already have a pure definition, init_val will be used as RHS of
480  * each tuple element in the initial function definition. */
481  template<typename BinaryOp>
482  Stage func_ref_update(const Tuple &e, int init_val);
483 
484  /** Helper for function update by Expr. If the function does not
485  * already have a pure definition, init_val will be used as RHS in
486  * the initial function definition. */
487  template<typename BinaryOp>
488  Stage func_ref_update(Expr e, int init_val);
489 
490 public:
491  FuncRef(const Internal::Function &, const std::vector<Expr> &,
492  int placeholder_pos = -1, int count = 0);
493  FuncRef(Internal::Function, const std::vector<Var> &,
494  int placeholder_pos = -1, int count = 0);
495 
496  /** Use this as the left-hand-side of a definition or an update definition
497  * (see \ref RDom).
498  */
499  Stage operator=(const Expr &);
500 
501  /** Use this as the left-hand-side of a definition or an update definition
502  * for a Func with multiple outputs. */
503  Stage operator=(const Tuple &);
504 
505  /** Define a stage that adds the given expression to this Func. If the
506  * expression refers to some RDom, this performs a sum reduction of the
507  * expression over the domain. If the function does not already have a
508  * pure definition, this sets it to zero.
509  */
510  // @{
511  Stage operator+=(const Expr &);
512  Stage operator+=(const Tuple &);
513  Stage operator+=(const FuncRef &);
514  // @}
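  // A minimal sketch of a sum reduction via operator+=, assuming Func f, Func g, Var x,
  // and RDom r(0, 100):
  //   f(x) += g(x, r);   // f's pure definition defaults to 0, then f sums g over r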
515 
516  /** Define a stage that adds the negative of the given expression to this
517  * Func. If the expression refers to some RDom, this performs a sum reduction
518  * of the negative of the expression over the domain. If the function does
519  * not already have a pure definition, this sets it to zero.
520  */
521  // @{
522  Stage operator-=(const Expr &);
523  Stage operator-=(const Tuple &);
524  Stage operator-=(const FuncRef &);
525  // @}
526 
527  /** Define a stage that multiplies this Func by the given expression. If the
528  * expression refers to some RDom, this performs a product reduction of the
529  * expression over the domain. If the function does not already have a pure
530  * definition, this sets it to 1.
531  */
532  // @{
533  Stage operator*=(const Expr &);
534  Stage operator*=(const Tuple &);
535  Stage operator*=(const FuncRef &);
536  // @}
537 
538  /** Define a stage that divides this Func by the given expression.
539  * If the expression refers to some RDom, this performs a product
540  * reduction of the inverse of the expression over the domain. If the
541  * function does not already have a pure definition, this sets it to 1.
542  */
543  // @{
544  Stage operator/=(const Expr &);
545  Stage operator/=(const Tuple &);
546  Stage operator/=(const FuncRef &);
547  // @}
548 
549  /* Override the usual assignment operator, so that
550  * f(x, y) = g(x, y) defines f.
551  */
552  Stage operator=(const FuncRef &);
553 
554  /** Use this as a call to the function, and not the left-hand-side
555  * of a definition. Only works for single-output Funcs. */
556  operator Expr() const;
557 
558  /** When a FuncRef refers to a function that provides multiple
559  * outputs, you can access each output as an Expr using
560  * operator[].
561  */
562  FuncTupleElementRef operator[](int) const;
563 
564  /** How many outputs does the function this refers to produce. */
565  size_t size() const;
566 
567  /** What function is this calling? */
568  Internal::Function function() const {
569  return func;
570  }
571 };
572 
573 /** Explicit overloads of min and max for FuncRef. These exist to
574  * disambiguate calls to min on FuncRefs when a user has pulled both
575  * Halide::min and std::min into their namespace. */
576 // @{
577 inline Expr min(const FuncRef &a, const FuncRef &b) {
578  return min(Expr(a), Expr(b));
579 }
580 inline Expr max(const FuncRef &a, const FuncRef &b) {
581  return max(Expr(a), Expr(b));
582 }
583 // @}
584 
585 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
586  * z are Vars or Exprs. It could be the left hand side of an update
587  * definition, or it could be a call to a function. We don't know
588  * until we see how this object gets used.
589  */
590 class FuncTupleElementRef {
591  FuncRef func_ref;
592  std::vector<Expr> args; // args to the function
593  int idx; // Index to function outputs
594 
595  /** Helper function that generates a Tuple where element at 'idx' is set
596  * to 'e' and the rest are undef. */
597  Tuple values_with_undefs(const Expr &e) const;
598 
599 public:
600  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
601 
602  /** Use this as the left-hand-side of an update definition of Tuple
603  * component 'idx' of a Func (see \ref RDom). The function must
604  * already have an initial definition.
605  */
606  Stage operator=(const Expr &e);
607 
608  /** Define a stage that adds the given expression to Tuple component 'idx'
609  * of this Func. The other Tuple components are unchanged. If the expression
610  * refers to some RDom, this performs a sum reduction of the expression over
611  * the domain. The function must already have an initial definition.
612  */
613  Stage operator+=(const Expr &e);
614 
615  /** Define a stage that adds the negative of the given expression to Tuple
616  * component 'idx' of this Func. The other Tuple components are unchanged.
617  * If the expression refers to some RDom, this performs a sum reduction of
618  * the negative of the expression over the domain. The function must already
619  * have an initial definition.
620  */
621  Stage operator-=(const Expr &e);
622 
623  /** Define a stage that multiplies Tuple component 'idx' of this Func by
624  * the given expression. The other Tuple components are unchanged. If the
625  * expression refers to some RDom, this performs a product reduction of
626  * the expression over the domain. The function must already have an
627  * initial definition.
628  */
629  Stage operator*=(const Expr &e);
630 
631  /** Define a stage that divides Tuple component 'idx' of this Func by
632  * the given expression. The other Tuple components are unchanged.
633  * If the expression refers to some RDom, this performs a product
634  * reduction of the inverse of the expression over the domain. The function
635  * must already have an initial definition.
636  */
637  Stage operator/=(const Expr &e);
638 
639  /* Override the usual assignment operator, so that
640  * f(x, y)[index] = g(x, y) defines f.
641  */
642  Stage operator=(const FuncRef &e);
643 
644  /** Use this as a call to Tuple component 'idx' of a Func, and not the
645  * left-hand-side of a definition. */
646  operator Expr() const;
647 
648  /** What function is this calling? */
649  Internal::Function function() const {
650  return func_ref.function();
651  }
652 
653  /** Return index to the function outputs. */
654  int index() const {
655  return idx;
656  }
657 };
658 
659 namespace Internal {
660 class IRMutator;
661 } // namespace Internal
662 
663 /** A halide function. This class represents one stage in a Halide
664  * pipeline, and is the unit by which we schedule things. By default
665  * they are aggressively inlined, so you are encouraged to make lots
666  * of little functions, rather than storing things in Exprs. */
667 class Func {
668 
669  /** A handle on the internal halide function that this
670  * represents */
671  Internal::Function func;
672 
673  /** When you make a reference to this function with fewer
674  * arguments than it has dimensions, the argument list is bulked
675  * up with 'implicit' vars with canonical names. This lets you
676  * pass around partially applied Halide functions. */
677  // @{
678  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
679  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
680  // @}
681 
682  /** The imaging pipeline that outputs this Func alone. */
683  Pipeline pipeline_;
684 
685  /** Get the imaging pipeline that outputs this Func alone,
686  * creating it (and freezing the Func) if necessary. */
687  Pipeline pipeline();
688 
689  // Helper function for recursive reordering support
690  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
691 
692  void invalidate_cache();
693 
694 public:
695  /** Declare a new undefined function with the given name */
696  explicit Func(const std::string &name);
697 
698  /** Declare a new undefined function with an
699  * automatically-generated unique name */
700  Func();
701 
702  /** Declare a new function with an automatically-generated unique
703  * name, and define it to return the given expression (which may
704  * not contain free variables). */
705  explicit Func(const Expr &e);
706 
707  /** Construct a new Func to wrap an existing, already-defined
708  * Function object. */
709  explicit Func(Internal::Function f);
710 
711  /** Construct a new Func to wrap a Buffer. */
712  template<typename T>
713  HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T> &im)
714  : Func() {
715  (*this)(_) = im(_);
716  }
717 
718  /** Evaluate this function over some rectangular domain and return
719  * the resulting buffer or buffers. Performs compilation if the
720  * Func has not previously been realized and compile_jit has not
721  * been called. If the final stage of the pipeline is on the GPU,
722  * data is copied back to the host before being returned. The
723  * returned Realization should probably be instantly converted to
724  * a Buffer class of the appropriate type. That is, do this:
725  *
726  \code
727  f(x) = sin(x);
728  Buffer<float> im = f.realize(...);
729  \endcode
730  *
731  * If your Func has multiple values, because you defined it using
732  * a Tuple, then casting the result of a realize call to a buffer
733  * or image will produce a run-time error. Instead you should do the
734  * following:
735  *
736  \code
737  f(x) = Tuple(x, sin(x));
738  Realization r = f.realize(...);
739  Buffer<int> im0 = r[0];
740  Buffer<float> im1 = r[1];
741  \endcode
742  *
743  * In Halide formal arguments of a computation are specified using
744  * Param<T> and ImageParam objects in the expressions defining the
745  * computation. The param_map argument to realize allows
746  * specifying a set of per-call parameters to be used for a
747  * specific computation. This method is thread-safe where the
748  * globals used by Param<T> and ImageParam are not. Any parameters
749  * that are not in the param_map are taken from the global values,
750  * so those can continue to be used if they are not changing
751  * per-thread.
752  *
753  * One can explicitly construct a ParamMap and
754  * use its set method to insert Parameter to scalar or Buffer
755  * value mappings:
756  *
757  \code
758  Param<int32_t> p(42);
759  ImageParam img(Int(32), 1);
760  f(x) = img(x) + p;
761 
762  Buffer<int32_t> arg_img(10, 10);
763  <fill in arg_img...>
764  ParamMap params;
765  params.set(p, 17);
766  params.set(img, arg_img);
767 
768  Target t = get_jit_target_from_environment();
769  Buffer<int32_t> result = f.realize(10, 10, t, params);
770  \endcode
771  *
772  * Alternatively, an initializer list can be used
773  * directly in the realize call to pass this information:
774  *
775  \code
776  Param<int32_t> p(42);
777  ImageParam img(Int(32), 1);
778  f(x) = img(x) + p;
779 
780  Buffer<int32_t> arg_img(10, 10);
781  <fill in arg_img...>
782 
783  Target t = get_jit_target_from_environment();
784  Buffer<int32_t> result = f.realize(10, 10, t, { { p, 17 }, { img, arg_img } });
785  \endcode
786  *
787  * If the Func cannot be realized into a buffer of the given size
788  * due to scheduling constraints on scattering update definitions,
789  * it will be realized into a larger buffer of the minimum size
790  * possible, and a cropped view at the requested size will be
791  * returned. It is thus not safe to assume the returned buffers
792  * are contiguous in memory. This behavior can be disabled with
793  * the NoBoundsQuery target flag, in which case an error about
794  * writing out of bounds on the output buffer will trigger
795  * instead.
796  *
797  */
798  // @{
799  Realization realize(std::vector<int32_t> sizes, const Target &target = Target(),
800  const ParamMap &param_map = ParamMap::empty_map());
801  Realization realize(int x_size, int y_size, int z_size, int w_size, const Target &target = Target(),
802  const ParamMap &param_map = ParamMap::empty_map());
803  Realization realize(int x_size, int y_size, int z_size, const Target &target = Target(),
804  const ParamMap &param_map = ParamMap::empty_map());
805  Realization realize(int x_size, int y_size, const Target &target = Target(),
806  const ParamMap &param_map = ParamMap::empty_map());
807  Realization realize(int x_size, const Target &target = Target(),
808  const ParamMap &param_map = ParamMap::empty_map());
809  Realization realize(const Target &target = Target(),
810  const ParamMap &param_map = ParamMap::empty_map());
811  // @}
812 
813  /** Evaluate this function into an existing allocated buffer or
814  * buffers. If the buffer is also one of the arguments to the
815  * function, strange things may happen, as the pipeline isn't
816  * necessarily safe to run in-place. If you pass multiple buffers,
817  * they must have matching sizes. This form of realize does *not*
818  * automatically copy data back from the GPU. */
819  void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
820  const ParamMap &param_map = ParamMap::empty_map());
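  // A minimal sketch of realizing into a pre-allocated buffer, assuming a 1-D Func f
  // producing floats:
  //   Buffer<float> out(100);
  //   f.realize(out);   // fills 'out' in place; nothing is returned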
821 
822  /** For a given size of output, or a given output buffer,
823  * determine the bounds required of all unbound ImageParams
824  * referenced. Communicates the result by allocating new buffers
825  * of the appropriate size and binding them to the unbound
826  * ImageParams.
827  *
828  * See the documentation for Func::realize regarding the
829  * ParamMap. There is one difference in that input Buffer<>
830  * arguments that are being inferred are specified as a pointer to
831  * the Buffer<> in the ParamMap. E.g.
832  *
833  \code
834  Param<int32_t> p(42);
835  ImageParam img(Int(32), 1);
836  f(x) = img(x) + p;
837 
838  Target t = get_jit_target_from_environment();
839  Buffer<> in;
840  f.infer_input_bounds({10, 10}, t, { { img, &in } });
841  \endcode
842  * On return, in will be an allocated buffer of the correct size
843  * to evaluate f over a 10x10 region.
844  */
845  // @{
846  void infer_input_bounds(const std::vector<int32_t> &sizes,
847  const Target &target = get_jit_target_from_environment(),
848  const ParamMap &param_map = ParamMap::empty_map());
849  HALIDE_ATTRIBUTE_DEPRECATED("Call infer_input_bounds() with an explicit vector<int> instead")
850  void infer_input_bounds(int x_size = 0, int y_size = 0, int z_size = 0, int w_size = 0,
851  const Target &target = get_jit_target_from_environment(),
852  const ParamMap &param_map = ParamMap::empty_map());
853  // TODO: this is a temporary wrapper used to disambiguate the cases where
854  // a single-entry braced list would match the deprecated overload
855  // (rather than the vector overload); when the deprecated method is removed,
856  // this should be removed, too
857  void infer_input_bounds(const std::initializer_list<int> &sizes,
858  const Target &target = get_jit_target_from_environment(),
859  const ParamMap &param_map = ParamMap::empty_map()) {
860  infer_input_bounds(std::vector<int>{sizes}, target, param_map);
861  }
862  void infer_input_bounds(Pipeline::RealizationArg outputs,
863  const Target &target = get_jit_target_from_environment(),
864  const ParamMap &param_map = ParamMap::empty_map());
865  // @}
866 
867  /** Statically compile this function to llvm bitcode, with the
868  * given filename (which should probably end in .bc), type
869  * signature, and C function name (which defaults to the same name
870  * as this halide function). */
871  //@{
872  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
873  const Target &target = get_target_from_environment());
874  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
875  const Target &target = get_target_from_environment());
876  // @}
877 
878  /** Statically compile this function to llvm assembly, with the
879  * given filename (which should probably end in .ll), type
880  * signature, and C function name (which defaults to the same name
881  * as this halide function). */
882  //@{
883  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
884  const Target &target = get_target_from_environment());
885  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
886  const Target &target = get_target_from_environment());
887  // @}
888 
889  /** Statically compile this function to an object file, with the
890  * given filename (which should probably end in .o or .obj), type
891  * signature, and C function name (which defaults to the same name
892  * as this halide function). You probably don't want to use this
893  * directly; call compile_to_static_library or compile_to_file instead. */
894  //@{
895  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
896  const Target &target = get_target_from_environment());
897  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
898  const Target &target = get_target_from_environment());
899  // @}
900 
901  /** Emit a header file with the given filename for this
902  * function. The header will define a function with the type
903  * signature given by the second argument, and a name given by the
904  * third. The name defaults to the same name as this halide
905  * function. You don't actually have to have defined this function
906  * yet to call this. You probably don't want to use this directly;
907  * call compile_to_static_library or compile_to_file instead. */
908  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
909  const Target &target = get_target_from_environment());
910 
911  /** Statically compile this function to text assembly equivalent
912  * to the object file generated by compile_to_object. This is
913  * useful for checking what Halide is producing without having to
914  * disassemble anything, or if you need to feed the assembly into
915  * some custom toolchain to produce an object file (e.g. iOS) */
916  //@{
917  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
918  const Target &target = get_target_from_environment());
919  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
920  const Target &target = get_target_from_environment());
921  // @}
922 
923  /** Statically compile this function to C source code. This is
924  * useful for providing fallback code paths that will compile on
925  * many platforms. Vectorization will fail, and parallelization
926  * will produce serial code. */
927  void compile_to_c(const std::string &filename,
928  const std::vector<Argument> &,
929  const std::string &fn_name = "",
930  const Target &target = get_target_from_environment());
931 
932  /** Write out an internal representation of lowered code. Useful
933  * for analyzing and debugging scheduling. Can emit html or plain
934  * text. */
935  void compile_to_lowered_stmt(const std::string &filename,
936  const std::vector<Argument> &args,
937  StmtOutputFormat fmt = Text,
938  const Target &target = get_target_from_environment());
939 
940  /** Write out the loop nests specified by the schedule for this
941  * Function. Helpful for understanding what a schedule is
942  * doing. */
943  void print_loop_nest();
944 
945  /** Compile to object file and header pair, with the given
946  * arguments. The name defaults to the same name as this halide
947  * function.
948  */
949  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
950  const std::string &fn_name = "",
951  const Target &target = get_target_from_environment());
952 
953  /** Compile to static-library file and header pair, with the given
954  * arguments. The name defaults to the same name as this halide
955  * function.
956  */
957  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
958  const std::string &fn_name = "",
959  const Target &target = get_target_from_environment());
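  // A minimal ahead-of-time sketch, assuming an ImageParam 'input' and a Param 'offset'
  // used in f's definition (the names and prefix are illustrative):
  //   f.compile_to_static_library("f_aot", {input, offset}, "f_aot");
  //   // produces a static library plus a matching header declaring an extern "C" f_aot()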
960 
961  /** Compile to static-library file and header pair once for each target;
962  * each resulting function will be considered (in order) via halide_can_use_target_features()
963  * at runtime, with the first appropriate match being selected for subsequent use.
964  * This is typically useful for specializations that may vary unpredictably by machine
965  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
966  * All targets must have identical arch-os-bits.
967  */
968  void compile_to_multitarget_static_library(const std::string &filename_prefix,
969  const std::vector<Argument> &args,
970  const std::vector<Target> &targets);
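  // A sketch of the multitarget flow; both targets share arch-os-bits and differ only in
  // features (the target strings and 'args' are illustrative):
  //   f.compile_to_multitarget_static_library("f_multi", args,
  //       {Target("x86-64-linux-sse41"), Target("x86-64-linux-avx2")});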
971 
972  /** Like compile_to_multitarget_static_library(), except that the object files
973  * are all output as object files (rather than bundled into a static library).
974  *
975  * `suffixes` is an optional list of strings to use for as the suffix for each object
976  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
977  * will be used for each suffix.)
978  *
979  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
980  * will be generated with the filename `${filename_prefix}_wrapper.o`
981  *
982  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
983  * will be generated with the filename `${filename_prefix}_runtime.o`
984  */
985  void compile_to_multitarget_object_files(const std::string &filename_prefix,
986  const std::vector<Argument> &args,
987  const std::vector<Target> &targets,
988  const std::vector<std::string> &suffixes);
989 
990  /** Store an internal representation of lowered code as a self
991  * contained Module suitable for further compilation. */
992  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
993  const Target &target = get_target_from_environment());
994 
995  /** Compile and generate multiple target files with single call.
996  * Deduces target files based on filenames specified in
997  * output_files map.
998  */
999  void compile_to(const std::map<Output, std::string> &output_files,
1000  const std::vector<Argument> &args,
1001  const std::string &fn_name,
1002  const Target &target = get_target_from_environment());
1003 
1004  /** Eagerly jit compile the function to machine code. This
1005  * normally happens on the first call to realize. If you're
1006  * running your halide pipeline inside time-sensitive code and
1007  * wish to avoid including the time taken to compile a pipeline,
1008  * then you can call this ahead of time. Default is to use the Target
1009  * returned from Halide::get_jit_target_from_environment()
1010  */
1011  void compile_jit(const Target &target = get_jit_target_from_environment());
1012 
1013  /** Set the error handler function that will be called in the case of
1014  * runtime errors during halide pipelines. If you are compiling
1015  * statically, you can also just define your own function with
1016  * signature
1017  \code
1018  extern "C" void halide_error(void *user_context, const char *);
1019  \endcode
1020  * This will clobber Halide's version.
1021  */
1022  void set_error_handler(void (*handler)(void *, const char *));
1023 
1024  /** Set a custom malloc and free for halide to use. Malloc should
1025  * return 32-byte aligned chunks of memory, and it should be safe
1026  * for Halide to read slightly out of bounds (up to 8 bytes before
1027  * the start or beyond the end). If compiling statically, routines
1028  * with appropriate signatures can be provided directly
1029  \code
1030  extern "C" void *halide_malloc(void *, size_t)
1031  extern "C" void halide_free(void *, void *)
1032  \endcode
1033  * These will clobber Halide's versions. See HalideRuntime.h
1034  * for declarations.
1035  */
1036  void set_custom_allocator(void *(*malloc)(void *, size_t),
1037  void (*free)(void *, void *));
1038 
1039  /** Set a custom task handler to be called by the parallel for
1040  * loop. It is useful to set this if you want to do some
1041  * additional bookkeeping at the granularity of parallel
1042  * tasks. The default implementation does this:
1043  \code
1044  extern "C" int halide_do_task(void *user_context,
1045  int (*f)(void *, int, uint8_t *),
1046  int idx, uint8_t *state) {
1047  return f(user_context, idx, state);
1048  }
1049  \endcode
1050  * If you are statically compiling, you can also just define your
1051  * own version of the above function, and it will clobber Halide's
1052  * version.
1053  *
1054  * If you're trying to use a custom parallel runtime, you probably
1055  * don't want to call this. See instead \ref Func::set_custom_do_par_for .
1056  */
1057  void set_custom_do_task(
1058  int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),
1059  int, uint8_t *));
1060 
1061  /** Set a custom parallel for loop launcher. Useful if your app
1062  * already manages a thread pool. The default implementation is
1063  * equivalent to this:
1064  \code
1065  extern "C" int halide_do_par_for(void *user_context,
1066  int (*f)(void *, int, uint8_t *),
1067  int min, int extent, uint8_t *state) {
1068  int exit_status = 0;
1069  parallel for (int idx = min; idx < min+extent; idx++) {
1070  int job_status = halide_do_task(user_context, f, idx, state);
1071  if (job_status) exit_status = job_status;
1072  }
1073  return exit_status;
1074  }
1075  \endcode
1076  *
1077  * However, notwithstanding the above example code, if one task
1078  * fails, we may skip over other tasks, and if two tasks return
1079  * different error codes, we may select one arbitrarily to return.
1080  *
1081  * If you are statically compiling, you can also just define your
1082  * own version of the above function, and it will clobber Halide's
1083  * version.
1084  */
1085  void set_custom_do_par_for(
1086  int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,
1087  int, uint8_t *));
1088 
1089  /** Set custom routines to call when tracing is enabled. Call this
1090  * on the output Func of your pipeline. This then sets custom
1091  * routines for the entire pipeline, not just calls to this
1092  * Func.
1093  *
1094  * If you are statically compiling, you can also just define your
1095  * own versions of the tracing functions (see HalideRuntime.h),
1096  * and they will clobber Halide's versions. */
1097  void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
1098 
1099  /** Set the function called to print messages from the runtime.
1100  * If you are compiling statically, you can also just define your
1101  * own function with signature
1102  \code
1103  extern "C" void halide_print(void *user_context, const char *);
1104  \endcode
1105  * This will clobber Halide's version.
1106  */
1107  void set_custom_print(void (*handler)(void *, const char *));
1108 
1109  /** Get a struct containing the currently set custom functions
1110  * used by JIT. */
1111  const Internal::JITHandlers &jit_handlers();
1112 
1113  /** Add a custom pass to be used during lowering. It is run after
1114  * all other lowering passes. Can be used to verify properties of
1115  * the lowered Stmt, instrument it with extra code, or otherwise
1116  * modify it. The Func takes ownership of the pass, and will call
1117  * delete on it when the Func goes out of scope. So don't pass a
1118  * stack object, or share pass instances between multiple
1119  * Funcs. */
1120  template<typename T>
1121  void add_custom_lowering_pass(T *pass) {
1122  // Template instantiate a custom deleter for this type, then
1123  // wrap in a lambda. The custom deleter lives in user code, so
1124  // that deletion is on the same heap as construction (I hate Windows).
1125  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1126  }
1127 
1128  /** Add a custom pass to be used during lowering, with the
1129  * function that will be called to delete it also passed in. Set
1130  * it to nullptr if you wish to retain ownership of the object. */
1131  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
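  // A minimal sketch of a custom lowering pass ('MyCheck' is an illustrative name):
  //   class MyCheck : public Internal::IRMutator {
  //       // override visit() methods to inspect or instrument the lowered Stmt
  //   };
  //   f.add_custom_lowering_pass(new MyCheck);   // f takes ownership and deletes the pass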
1132 
1133  /** Remove all previously-set custom lowering passes */
1134  void clear_custom_lowering_passes();
1135 
1136  /** Get the custom lowering passes. */
1137  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1138 
1139  /** When this function is compiled, include code that dumps its
1140  * values to a file after it is realized, for the purpose of
1141  * debugging.
1142  *
1143  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1144  * is in TIFF format and can be read by standard tools. Otherwise, the
1145  * file format is as follows:
1146  *
1147  * All data is in the byte-order of the target platform. First, a
1148  * 20-byte header containing four 32-bit ints, giving the extents
1149  * of the first four dimensions. Dimensions beyond four are
1150  * folded into the fourth. Then, a fifth 32-bit int giving the
1151  * data type of the function. The typecodes are given by: float =
1152  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1153  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1154  * data follows the header, as a densely packed array of the given
1155  * size and the given type. If given the extension .tmp, this file
1156  * format can be natively read by the program ImageStack. */
1157  void debug_to_file(const std::string &filename);
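  // A usage sketch (the filename is illustrative):
  //   f.debug_to_file("f_dump.tmp");   // ".tmp" dumps can be opened with ImageStack
  //   f.realize(800, 600);             // the dump is written when f is realized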
1158 
1159  /** The name of this function, either given during construction,
1160  * or automatically generated. */
1161  const std::string &name() const;
1162 
1163  /** Get the pure arguments. */
1164  std::vector<Var> args() const;
1165 
1166  /** The right-hand-side value of the pure definition of this
1167  * function. Causes an error if there's no pure definition, or if
1168  * the function is defined to return multiple values. */
1169  Expr value() const;
1170 
1171  /** The values returned by this function. An error if the function
1172  * has not been defined. Returns a Tuple with one element for
1173  * functions defined to return a single value. */
1174  Tuple values() const;
1175 
1176  /** Does this function have at least a pure definition? */
1177  bool defined() const;
1178 
1179  /** Get the left-hand-side of the update definition. An empty
1180  * vector if there's no update definition. If there are
1181  * multiple update definitions for this function, use the
1182  * argument to select which one you want. */
1183  const std::vector<Expr> &update_args(int idx = 0) const;
1184 
1185  /** Get the right-hand-side of an update definition. An error if
1186  * there's no update definition. If there are multiple
1187  * update definitions for this function, use the argument to
1188  * select which one you want. */
1189  Expr update_value(int idx = 0) const;
1190 
1191  /** Get the right-hand-side of an update definition for
1192  * functions that return multiple values. An error if there's no
1193  * update definition. Returns a Tuple with one element for
1194  * functions that return a single value. */
1195  Tuple update_values(int idx = 0) const;
1196 
1197  /** Get the RVars of the reduction domain for an update definition, if there is
1198  * one. */
1199  std::vector<RVar> rvars(int idx = 0) const;
1200 
1201  /** Does this function have at least one update definition? */
1202  bool has_update_definition() const;
1203 
1204  /** How many update definitions does this function have? */
1205  int num_update_definitions() const;
1206 
1207  /** Is this function an external stage? That is, was it defined
1208  * using define_extern? */
1209  bool is_extern() const;
1210 
1211  /** Add an extern definition for this Func. This lets you define a
1212  * Func that represents an external pipeline stage. You can, for
1213  * example, use it to wrap a call to an extern library such as
1214  * fftw. */
1215  // @{
1216  void define_extern(const std::string &function_name,
1217  const std::vector<ExternFuncArgument> &params, Type t,
1218  int dimensionality,
1219  NameMangling mangling = NameMangling::Default,
1220  DeviceAPI device_api = DeviceAPI::Host) {
1221  define_extern(function_name, params, t,
1222  Internal::make_argument_list(dimensionality), mangling,
1223  device_api);
1224  }
1225 
1226  void define_extern(const std::string &function_name,
1227  const std::vector<ExternFuncArgument> &params,
1228  const std::vector<Type> &types, int dimensionality,
1229  NameMangling mangling) {
1230  define_extern(function_name, params, types,
1231  Internal::make_argument_list(dimensionality), mangling);
1232  }
1233 
1234  void define_extern(const std::string &function_name,
1235  const std::vector<ExternFuncArgument> &params,
1236  const std::vector<Type> &types, int dimensionality,
1237  NameMangling mangling = NameMangling::Default,
1238  DeviceAPI device_api = DeviceAPI::Host) {
1239  define_extern(function_name, params, types,
1240  Internal::make_argument_list(dimensionality), mangling,
1241  device_api);
1242  }
1243 
1244  void define_extern(const std::string &function_name,
1245  const std::vector<ExternFuncArgument> &params, Type t,
1246  const std::vector<Var> &arguments,
1247  NameMangling mangling = NameMangling::Default,
1248  DeviceAPI device_api = DeviceAPI::Host) {
1249  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1250  mangling, device_api);
1251  }
1252 
1253  void define_extern(const std::string &function_name,
1254  const std::vector<ExternFuncArgument> &params,
1255  const std::vector<Type> &types,
1256  const std::vector<Var> &arguments,
1257  NameMangling mangling = NameMangling::Default,
1258  DeviceAPI device_api = DeviceAPI::Host);
1259  // @}
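  // A minimal sketch, assuming a C function "my_extern_stage" is linked into the process:
  //   Func ext("ext");
  //   ext.define_extern("my_extern_stage", {}, Float(32), 2);
  //   ext.compute_root();
  //   // When realized, "my_extern_stage" is called with halide_buffer_t pointers
  //   // (first for a bounds query, then to fill the actual output).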
1260 
1261  /** Get the types of the outputs of this Func. */
1262  const std::vector<Type> &output_types() const;
1263 
1264  /** Get the number of outputs of this Func. Corresponds to the
1265  * size of the Tuple this Func was defined to return. */
1266  int outputs() const;
1267 
1268  /** Get the name of the extern function called for an extern
1269  * definition. */
1270  const std::string &extern_function_name() const;
1271 
1272  /** The dimensionality (number of arguments) of this
1273  * function. Zero if the function is not yet defined. */
1274  int dimensions() const;
1275 
1276  /** Construct either the left-hand-side of a definition, or a call
1277  * to a function that happens to only contain vars as
1278  * arguments. If the function has already been defined, and fewer
1279  * arguments are given than the function has dimensions, then
1280  * enough implicit vars are added to the end of the argument list
1281  * to make up the difference (see \ref Var::implicit) */
1282  // @{
1283  FuncRef operator()(std::vector<Var>) const;
1284 
1285  template<typename... Args>
1286  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1287  operator()(Args &&... args) const {
1288  std::vector<Var> collected_args{std::forward<Args>(args)...};
1289  return this->operator()(collected_args);
1290  }
1291  // @}
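  // Implicit-variable sketch: if g has three dimensions, then
  //   f(x, _) = g(x, _);   // expands to f(x, _0, _1) = g(x, _0, _1)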
1292 
1293  /** Either calls to the function, or the left-hand-side of
1294  * an update definition (see \ref RDom). If the function has
1295  * already been defined, and fewer arguments are given than the
1296  * function has dimensions, then enough implicit vars are added to
1297  * the end of the argument list to make up the difference. (see
1298  * \ref Var::implicit)*/
1299  // @{
1300  FuncRef operator()(std::vector<Expr>) const;
1301 
1302  template<typename... Args>
1303  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1304  operator()(const Expr &x, Args &&... args) const {
1305  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1306  return (*this)(collected_args);
1307  }
1308  // @}
1309 
1310  /** Creates and returns a new identity Func that wraps this Func. During
1311  * compilation, Halide replaces all calls to this Func done by 'f'
1312  * with calls to the wrapper. If this Func is already wrapped for
1313  * use in 'f', will return the existing wrapper.
1314  *
1315  * For example, g.in(f) would rewrite a pipeline like this:
1316  \code
1317  g(x, y) = ...
1318  f(x, y) = ... g(x, y) ...
1319  \endcode
1320  * into a pipeline like this:
1321  \code
1322  g(x, y) = ...
1323  g_wrap(x, y) = g(x, y)
1324  f(x, y) = ... g_wrap(x, y)
1325  \endcode
1326  *
1327  * This has a variety of uses. You can use it to schedule this
1328  * Func differently in the different places it is used:
1329  \code
1330  g(x, y) = ...
1331  f1(x, y) = ... g(x, y) ...
1332  f2(x, y) = ... g(x, y) ...
1333  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1334  g.in(f2).compute_at(f2, x).unroll(x);
1335  \endcode
1336  *
1337  * You can also use it to stage loads from this Func via some
1338  * intermediate buffer (perhaps on the stack as in
1339  * test/performance/block_transpose.cpp, or in shared GPU memory
1340  * as in test/performance/wrap.cpp). In this case we compute the
1341  * wrapper at tiles of the consuming Funcs like so:
1342  \code
1343  g.compute_root()...
1344  g.in(f).compute_at(f, tiles)...
1345  \endcode
1346  *
1347  * Func::in() can also be used to compute pieces of a Func into a
1348  * smaller scratch buffer (perhaps on the GPU) and then copy them
1349  * into a larger output buffer one tile at a time. See
1350  * apps/interpolate/interpolate.cpp for an example of this. In
1351  * this case we compute the Func at tiles of its own wrapper:
1352  \code
1353  f.in(g).compute_root().gpu_tile(...)...
1354  f.compute_at(f.in(g), tiles)...
1355  \endcode
1356  *
1357  * A similar use of Func::in() is wrapping Funcs with multiple update
1358  * stages in a pure wrapper. The following code:
1359  \code
1360  f(x, y) = x + y;
1361  f(x, y) += 5;
1362  g(x, y) = f(x, y);
1363  f.compute_root();
1364  \endcode
1365  *
1366  * Is equivalent to:
1367  \code
1368  for y:
1369  for x:
1370  f(x, y) = x + y;
1371  for y:
1372  for x:
1373  f(x, y) += 5
1374  for y:
1375  for x:
1376  g(x, y) = f(x, y)
1377  \endcode
1378  * Using Func::in(), we can write:
1379  \code
1380  f(x, y) = x + y;
1381  f(x, y) += 5;
1382  g(x, y) = f(x, y);
1383  f.in(g).compute_root();
1384  \endcode
1385  * which instead produces:
1386  \code
1387  for y:
1388  for x:
1389  f(x, y) = x + y;
1390  f(x, y) += 5
1391  f_wrap(x, y) = f(x, y)
1392  for y:
1393  for x:
1394  g(x, y) = f_wrap(x, y)
1395  \endcode
1396  */
1397  Func in(const Func &f);
1398 
1399  /** Create and return an identity wrapper shared by all the Funcs in
1400  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1401  * this will throw an error. */
1402  Func in(const std::vector<Func> &fs);
1403 
1404  /** Create and return a global identity wrapper, which wraps all calls to
1405  * this Func by any other Func. If a global wrapper already exists,
1406  * returns it. The global identity wrapper is only used by callers for
1407  * which no custom wrapper has been specified.
1408  */
1409  Func in();
1410 
1411  /** Similar to \ref Func::in; however, instead of replacing the call to
1412  * this Func with an identity Func that refers to it, this replaces the
1413  * call with a clone of this Func.
1414  *
1415  * For example, f.clone_in(g) would rewrite a pipeline like this:
1416  \code
1417  f(x, y) = x + y;
1418  g(x, y) = f(x, y) + 2;
1419  h(x, y) = f(x, y) - 3;
1420  \endcode
1421  * into a pipeline like this:
1422  \code
1423  f(x, y) = x + y;
1424  f_clone(x, y) = x + y;
1425  g(x, y) = f_clone(x, y) + 2;
1426  h(x, y) = f(x, y) - 3;
1427  \endcode
1428  *
1429  */
1430  //@{
1431  Func clone_in(const Func &f);
1432  Func clone_in(const std::vector<Func> &fs);
1433  //@}
1434 
1435  /** Declare that this function should be implemented by a call to
1436  * halide_buffer_copy with the given target device API. Asserts
1437  * that the Func has a pure definition which is a simple call to a
1438  * single input, and no update definitions. The wrapper Funcs
1439  * returned by in() are suitable candidates. Consumes all pure
1440  * variables, and rewrites the Func to have an extern definition
1441  * that calls halide_buffer_copy. */
1442  Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1443 
1444  /** Declare that this function should be implemented by a call to
1445  * halide_buffer_copy with a NULL target device API. Equivalent to
1446  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1447  * pure definition which is a simple call to a single input, and
1448  * no update definitions. The wrapper Funcs returned by in() are
1449  * suitable candidates. Consumes all pure variables, and rewrites
1450  * the Func to have an extern definition that calls
1451  * halide_buffer_copy.
1452  *
1453  * Note that if the source Func is already valid in host memory,
1454  * this compiles to code that does the minimum number of calls to
1455  * memcpy.
1456  */
1457  Func copy_to_host();
1458 
1459  /** Split a dimension into inner and outer subdimensions with the
1460  * given names, where the inner dimension iterates from 0 to
1461  * factor-1. The inner and outer subdimensions can then be dealt
1462  * with using the other scheduling calls. It's ok to reuse the old
1463  * variable name as either the inner or outer variable. The final
1464  * argument specifies how the tail should be handled if the split
1465  * factor does not provably divide the extent. */
1466  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
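 // Illustrative usage sketch (hypothetical Func "f" and Vars, not declarations from this header):
 //   Var x, xo, xi;
 //   f.split(x, xo, xi, 8);   // x now iterates as xo*8 + xi, with xi running from 0 to 7
 //   f.vectorize(xi);         // the new sub-dimensions can be scheduled like any other Var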
1467 
1468  /** Join two dimensions into a single fused dimension. The fused
1469  * dimension covers the product of the extents of the inner and
1470  * outer dimensions given. */
1471  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
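 // Illustrative usage sketch (hypothetical names): fusing two loops into one parallel loop.
 //   Var x, y, xy;
 //   f.fuse(x, y, xy).parallel(xy);   // a single loop covering the whole x-by-y domain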
1472 
1473  /** Mark a dimension to be traversed serially. This is the default. */
1474  Func &serial(const VarOrRVar &var);
1475 
1476  /** Mark a dimension to be traversed in parallel */
1477  Func &parallel(const VarOrRVar &var);
1478 
1479  /** Split a dimension by the given task_size, and then parallelize the
1480  * outer dimension. This creates parallel tasks that have size
1481  * task_size. After this call, var refers to the outer dimension of
1482  * the split. The inner dimension has a new anonymous name. If you
1483  * wish to mutate it, or schedule with respect to it, do the split
1484  * manually. */
1485  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
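 // Illustrative usage sketch (hypothetical names): parallelize y in chunks of 16 rows.
 //   f.parallel(y, 16);   // splits y by 16 and parallelizes the resulting outer loop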
1486 
1487  /** Mark a dimension to be computed all-at-once as a single
1488  * vector. The dimension should have constant extent -
1489  * e.g. because it is the inner dimension following a split by a
1490  * constant factor. For most uses of vectorize you want the two
1491  * argument form. The variable to be vectorized should be the
1492  * innermost one. */
1493  Func &vectorize(const VarOrRVar &var);
1494 
1495  /** Mark a dimension to be completely unrolled. The dimension
1496  * should have constant extent - e.g. because it is the inner
1497  * dimension following a split by a constant factor. For most uses
1498  * of unroll you want the two-argument form. */
1499  Func &unroll(const VarOrRVar &var);
1500 
1501  /** Split a dimension by the given factor, then vectorize the
1502  * inner dimension. This is how you vectorize a loop of unknown
1503  * size. The variable to be vectorized should be the innermost
1504  * one. After this call, var refers to the outer dimension of the
1505  * split. 'factor' must be an integer. */
1506  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1507 
1508  /** Split a dimension by the given factor, then unroll the inner
1509  * dimension. This is how you unroll a loop of unknown size by
1510  * some constant factor. After this call, var refers to the outer
1511  * dimension of the split. 'factor' must be an integer. */
1512  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
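 // Illustrative usage sketch (hypothetical names): vectorize and unroll loops of unknown size.
 //   f.vectorize(x, 8);                           // split x by 8 and vectorize the inner lanes
 //   f.unroll(y, 2, TailStrategy::GuardWithIf);   // unroll y by 2, guarding the tail with an if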
1513 
1514  /** Statically declare that the range over which a function should
1515  * be evaluated is given by the second and third arguments. This
1516  * can let Halide perform some optimizations. E.g. if you know
1517  * there are going to be 4 color channels, you can completely
1518  * vectorize the color channel dimension without the overhead of
1519  * splitting it up. If bounds inference decides that it requires
1520  * more of this function than the bounds you have stated, a
1521  * runtime error will occur when you try to run your pipeline. */
1522  Func &bound(const Var &var, Expr min, Expr extent);
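 // Illustrative usage sketch (hypothetical names): promise exactly four color channels so the
 // channel loop can be vectorized without splitting it.
 //   f.bound(c, 0, 4);
 //   f.reorder(c, x, y).vectorize(c);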
1523 
1524  /** Statically declare the range over which the function will be
1525  * evaluated in the general case. This provides a basis for the auto
1526  * scheduler to make trade-offs and scheduling decisions. The auto
1527  * generated schedules might break when the sizes of the dimensions are
1528  * very different from the estimates specified. These estimates are used
1529  * only by the auto scheduler if the function is a pipeline output. */
1530  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1531 
1532  HALIDE_ATTRIBUTE_DEPRECATED("Use set_estimate() instead")
1533  Func &estimate(const Var &var, const Expr &min, const Expr &extent) {
1534  return set_estimate(var, min, extent);
1535  }
1536 
1537  /** Set (min, extent) estimates for all dimensions in the Func
1538  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1539  * repeatedly, but slightly terser. The size of the estimates vector
1540  * must match the dimensionality of the Func. */
1541  Func &set_estimates(const Region &estimates);
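 // Illustrative usage sketch (hypothetical names): give the auto-scheduler typical output sizes
 // for a three-dimensional Func.
 //   f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});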
1542 
1543  /** Expand the region computed so that the min coordinate is
1544  * congruent to 'remainder' modulo 'modulus', and the extent is a
1545  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1546  * the min and extent realized to be even, and calling
1547  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1548  * to be even. The region computed always contains the region that
1549  * would have been computed without this directive, so no
1550  * assertions are injected. */
1551  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1552 
1553  /** Bound the extent of a Func's realization, but not its
1554  * min. This means the dimension can be unrolled or vectorized
1555  * even when its min is not fixed (for example because it is
1556  * compute_at tiles of another Func). This can also be useful for
1557  * forcing a function's allocation to be a fixed size, which often
1558  * means it can go on the stack. */
1559  Func &bound_extent(const Var &var, Expr extent);
1560 
1561  /** Split two dimensions at once by the given factors, and then
1562  * reorder the resulting dimensions to be xi, yi, xo, yo from
1563  * innermost outwards. This gives a tiled traversal. */
1564  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1565  const VarOrRVar &xo, const VarOrRVar &yo,
1566  const VarOrRVar &xi, const VarOrRVar &yi,
1567  const Expr &xfactor, const Expr &yfactor,
1568  TailStrategy tail = TailStrategy::Auto);
1569 
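 // Illustrative usage sketch (hypothetical names): a 64x64 tiled traversal, vectorized within
 // each tile and parallelized across rows of tiles.
 //   Var x, y, xo, yo, xi, yi;
 //   f.tile(x, y, xo, yo, xi, yi, 64, 64).vectorize(xi, 8).parallel(yo);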
1570  /** A shorter form of tile, which reuses the old variable names as
1571  * the new outer dimensions */
1572  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1573  const VarOrRVar &xi, const VarOrRVar &yi,
1574  const Expr &xfactor, const Expr &yfactor,
1575  TailStrategy tail = TailStrategy::Auto);
1576 
1577  /** A more general form of tile, which defines tiles of any dimensionality. */
1578  Func &tile(const std::vector<VarOrRVar> &previous,
1579  const std::vector<VarOrRVar> &outers,
1580  const std::vector<VarOrRVar> &inners,
1581  const std::vector<Expr> &factors,
1582  const std::vector<TailStrategy> &tails);
1583 
1584  /** The generalized tile, with a single tail strategy to apply to all vars. */
1585  Func &tile(const std::vector<VarOrRVar> &previous,
1586  const std::vector<VarOrRVar> &outers,
1587  const std::vector<VarOrRVar> &inners,
1588  const std::vector<Expr> &factors,
1589  TailStrategy tail = TailStrategy::Auto);
1590 
1591  /** Generalized tiling, reusing the previous names as the outer names. */
1592  Func &tile(const std::vector<VarOrRVar> &previous,
1593  const std::vector<VarOrRVar> &inners,
1594  const std::vector<Expr> &factors,
1595  TailStrategy tail = TailStrategy::Auto);
1596 
1597  /** Reorder variables to have the given nesting order, from
1598  * innermost out */
1599  Func &reorder(const std::vector<VarOrRVar> &vars);
1600 
1601  template<typename... Args>
1602  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1603  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
1604  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1605  return reorder(collected_args);
1606  }
1607 
1608  /** Rename a dimension. Equivalent to split with an inner size of one. */
1609  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1610 
1611  /** Specify that race conditions are permitted for this Func,
1612  * which enables parallelizing over RVars even when Halide cannot
1613  * prove that it is safe to do so. Use this with great caution,
1614  * and only if you can prove to yourself that this is safe, as it
1615  * may result in a non-deterministic routine that returns
1616  * different values at different times or on different machines. */
1617  Func &allow_race_conditions();
1618 
1619  /** Issue atomic updates for this Func. This allows parallelization
1620  * on associative RVars. The function throws a compile error when
1621  * Halide fails to prove associativity. Use override_associativity_test
1622  * to disable the associativity test if you believe the function is
1623  * associative or the order of reduction variable execution does not
1624  * matter.
1625  * Halide compiles this into hardware atomic operations whenever possible,
1626  * and falls back to a mutex lock per storage element if it is impossible
1627  * to atomically update.
1628  * There are three possible outcomes of the compiled code:
1629  * atomic add, compare-and-swap loop, and mutex lock.
1630  * For example:
1631  *
1632  * hist(x) = 0;
1633  * hist(im(r)) += 1;
1634  * hist.compute_root();
1635  * hist.update().atomic().parallel();
1636  *
1637  * will be compiled to atomic add operations.
1638  *
1639  * hist(x) = 0;
1640  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1641  * hist.compute_root();
1642  * hist.update().atomic().parallel();
1643  *
1644  * will be compiled to compare-and-swap loops.
1645  *
1646  * arg_max() = {0, im(0)};
1647  * Expr old_index = arg_max()[0];
1648  * Expr old_max = arg_max()[1];
1649  * Expr new_index = select(old_max < im(r), r, old_index);
1650  * Expr new_max = max(im(r), old_max);
1651  * arg_max() = {new_index, new_max};
1652  * arg_max.compute_root();
1653  * arg_max.update().atomic().parallel();
1654  *
1655  * will be compiled to updates guarded by a mutex lock,
1656  * since it is impossible to atomically update two different locations.
1657  *
1658  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1659  * Compiling to other backends results in a compile error.
1660  * If an operation is compiled into a mutex lock, and is vectorized or is
1661  * compiled to CUDA or OpenCL, it also results in a compile error,
1662  * since per-element mutex lock on vectorized operation leads to a
1663  * deadlock.
1664  * Vectorization of predicated RVars (through rdom.where()) on CPU
1665  * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1666  * 8-bit and 16-bit atomics on GPU are also not supported. */
1667  Func &atomic(bool override_associativity_test = false);
1668 
1669  /** Specialize a Func. This creates a special-case version of the
1670  * Func where the given condition is true. The most effective
1671  * conditions are those of the form param == value, and boolean
1672  * Params. Consider a simple example:
1673  \code
1674  f(x) = x + select(cond, 0, 1);
1675  f.compute_root();
1676  \endcode
1677  * This is equivalent to:
1678  \code
1679  for (int x = 0; x < width; x++) {
1680  f[x] = x + (cond ? 0 : 1);
1681  }
1682  \endcode
1683  * Adding the scheduling directive:
1684  \code
1685  f.specialize(cond)
1686  \endcode
1687  * makes it equivalent to:
1688  \code
1689  if (cond) {
1690  for (int x = 0; x < width; x++) {
1691  f[x] = x;
1692  }
1693  } else {
1694  for (int x = 0; x < width; x++) {
1695  f[x] = x + 1;
1696  }
1697  }
1698  \endcode
1699  * Note that the inner loops have been simplified. In the first
1700  * path Halide knows that cond is true, and in the second path
1701  * Halide knows that it is false.
1702  *
1703  * The specialized version gets its own schedule, which inherits
1704  * every directive made about the parent Func's schedule so far
1705  * except for its specializations. This method returns a handle to
1706  * the new schedule. If you wish to retrieve the specialized
1707  * sub-schedule again later, you can call this method with the
1708  * same condition. Consider the following example of scheduling
1709  * the specialized version:
1710  *
1711  \code
1712  f(x) = x;
1713  f.compute_root();
1714  f.specialize(width > 1).unroll(x, 2);
1715  \endcode
1716  * Assuming for simplicity that width is even, this is equivalent to:
1717  \code
1718  if (width > 1) {
1719  for (int x = 0; x < width/2; x++) {
1720  f[2*x] = 2*x;
1721  f[2*x + 1] = 2*x + 1;
1722  }
1723  } else {
1724  for (int x = 0; x < width; x++) {
1725  f[x] = x;
1726  }
1727  }
1728  \endcode
1729  * For this case, it may be better to schedule the un-specialized
1730  * case instead:
1731  \code
1732  f(x) = x;
1733  f.compute_root();
1734  f.specialize(width == 1); // Creates a copy of the schedule so far.
1735  f.unroll(x, 2); // Only applies to the unspecialized case.
1736  \endcode
1737  * This is equivalent to:
1738  \code
1739  if (width == 1) {
1740  f[0] = 0;
1741  } else {
1742  for (int x = 0; x < width/2; x++) {
1743  f[2*x] = 2*x;
1744  f[2*x + 1] = 2*x + 1;
1745  }
1746  }
1747  \endcode
1748  * This can be a good way to write a pipeline that splits,
1749  * vectorizes, or tiles, but can still handle small inputs.
1750  *
1751  * If a Func has several specializations, the first matching one
1752  * will be used, so the order in which you define specializations
1753  * is significant. For example:
1754  *
1755  \code
1756  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1757  f.specialize(cond1);
1758  f.specialize(cond2);
1759  \endcode
1760  * is equivalent to:
1761  \code
1762  if (cond1) {
1763  for (int x = 0; x < width; x++) {
1764  f[x] = x + a - (cond2 ? c : d);
1765  }
1766  } else if (cond2) {
1767  for (int x = 0; x < width; x++) {
1768  f[x] = x + b - c;
1769  }
1770  } else {
1771  for (int x = 0; x < width; x++) {
1772  f[x] = x + b - d;
1773  }
1774  }
1775  \endcode
1776  *
1777  * Specializations may in turn be specialized, which creates a
1778  * nested if statement in the generated code.
1779  *
1780  \code
1781  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1782  f.specialize(cond1).specialize(cond2);
1783  \endcode
1784  * This is equivalent to:
1785  \code
1786  if (cond1) {
1787  if (cond2) {
1788  for (int x = 0; x < width; x++) {
1789  f[x] = x + a - c;
1790  }
1791  } else {
1792  for (int x = 0; x < width; x++) {
1793  f[x] = x + a - d;
1794  }
1795  }
1796  } else {
1797  for (int x = 0; x < width; x++) {
1798  f[x] = x + b - (cond2 ? c : d);
1799  }
1800  }
1801  \endcode
1802  * To create a 4-way if statement that simplifies away all of the
1803  * ternary operators above, you could say:
1804  \code
1805  f.specialize(cond1).specialize(cond2);
1806  f.specialize(cond2);
1807  \endcode
1808  * or
1809  \code
1810  f.specialize(cond1 && cond2);
1811  f.specialize(cond1);
1812  f.specialize(cond2);
1813  \endcode
1814  *
1815  * Any prior Func which is compute_at some variable of this Func
1816  * gets separately included in all paths of the generated if
1817  * statement. The Var in the compute_at call must exist in all
1818  * paths, but it may have been generated via a different path of
1819  * splits, fuses, and renames. This can be used somewhat
1820  * creatively. Consider the following code:
1821  \code
1822  g(x, y) = 8*x;
1823  f(x, y) = g(x, y) + 1;
1824  f.compute_root().specialize(cond);
1825  Var g_loop;
1826  f.specialize(cond).rename(y, g_loop);
1827  f.rename(x, g_loop);
1828  g.compute_at(f, g_loop);
1829  \endcode
1830  * When cond is true, this is equivalent to g.compute_at(f,y).
1831  * When it is false, this is equivalent to g.compute_at(f,x).
1832  */
1833  Stage specialize(const Expr &condition);
1834 
1835  /** Add a specialization to a Func that always terminates execution
1836  * with a call to halide_error(). By itself, this is of limited use,
1837  * but can be useful to terminate chains of specialize() calls where
1838  * no "default" case is expected (thus avoiding unnecessary code generation).
1839  *
1840  * For instance, say we want to optimize a pipeline to process images
1841  * in planar and interleaved format; we might typically do something like:
1842  \code
1843  ImageParam im(UInt(8), 3);
1844  Func f = do_something_with(im);
1845  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1846  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1847  \endcode
1848  * This code will vectorize along rows for the planar case, and across pixel
1849  * components for the interleaved case... but there is an implicit "else"
1850  * for the unhandled cases, which generates unoptimized code. If we never
1851  * anticipate passing any other sort of images to this, we can streamline
1852  * our code by adding specialize_fail():
1853  \code
1854  ImageParam im(UInt(8), 3);
1855  Func f = do_something(im);
1856  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1857  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1858  f.specialize_fail("Unhandled image format");
1859  \endcode
1860  * Conceptually, this produces code like:
1861  \code
1862  if (im.dim(0).stride() == 1) {
1863  do_something_planar();
1864  } else if (im.dim(2).stride() == 1) {
1865  do_something_interleaved();
1866  } else {
1867  halide_error("Unhandled image format");
1868  }
1869  \endcode
1870  *
1871  * Note that calling specialize_fail() terminates the specialization chain
1872  * for a given Func; you cannot create new specializations for the Func
1873  * afterwards (though you can retrieve handles to previous specializations).
1874  */
1875  void specialize_fail(const std::string &message);
1876 
1877  /** Tell Halide that the following dimensions correspond to GPU
1878  * thread indices. This is useful if you compute a producer
1879  * function within the block indices of a consumer function, and
1880  * want to control how that function's dimensions map to GPU
1881  * threads. If the selected target is not an appropriate GPU, this
1882  * just marks those dimensions as parallel. */
1883  // @{
1884  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1885  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1886  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1887  // @}
1888 
1889  /** The given dimension corresponds to the lanes in a GPU
1890  * warp. GPU warp lanes are distinguished from GPU threads by the
1891  * fact that all warp lanes run together in lockstep, which
1892  * permits lightweight communication of data from one lane to
1893  * another. */
1894  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1895 
1896  /** Tell Halide to run this stage using a single gpu thread and
1897  * block. This is not an efficient use of your GPU, but it can be
1898  * useful to avoid copy-back for intermediate update stages that
1899  * touch a very small part of your Func. */
1900  Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1901 
1902  /** Tell Halide that the following dimensions correspond to GPU
1903  * block indices. This is useful for scheduling stages that will
1904  * run serially within each GPU block. If the selected target is
1905  * not ptx, this just marks those dimensions as parallel. */
1906  // @{
1907  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1908  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1909  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1910  // @}
1911 
1912  /** Tell Halide that the following dimensions correspond to GPU
1913  * block indices and thread indices. If the selected target is not
1914  * ptx, these just mark the given dimensions as parallel. The
1915  * dimensions are consumed by this call, so do all other
1916  * unrolling, reordering, etc first. */
1917  // @{
1918  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1919  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1920  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1921  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1922  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1923  // @}
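 // Illustrative usage sketch (hypothetical names): tile the domain, then map tiles to GPU blocks
 // and the coordinates within each tile to GPU threads.
 //   Var x, y, bx, by, tx, ty;
 //   f.tile(x, y, bx, by, tx, ty, 16, 16).gpu_blocks(bx, by).gpu_threads(tx, ty);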
1924 
1925  /** Short-hand for tiling a domain and mapping the tile indices
1926  * to GPU block indices and the coordinates within each tile to
1927  * GPU thread indices. Consumes the variables given, so do all
1928  * other scheduling first. */
1929  // @{
1930  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1931  TailStrategy tail = TailStrategy::Auto,
1932  DeviceAPI device_api = DeviceAPI::Default_GPU);
1933 
1934  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1935  TailStrategy tail = TailStrategy::Auto,
1936  DeviceAPI device_api = DeviceAPI::Default_GPU);
1937  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1938  const VarOrRVar &bx, const VarOrRVar &by,
1939  const VarOrRVar &tx, const VarOrRVar &ty,
1940  const Expr &x_size, const Expr &y_size,
1941  TailStrategy tail = TailStrategy::Auto,
1942  DeviceAPI device_api = DeviceAPI::Default_GPU);
1943 
1944  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1945  const VarOrRVar &tx, const VarOrRVar &ty,
1946  const Expr &x_size, const Expr &y_size,
1947  TailStrategy tail = TailStrategy::Auto,
1948  DeviceAPI device_api = DeviceAPI::Default_GPU);
1949 
1950  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1951  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1952  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1953  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1954  TailStrategy tail = TailStrategy::Auto,
1955  DeviceAPI device_api = DeviceAPI::Default_GPU);
1956  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1957  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1958  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1959  TailStrategy tail = TailStrategy::Auto,
1960  DeviceAPI device_api = DeviceAPI::Default_GPU);
1961  // @}
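 // Illustrative usage sketch (hypothetical names): gpu_tile combines the tiling and the
 // block/thread mapping shown above in a single call.
 //   Var x, y, bx, by, tx, ty;
 //   f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);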
1962 
1963  /** Schedule for execution using coordinate-based hardware api.
1964  * GLSL is an example of this. Conceptually, this is
1965  * similar to parallelization over 'x' and 'y' (since GLSL shaders compute
1966  * individual output pixels in parallel) and vectorization over 'c'
1967  * (since GLSL/RS implicitly vectorizes the color channel). */
1968  Func &shader(const Var &x, const Var &y, const Var &c, DeviceAPI device_api);
1969 
1970  /** Schedule for execution as GLSL kernel. */
1971  Func &glsl(const Var &x, const Var &y, const Var &c);
1972 
1973  /** Schedule for execution on Hexagon. When a loop is marked with
1974  * Hexagon, that loop is executed on a Hexagon DSP. */
1975  Func &hexagon(const VarOrRVar &x = Var::outermost());
1976 
1977  /** Prefetch data written to or read from a Func or an ImageParam by a
1978  * subsequent loop iteration, at an optionally specified iteration offset.
1979  * 'var' specifies at which loop level the prefetch calls should be inserted.
1980  * The final argument specifies how prefetch of region outside bounds
1981  * should be handled.
1982  *
1983  * For example, consider this pipeline:
1984  \code
1985  Func f, g;
1986  Var x, y;
1987  f(x, y) = x + y;
1988  g(x, y) = 2 * f(x, y);
1989  \endcode
1990  *
1991  * The following schedule:
1992  \code
1993  f.compute_root();
1994  g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1995  \endcode
1996  *
1997  * will inject a prefetch call at the innermost loop of 'g' and generate
1998  * the following loop nest:
1999  * for y = ...
2000  * for x = ...
2001  * f(x, y) = x + y
2002  * for y = ..
2003  * for x = ...
2004  * prefetch(&f[x + 2, y], 1, 16);
2005  * g(x, y) = 2 * f(x, y)
2006  */
2007  // @{
2008  Func &prefetch(const Func &f, const VarOrRVar &var, Expr offset = 1,
2009  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2010  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &var, Expr offset = 1,
2011  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2012  template<typename T>
2013  Func &prefetch(const T &image, VarOrRVar var, Expr offset = 1,
2014  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2015  return prefetch(image.parameter(), var, offset, strategy);
2016  }
2017  // @}
2018 
2019  /** Specify how the storage for the function is laid out. These
2020  * calls let you specify the nesting order of the dimensions. For
2021  * example, foo.reorder_storage(y, x) tells Halide to use
2022  * column-major storage for any realizations of foo, without
2023  * changing how you refer to foo in the code. You may want to do
2024  * this if you intend to vectorize across y. When representing
2025  * color images, foo.reorder_storage(c, x, y) specifies packed
2026  * storage (red, green, and blue values adjacent in memory), and
2027  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2028  * red, green, and blue images one after the other in memory).
2029  *
2030  * If you leave out some dimensions, those remain in the same
2031  * positions in the nesting order while the specified variables
2032  * are reordered around them. */
2033  // @{
2034  Func &reorder_storage(const std::vector<Var> &dims);
2035 
2036  Func &reorder_storage(const Var &x, const Var &y);
2037  template<typename... Args>
2038  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2039  reorder_storage(const Var &x, const Var &y, Args &&... args) {
2040  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2041  return reorder_storage(collected_args);
2042  }
2043  // @}
2044 
2045  /** Pad the storage extent of a particular dimension of
2046  * realizations of this function up to be a multiple of the
2047  * specified alignment. This guarantees that the strides for the
2048  * dimensions stored outside of dim will be multiples of the
2049  * specified alignment, where the strides and alignment are
2050  * measured in numbers of elements.
2051  *
2052  * For example, to guarantee that a function foo(x, y, c)
2053  * representing an image has scanlines starting on offsets
2054  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2055  Func &align_storage(const Var &dim, const Expr &alignment);
2056 
2057  /** Store realizations of this function in a circular buffer of a
2058  * given extent. This is more efficient when the extent of the
2059  * circular buffer is a power of 2. If the fold factor is too
2060  * small, or the dimension is not accessed monotonically, the
2061  * pipeline will generate an error at runtime.
2062  *
2063  * The fold_forward option indicates that the new values of the
2064  * producer are accessed by the consumer in a monotonically
2065  * increasing order. Folding storage of producers is also
2066  * supported if the new values are accessed in a monotonically
2067  * decreasing order by setting fold_forward to false.
2068  *
2069  * For example, consider the pipeline:
2070  \code
2071  Func f, g;
2072  Var x, y;
2073  g(x, y) = x*y;
2074  f(x, y) = g(x, y) + g(x, y+1);
2075  \endcode
2076  *
2077  * If we schedule f like so:
2078  *
2079  \code
2080  g.compute_at(f, y).store_root().fold_storage(y, 2);
2081  \endcode
2082  *
2083  * Then g will be computed at each row of f and stored in a buffer
2084  * with an extent in y of 2, alternately storing each computed row
2085  * of g in row y=0 or y=1.
2086  */
2087  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2088 
2089  /** Compute this function as needed for each unique value of the
2090  * given var for the given calling function f.
2091  *
2092  * For example, consider the simple pipeline:
2093  \code
2094  Func f, g;
2095  Var x, y;
2096  g(x, y) = x*y;
2097  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2098  \endcode
2099  *
2100  * If we schedule f like so:
2101  *
2102  \code
2103  g.compute_at(f, x);
2104  \endcode
2105  *
2106  * Then the C code equivalent to this pipeline will look like this
2107  *
2108  \code
2109 
2110  int f[height][width];
2111  for (int y = 0; y < height; y++) {
2112  for (int x = 0; x < width; x++) {
2113  int g[2][2];
2114  g[0][0] = x*y;
2115  g[0][1] = (x+1)*y;
2116  g[1][0] = x*(y+1);
2117  g[1][1] = (x+1)*(y+1);
2118  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2119  }
2120  }
2121 
2122  \endcode
2123  *
2124  * The allocation and computation of g is within f's loop over x,
2125  * and enough of g is computed to satisfy all that f will need for
2126  * that iteration. This has excellent locality - values of g are
2127  * used as soon as they are computed, but it does redundant
2128  * work. Each value of g ends up getting computed four times. If
2129  * we instead schedule f like so:
2130  *
2131  \code
2132  g.compute_at(f, y);
2133  \endcode
2134  *
2135  * The equivalent C code is:
2136  *
2137  \code
2138  int f[height][width];
2139  for (int y = 0; y < height; y++) {
2140  int g[2][width+1];
2141  for (int x = 0; x < width; x++) {
2142  g[0][x] = x*y;
2143  g[1][x] = x*(y+1);
2144  }
2145  for (int x = 0; x < width; x++) {
2146  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2147  }
2148  }
2149  \endcode
2150  *
2151  * The allocation and computation of g is within f's loop over y,
2152  * and enough of g is computed to satisfy all that f will need for
2153  * that iteration. This does less redundant work (each point in g
2154  * ends up being evaluated twice), but the locality is not quite
2155  * as good, and we have to allocate more temporary memory to store
2156  * g.
2157  */
2158  Func &compute_at(const Func &f, const Var &var);
2159 
2160  /** Schedule a function to be computed within the iteration over
2161  * some dimension of an update domain. Produces equivalent code
2162  * to the version of compute_at that takes a Var. */
2163  Func &compute_at(const Func &f, const RVar &var);
2164 
2165  /** Schedule a function to be computed within the iteration over
2166  * a given LoopLevel. */
2167  Func &compute_at(LoopLevel loop_level);
2168 
2169  /** Schedule the iteration over the initial definition of this function
2170  * to be fused with another stage 's' from outermost loop to a
2171  * given LoopLevel. */
2172  // @{
2173  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2174  Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2175  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2176  Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2177 
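 // Illustrative usage sketch (hypothetical names): traverse the y loops of two root-level
 // Funcs together (a Func converts implicitly to its pure Stage here).
 //   f(x, y) = x + y;
 //   g(x, y) = x - y;
 //   f.compute_root();
 //   g.compute_root();
 //   g.compute_with(f, y);   // g's loops down to y are fused with f's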
2178  /** Compute all of this function once ahead of time. Reusing
2179  * the example in \ref Func::compute_at :
2180  *
2181  \code
2182  Func f, g;
2183  Var x, y;
2184  g(x, y) = x*y;
2185  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2186 
2187  g.compute_root();
2188  \endcode
2189  *
2190  * is equivalent to
2191  *
2192  \code
2193  int f[height][width];
2194  int g[height+1][width+1];
2195  for (int y = 0; y < height+1; y++) {
2196  for (int x = 0; x < width+1; x++) {
2197  g[y][x] = x*y;
2198  }
2199  }
2200  for (int y = 0; y < height; y++) {
2201  for (int x = 0; x < width; x++) {
2202  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2203  }
2204  }
2205  \endcode
2206  *
2207  * g is computed once ahead of time, and enough is computed to
2208  * satisfy all uses of it. This does no redundant work (each point
2209  * in g is evaluated once), but has poor locality (values of g are
2210  * probably not still in cache when they are used by f), and
2211  * allocates lots of temporary memory to store g.
2212  */
2213  Func &compute_root();
2214 
2215  /** Use the halide_memoization_cache_... interface to store a
2216  * computed version of this function across invocations of the
2217  * Func.
2218  */
2219  Func &memoize();
2220 
2221  /** Produce this Func asynchronously in a separate
2222  * thread. Consumers will be run by the task system when the
2223  * production is complete. If this Func's store level is different
2224  * to its compute level, consumers will be run concurrently,
2225  * blocking as necessary to prevent reading ahead of what the
2226  * producer has computed. If storage is folded, then the producer
2227  * will additionally not be permitted to run too far ahead of the
2228  * consumer, to avoid clobbering data that has not yet been
2229  * used.
2230  *
2231  * Take special care when combining this with custom thread pool
2232  * implementations, as avoiding deadlock with producer-consumer
2233  * parallelism requires a much more sophisticated parallel runtime
2234  * than with data parallelism alone. It is strongly recommended
2235  * you just use Halide's default thread pool, which guarantees no
2236  * deadlock and a bound on the number of threads launched.
2237  */
2238  Func &async();
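 // Illustrative usage sketch (hypothetical names): compute a producer in its own thread.
 //   producer.compute_root().async();   // consumers block until the producer has finished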
2239 
2240  /** Allocate storage for this function within f's loop over
2241  * var. Scheduling storage is optional, and can be used to
2242  * separate the loop level at which storage occurs from the loop
2243  * level at which computation occurs to trade off between locality
2244  * and redundant work. This can open the door for two types of
2245  * optimization.
2246  *
2247  * Consider again the pipeline from \ref Func::compute_at :
2248  \code
2249  Func f, g;
2250  Var x, y;
2251  g(x, y) = x*y;
2252  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2253  \endcode
2254  *
2255  * If we schedule it like so:
2256  *
2257  \code
2258  g.compute_at(f, x).store_at(f, y);
2259  \endcode
2260  *
2261  * Then the computation of g takes place within the loop over x,
2262  * but the storage takes place within the loop over y:
2263  *
2264  \code
2265  int f[height][width];
2266  for (int y = 0; y < height; y++) {
2267  int g[2][width+1];
2268  for (int x = 0; x < width; x++) {
2269  g[0][x] = x*y;
2270  g[0][x+1] = (x+1)*y;
2271  g[1][x] = x*(y+1);
2272  g[1][x+1] = (x+1)*(y+1);
2273  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2274  }
2275  }
2276  \endcode
2277  *
2278  * Provided the for loop over x is serial, Halide then
2279  * automatically performs the following sliding window
2280  * optimization:
2281  *
2282  \code
2283  int f[height][width];
2284  for (int y = 0; y < height; y++) {
2285  int g[2][width+1];
2286  for (int x = 0; x < width; x++) {
2287  if (x == 0) {
2288  g[0][x] = x*y;
2289  g[1][x] = x*(y+1);
2290  }
2291  g[0][x+1] = (x+1)*y;
2292  g[1][x+1] = (x+1)*(y+1);
2293  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2294  }
2295  }
2296  \endcode
2297  *
2298  * Two of the assignments to g only need to be done when x is
2299  * zero. The rest of the time, those sites have already been
2300  * filled in by a previous iteration. This version has the
2301  * locality of compute_at(f, x), but allocates more memory and
2302  * does much less redundant work.
2303  *
2304  * Halide then further optimizes this pipeline like so:
2305  *
2306  \code
2307  int f[height][width];
2308  for (int y = 0; y < height; y++) {
2309  int g[2][2];
2310  for (int x = 0; x < width; x++) {
2311  if (x == 0) {
2312  g[0][0] = x*y;
2313  g[1][0] = x*(y+1);
2314  }
2315  g[0][(x+1)%2] = (x+1)*y;
2316  g[1][(x+1)%2] = (x+1)*(y+1);
2317  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2318  }
2319  }
2320  \endcode
2321  *
2322  * Halide has detected that it's possible to use a circular buffer
2323  * to represent g, and has reduced all accesses to g modulo 2 in
2324  * the x dimension. This optimization only triggers if the for
2325  * loop over x is serial, and if Halide can statically determine
2326  * some power of two large enough to cover the range needed. For
2327  * powers of two, the modulo operator compiles to more efficient
2328  * bit-masking. This optimization reduces memory usage, and also
2329  * improves locality by reusing recently-accessed memory instead
2330  * of pulling new memory into cache.
2331  *
2332  */
2333  Func &store_at(const Func &f, const Var &var);
2334 
2335  /** Equivalent to the version of store_at that takes a Var, but
2336  * schedules storage within the loop over a dimension of a
2337  * reduction domain */
2338  Func &store_at(const Func &f, const RVar &var);
2339 
2340  /** Equivalent to the version of store_at that takes a Var, but
2341  * schedules storage at a given LoopLevel. */
2342  Func &store_at(LoopLevel loop_level);
2343 
2344  /** Equivalent to \ref Func::store_at, but schedules storage
2345  * outside the outermost loop. */
2346  Func &store_root();
2347 
2348  /** Aggressively inline all uses of this function. This is the
2349  * default schedule, so you're unlikely to need to call this. For
2350  * a Func with an update definition, that means it gets computed
2351  * as close to the innermost loop as possible.
2352  *
2353  * Consider once more the pipeline from \ref Func::compute_at :
2354  *
2355  \code
2356  Func f, g;
2357  Var x, y;
2358  g(x, y) = x*y;
2359  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2360  \endcode
2361  *
2362  * Leaving g as inline, this compiles to code equivalent to the following C:
2363  *
2364  \code
2365  int f[height][width];
2366  for (int y = 0; y < height; y++) {
2367  for (int x = 0; x < width; x++) {
2368  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2369  }
2370  }
2371  \endcode
2372  */
2373  Func &compute_inline();
2374 
2375  /** Get a handle on an update step for the purposes of scheduling
2376  * it. */
2377  Stage update(int idx = 0);
2378 
2379  /** Set the type of memory this Func should be stored in. Controls
2380  * whether allocations go on the stack or the heap on the CPU, and
2381  * in global vs shared vs local on the GPU. See the documentation
2382  * on MemoryType for more detail. */
2383  Func &store_in(MemoryType memory_type);
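 // Illustrative usage sketch (hypothetical names): request a stack allocation for a small
 // intermediate (its footprint typically needs to be bounded for this to succeed).
 //   f.store_in(MemoryType::Stack);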
2384 
2385  /** Trace all loads from this Func by emitting calls to
2386  * halide_trace. If the Func is inlined, this has no
2387  * effect. */
2388  Func &trace_loads();
2389 
2390  /** Trace all stores to the buffer backing this Func by emitting
2391  * calls to halide_trace. If the Func is inlined, this call
2392  * has no effect. */
2393  Func &trace_stores();
2394 
2395  /** Trace all realizations of this Func by emitting calls to
2396  * halide_trace. */
2397  Func &trace_realizations();
2398 
2399  /** Add a string of arbitrary text that will be passed through to trace
2400  * inspection code if the Func is realized in trace mode. (Funcs that are
2401  * inlined won't have their tags emitted.) Ignored entirely if
2402  * tracing is not enabled for the Func (or globally).
2403  */
2404  Func &add_trace_tag(const std::string &trace_tag);
2405 
2406  /** Get a handle on the internal halide function that this Func
2407  * represents. Useful if you want to do introspection on Halide
2408  * functions */
2409  Internal::Function function() const {
2410  return func;
2411  }
2412 
2413  /** You can cast a Func to its pure stage for the purposes of
2414  * scheduling it. */
2415  operator Stage() const;
2416 
2417  /** Get a handle on the output buffer for this Func. Only relevant
2418  * if this is the output Func in a pipeline. Useful for making
2419  * static promises about strides, mins, and extents. */
2420  // @{
2421  OutputImageParam output_buffer() const;
2422  std::vector<OutputImageParam> output_buffers() const;
2423  // @}
2424 
2425  /** Use a Func as an argument to an external stage. */
2426  operator ExternFuncArgument() const;
2427 
2428  /** Infer the arguments to the Func, sorted into a canonical order:
2429  * all buffers (sorted alphabetically by name), followed by all non-buffers
2430  * (sorted alphabetically by name).
2431  This lets you write things like:
2432  \code
2433  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2434  \endcode
2435  */
2436  std::vector<Argument> infer_arguments() const;
2437 
2438  /** Get the source location of the pure definition of this
2439  * Func. See Stage::source_location() */
2440  std::string source_location() const;
2441 
2442  /** Return the current StageSchedule associated with this initial
2443  * Stage of this Func. For introspection only: to modify schedule,
2444  * use the Func interface. */
2445  const Internal::StageSchedule &get_schedule() const {
2446  return Stage(*this).get_schedule();
2447  }
2448 };
2449 
2450 namespace Internal {
2451 
2452 template<typename Last>
2453 inline void check_types(const Tuple &t, int idx) {
2454  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2455  user_assert(t[idx].type() == type_of<T>())
2456  << "Can't evaluate expression "
2457  << t[idx] << " of type " << t[idx].type()
2458  << " as a scalar of type " << type_of<T>() << "\n";
2459 }
2460 
2461 template<typename First, typename Second, typename... Rest>
2462 inline void check_types(const Tuple &t, int idx) {
2463  check_types<First>(t, idx);
2464  check_types<Second, Rest...>(t, idx + 1);
2465 }
2466 
2467 template<typename Last>
2468 inline void assign_results(Realization &r, int idx, Last last) {
2469  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2470  *last = Buffer<T>(r[idx])();
2471 }
2472 
2473 template<typename First, typename Second, typename... Rest>
2474 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&... rest) {
2475  assign_results<First>(r, idx, first);
2476  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2477 }
2478 
2479 } // namespace Internal
2480 
2481 /** JIT-Compile and run enough code to evaluate a Halide
2482  * expression. This can be thought of as a scalar version of
2483  * \ref Func::realize */
2484 template<typename T>
2485 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2486  user_assert(e.type() == type_of<T>())
2487  << "Can't evaluate expression "
2488  << e << " of type " << e.type()
2489  << " as a scalar of type " << type_of<T>() << "\n";
2490  Func f;
2491  f() = e;
2492  Buffer<T> im = f.realize();
2493  return im();
2494 }
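// Illustrative usage sketch: evaluating a scalar expression with the function above.
//   int result = evaluate<int>(3 * 5 + 1);   // result == 16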
2495 
2496 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2497 template<typename First, typename... Rest>
2498 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&... rest) {
2499  Internal::check_types<First, Rest...>(t, 0);
2500 
2501  Func f;
2502  f() = t;
2503  Realization r = f.realize();
2504  Internal::assign_results(r, 0, first, rest...);
2505 }
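// Illustrative usage sketch: evaluating a two-element Tuple into C++ scalars.
//   int a;
//   float b;
//   evaluate(Tuple(1 + 2, 3.0f * 4.0f), &a, &b);   // a == 3, b == 12.0f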
2506 
2507 namespace Internal {
2508 
2509 inline void schedule_scalar(Func f) {
2510  Target t = get_jit_target_from_environment();
2511  if (t.has_gpu_feature()) {
2512  f.gpu_single_thread();
2513  }
2514  if (t.has_feature(Target::HVX_128)) {
2515  f.hexagon();
2516  }
2517 }
2518 
2519 } // namespace Internal
2520 
2521 /** JIT-Compile and run enough code to evaluate a Halide
2522  * expression. This can be thought of as a scalar version of
2523  * \ref Func::realize. Can use GPU if jit target from environment
2524  * specifies one.
2525  */
2526 template<typename T>
2527 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2528  user_assert(e.type() == type_of<T>())
2529  << "Can't evaluate expression "
2530  << e << " of type " << e.type()
2531  << " as a scalar of type " << type_of<T>() << "\n";
2532  Func f;
2533  f() = e;
2534  Internal::schedule_scalar(f);
2535  Buffer<T> im = f.realize();
2536  return im();
2537 }
2538 
2539 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2540  * use GPU if jit target from environment specifies one. */
2541 // @{
2542 template<typename First, typename... Rest>
2543 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&... rest) {
2544  Internal::check_types<First, Rest...>(t, 0);
2545 
2546  Func f;
2547  f() = t;
2548  Internal::schedule_scalar(f);
2549  Realization r = f.realize();
2550  Internal::assign_results(r, 0, first, rest...);
2551 }
2552 // @}
2553 
2554 } // namespace Halide
2555 
2556 #endif
Halide::Stage::fuse
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Halide::Stage::gpu_blocks
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::Func::output_types
const std::vector< Type > & output_types() const
Get the types of the outputs of this Func.
Halide::Func::copy_to_device
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Halide::Stage::parallel
Stage & parallel(const VarOrRVar &var)
Halide::Func::compute_with
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
Halide::Stage::tile
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Halide::Func::operator()
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&... args) const
Definition: Func.h:1287
Halide::Internal::all_are_convertible
Definition: Util.h:210
Halide::Func::compile_jit
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Halide::VarOrRVar::name
const std::string & name() const
Definition: Func.h:48
Halide::Func::prefetch
Func & prefetch(const Func &f, const VarOrRVar &var, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
Halide::Func::rvars
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Halide::Target::has_feature
bool has_feature(Feature f) const
Halide::Func::Func
Func()
Declare a new undefined function with an automatically-generated unique name.
Halide::Func::compile_to_c
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Halide::Func::realize
Realization realize(std::vector< int32_t > sizes, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Halide::Func::name
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
Halide::Stage::reorder
Stage & reorder(const std::vector< VarOrRVar > &vars)
halide_trace_event_t
Definition: HalideRuntime.h:501
Tuple.h
Halide::Func::specialize
Stage specialize(const Expr &condition)
Specialize a Func.
Halide::Func::gpu_blocks
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
internal_assert
#define internal_assert(c)
Definition: Errors.h:19
Halide::Region
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:343
Halide::ScheduleHandle
Stage ScheduleHandle
Definition: Func.h:462
Halide::Var
A Halide variable, to be used when defining functions.
Definition: Var.h:19
Halide::Stage::name
std::string name() const
Return the name of this stage, e.g.
Halide::FuncTupleElementRef::operator/=
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Halide::Func::gpu_tile
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
uint8_t
unsigned __INT8_TYPE__ uint8_t
Definition: runtime_internal.h:25
Halide::Func::shader
Func & shader(const Var &x, const Var &y, const Var &c, DeviceAPI device_api)
Schedule for execution using coordinate-based hardware api.
Halide::FuncTupleElementRef::FuncTupleElementRef
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
Halide::DeviceAPI::Default_GPU
@ Default_GPU
Halide::Func::set_error_handler
void set_error_handler(void(*handler)(void *, const char *))
Set the error handler function that be called in the case of runtime errors during halide pipelines.
Halide::Internal::Parameter
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:28
Halide::DeviceAPI::Host
@ Host
Used to denote for loops that run on the same device as the containing code.
Halide::VarOrRVar::var
Var var
Definition: Func.h:55
Halide::Func::values
Tuple values() const
The values returned by this function.
Halide::Func::align_bounds
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinates is congruent to 'remainder' modulo 'modulus',...
Halide::Stage::prefetch
Stage & prefetch(const Func &f, const VarOrRVar &var, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Halide::Stage::serial
Stage & serial(const VarOrRVar &var)
Halide::Func::compile_to_lowered_stmt
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
Halide::Pipeline::RealizationArg
Definition: Pipeline.h:99
Halide::Stage::gpu_threads
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::Func::reorder
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Halide::VarOrRVar::rvar
RVar rvar
Definition: Func.h:56
Halide::min
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:577
Halide::Internal::StageSchedule::touched
bool & touched()
This flag is set to true if the dims list has been manipulated by the user (or if a ScheduleHandle wa...
Halide::Func::compile_to_llvm_assembly
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
Halide::StmtOutputFormat
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:61
Halide::Stage::gpu_tile
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::FuncRef::operator[]
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Halide::Func::hexagon
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Halide::Stage::reorder
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args)
Definition: Func.h:378
Halide::Internal::ForType
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:391
Halide::FuncRef::size
size_t size() const
How many outputs does the function this refers to produce.
Target.h
Halide::Stage::rename
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Halide::Func::store_at
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Halide::ImageParam
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
Halide::Func::vectorize
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Halide::Func::fold_storage
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Var.h
Halide::ParamMap
Definition: ParamMap.h:17
Halide::Func::compute_at
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
Halide::Func::set_estimates
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Halide::LoopAlignStrategy
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:77
Halide::FuncRef::function
Internal::Function function() const
What function is this calling?
Definition: Func.h:568
Halide::Stage::rfactor
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition a Func will split the update into an intermedia...
Halide::Func::compile_to_bitcode
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
Halide::Func::operator()
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&... args) const
Definition: Func.h:1304
Halide::FuncTupleElementRef::operator+=
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Halide::Func::add_custom_lowering_pass
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1121
Halide::PrefetchBoundStrategy::GuardWithIf
@ GuardWithIf
Guard the prefetch with if-guards that ignore the prefetch if any of the prefetched region ever goes...
Halide::get_target_from_environment
Target get_target_from_environment()
Return the target that Halide will use.
user_assert
#define user_assert(c)
Definition: Errors.h:15
Pipeline.h
Halide::Target::has_gpu_feature
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled?
Halide::NameMangling::Default
@ Default
Match whatever is specified in the Target.
Halide::Var::outermost
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
Halide::Func::fuse
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
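A minimal sketch (illustrative names): fusing two loops into one before parallelizing gives the runtime a single, larger pool of tasks.

    #include "Halide.h"
    using namespace Halide;

    void schedule_fused() {
        Func f("f");
        Var x("x"), y("y"), xy("xy");
        f(x, y) = x + y;
        // Collapse the loops over x (inner) and y (outer) into one loop over xy.
        f.fuse(x, y, xy).parallel(xy);
    }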
Halide::Func::clear_custom_lowering_passes
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
Halide::Text
@ Text
Definition: Pipeline.h:62
Halide::Func::operator()
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
Halide::Target::HVX_128
@ HVX_128
Definition: Target.h:97
Halide::Internal::Definition
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
Halide::FuncRef::FuncRef
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Halide::FuncTupleElementRef::operator-=
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
Halide::Func::add_trace_tag
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed through to trace inspection code if the Func is reali...
Halide::TailStrategy
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
Halide::FuncRef::operator+=
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
malloc
void * malloc(size_t)
Halide::Module
A halide module.
Definition: Module.h:136
Halide::Type
Types in the halide type system.
Definition: Type.h:269
Halide::Func::define_extern
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1226
Halide::Pipeline
A class representing a Halide pipeline.
Definition: Pipeline.h:97
Halide::Func::set_custom_trace
void set_custom_trace(int(*trace_fn)(void *, const halide_trace_event_t *))
Set custom routines to call when tracing is enabled.
Halide
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Definition: AddAtomicMutex.h:21
Halide::Func::set_custom_allocator
void set_custom_allocator(void *(*malloc)(void *, size_t), void(*free)(void *, void *))
Set a custom malloc and free for halide to use.
Halide::Func::reorder
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args)
Definition: Func.h:1603
Halide::Func::compile_to_multitarget_static_library
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Halide::Func::value
Expr value() const
The right-hand-side value of the pure definition of this function.
Halide::ExternFuncArgument
An argument to an extern-defined Func.
Definition: ExternFuncArgument.h:17
Halide::Func::copy_to_host
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Halide::LinkageType::Internal
@ Internal
Not visible externally, similar to 'static' linkage in C.
Halide::Func::source_location
std::string source_location() const
Get the source location of the pure definition of this Func.
Halide::Func::outputs
int outputs() const
Get the number of outputs of this Func.
Halide::Func::set_estimate
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Halide::Func::update
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
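A minimal sketch (illustrative names): scheduling calls on the Func itself apply only to the pure definition, so each update stage is scheduled through update(idx).

    #include "Halide.h"
    using namespace Halide;

    void schedule_both_stages() {
        Func f("f");
        Var x("x");
        RDom r(0, 100);
        f(x) = 0;        // pure (initial) definition
        f(x) += r;       // first update definition
        f.vectorize(x, 8);             // schedules the pure definition
        f.update(0).vectorize(x, 8);   // the update stage needs its own schedule
    }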
Argument.h
Halide::Func::infer_arguments
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
Halide::Func::serial
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
Halide::Internal::make_argument_list
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
Halide::Func::define_extern
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1216
Halide::FuncRef::operator=
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Halide::Func::bound_extent
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
Halide::LoopAlignStrategy::Auto
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Halide::Func::compile_to_static_library
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Halide::Stage::dump_argument_list
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
JITModule.h
Halide::Buffer
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Argument.h:16
Halide::Internal::Definition::args
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
Halide::Stage::Stage
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:93
Halide::Func::gpu_lanes
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
Halide::Func::infer_input_bounds
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Halide::Stage::unroll
Stage & unroll(const VarOrRVar &var)
Halide::Func::extern_function_name
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Halide::Internal::JITHandlers
Definition: JITModule.h:139
Halide::VarOrRVar::VarOrRVar
VarOrRVar(const RDom &r)
Definition: Func.h:40
HALIDE_ATTRIBUTE_DEPRECATED
#define HALIDE_ATTRIBUTE_DEPRECATED(x)
Definition: HalideRuntime.h:1544
Halide::Func::compile_to_module
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self-contained Module suitable for further comp...
Halide::VarOrRVar::is_rvar
bool is_rvar
Definition: Func.h:57
Halide::Func::compile_to_multitarget_object_files
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Halide::Func::update_value
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
Halide::Internal::Definition::defined
bool defined() const
Definition objects are nullable.
Halide::Func::async
Func & async()
Produce this Func asynchronously in a separate thread.
Halide::Func::compile_to_assembly
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Halide::Func::compile_to
void compile_to(const std::map< Output, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
Halide::Func::store_root
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Halide::Expr::type
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:320
Halide::Func::compile_to_file
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
Halide::OutputImageParam
A handle on the output buffer of a pipeline.
Definition: OutputImageParam.h:19
Halide::Func::parallel
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Halide::Func::is_extern
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
Halide::FuncTupleElementRef::operator=
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Halide::Func::tile
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
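A minimal CPU tiling sketch (illustrative names and factors): tile, then vectorize the innermost dimension and parallelize over tile rows.

    #include "Halide.h"
    using namespace Halide;

    void schedule_tiled() {
        Func f("f");
        Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");
        f(x, y) = x + y;
        // Outer loops over 64x64 tiles (yo, xo); inner loops within a tile (yi, xi).
        f.tile(x, y, xo, yo, xi, yi, 64, 64)
         .vectorize(xi, 8)
         .parallel(yo);
    }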
Halide::Stage::hexagon
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Halide::Func::clone_in
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
Halide::Stage::vectorize
Stage & vectorize(const VarOrRVar &var)
Halide::Func::output_buffers
std::vector< OutputImageParam > output_buffers() const
Halide::Stage::specialize
Stage specialize(const Expr &condition)
Halide::Func::debug_to_file
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Halide::Func::Func
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:713
Halide::TailStrategy::Auto
@ Auto
For pure definitions use ShiftInwards.
Halide::Func::compile_to_object
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Halide::FuncRef::operator-=
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Halide::NameMangling
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:24
Halide::Func::estimate
Func & estimate(const Var &var, const Expr &min, const Expr &extent)
Definition: Func.h:1533
Expr.h
Halide::Func::store_in
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
Halide::Stage::gpu
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::Func::set_custom_do_par_for
void set_custom_do_par_for(int(*custom_do_par_for)(void *, int(*)(void *, int, uint8_t *), int, int, uint8_t *))
Set a custom parallel for loop launcher.
Halide::Func
A halide function.
Definition: Func.h:667
Halide::Func::custom_lowering_passes
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
Halide::Target::HVX_64
@ HVX_64
Definition: Target.h:96
Halide::Func::compute_root
Func & compute_root()
Compute all of this function once ahead of time.
Halide::Internal::IRMutator
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:24
HALIDE_NO_USER_CODE_INLINE
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:44
Halide::Stage::gpu_single_thread
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::Func::unroll
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Halide::Func::bound
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
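A small sketch (illustrative names): bounding a dimension to a constant range makes schedules that need a known extent, such as a full unroll, legal.

    #include "Halide.h"
    using namespace Halide;

    void bound_channels() {
        Func f("f");
        Var x("x"), c("c");
        f(x, c) = x + c;
        // Promise that c is only ever realized with min 0 and extent 3,
        // so the loop over c has a constant trip count and can be unrolled.
        f.bound(c, 0, 3);
        f.unroll(c);
    }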
Halide::Func::trace_stores
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Halide::Internal::Function
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
Halide::Func::compute_inline
Func & compute_inline()
Aggressively inline all uses of this function.
Halide::Internal::assign_results
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2468
Halide::VarOrRVar::VarOrRVar
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
Halide::Func::defined
bool defined() const
Does this function have at least a pure definition?
Halide::Func::compile_to_header
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Halide::Func::dimensions
int dimensions() const
The dimensionality (number of arguments) of this function.
Halide::evaluate_may_gpu
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2527
Halide::VarOrRVar::VarOrRVar
VarOrRVar(const RVar &r)
Definition: Func.h:37
Halide::Func::set_custom_do_task
void set_custom_do_task(int(*custom_do_task)(void *, int(*)(void *, int, uint8_t *), int, uint8_t *))
Set a custom task handler to be called by the parallel for loop.
Halide::LoopLevel
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:143
Halide::FuncTupleElementRef::operator*=
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Halide::Func::rename
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Halide::RVar
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
Halide::RDom
A multi-dimensional domain over which to iterate.
Definition: RDom.h:191
Halide::Func::memoize
Func & memoize()
Use the halide_memoization_cache_...
Halide::Func::reorder_storage
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&... args)
Definition: Func.h:2039
Halide::Func::trace_realizations
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
Halide::Func::get_schedule
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2445
Halide::Func::output_buffer
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Halide::RVar::name
const std::string & name() const
The name of this reduction variable.
Halide::Func::gpu_threads
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
free
void free(void *)
Halide::Func::gpu_single_thread
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
Halide::Func::has_update_definition
bool has_update_definition() const
Does this function have at least one update definition?
Halide::get_jit_target_from_environment
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
Halide::Func::print_loop_nest
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Halide::Func::args
std::vector< Var > args() const
Get the pure arguments.
Halide::Func::atomic
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
Halide::Func::align_storage
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Halide::FuncRef::operator/=
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Halide::VarOrRVar::VarOrRVar
VarOrRVar(const Var &v)
Definition: Func.h:34
Halide::Runtime::Internal::custom_do_task
WEAK halide_do_task_t custom_do_task
Definition: thread_pool_common.h:548
Halide::PrefetchBoundStrategy
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
Definition: PrefetchDirective.h:16
Halide::Internal::Definition::schedule
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
Halide::Func::allow_race_conditions
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
Halide::Func::specialize_fail
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Halide::Stage::allow_race_conditions
Stage & allow_race_conditions()
Halide::Stage::source_location
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Halide::evaluate
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2485
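A minimal sketch: the template argument must match the type of the expression being evaluated (Float(32) in this assumed example).

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    int main() {
        // JIT-compiles and runs just enough code to compute one scalar value.
        float r = evaluate<float>(sqrt(Expr(2.0f)));
        printf("sqrt(2) = %f\n", r);
        return 0;
    }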
Halide::Expr
A fragment of Halide syntax.
Definition: Expr.h:256
Halide::Func::define_extern
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1244
Halide::ImplicitVar
Definition: Var.h:169
Halide::Realization
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:21
Halide::VarOrRVar
A class that can represent Vars or RVars.
Definition: Func.h:30
Halide::FuncTupleElementRef::index
int index() const
Return index to the function outputs.
Definition: Func.h:654
Halide::FuncTupleElementRef
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:590
Halide::MemoryType
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:346
Module.h
Halide::Stage::specialize_fail
void specialize_fail(const std::string &message)
RDom.h
Halide::Func::in
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
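A small sketch (illustrative names): the global wrapper interposes on every call to f, so the staged copy of f's values can be scheduled without touching f itself or its callers.

    #include "Halide.h"
    using namespace Halide;

    void wrap_func() {
        Func f("f"), g("g"), h("h");
        Var x("x");
        f(x) = x;
        g(x) = f(x) + 1;
        h(x) = f(x) + 2;
        // Both g and h now read from the wrapper, which is computed once.
        f.in().compute_root();
    }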
Halide::Func::define_extern
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1234
Halide::max
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:580
Halide::Internal::StageSchedule
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:604
Param.h
Halide::Func::update_values
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values.
Halide::Stage
A single definition of a Func.
Definition: Func.h:69
Halide::Internal::check_types
void check_types(const Tuple &t, int idx)
Definition: Func.h:2453
Halide::Tuple
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
Halide::FuncRef
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:471
Halide::Func::jit_handlers
const Internal::JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
Halide::Runtime::Internal::custom_do_par_for
WEAK halide_do_par_for_t custom_do_par_for
Definition: thread_pool_common.h:550
Halide::Func::trace_loads
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
Halide::Func::num_update_definitions
int num_update_definitions() const
How many update definitions does this function have?
Halide::Target
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
Halide::Stage::gpu_lanes
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Halide::Func::prefetch
Func & prefetch(const T &image, VarOrRVar var, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2013
Halide::Func::glsl
Func & glsl(const Var &x, const Var &y, const Var &c)
Schedule for execution as GLSL kernel.
Halide::Func::update_args
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Halide::Func::set_custom_print
void set_custom_print(void(*handler)(void *, const char *))
Set the function called to print messages from the runtime.
Halide::FuncRef::operator*=
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Halide::Stage::atomic
Stage & atomic(bool override_associativity_test=false)
Halide::DeviceAPI
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
Halide::Internal::schedule_scalar
void schedule_scalar(Func f)
Definition: Func.h:2509
Halide::Var::name
const std::string & name() const
Get the name of a Var.
Halide::ParamMap::empty_map
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:108
Halide::Func::split
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
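A minimal sketch (illustrative names and factor): TailStrategy::GuardWithIf keeps the split correct when the factor does not divide the extent, at the cost of a guard inside the loop.

    #include "Halide.h"
    using namespace Halide;

    void schedule_split() {
        Func f("f");
        Var x("x"), xo("xo"), xi("xi");
        f(x) = x;
        // Split x into an outer loop xo and an inner loop xi of extent 4.
        f.split(x, xo, xi, 4, TailStrategy::GuardWithIf);
    }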
Halide::Stage::prefetch
Stage & prefetch(const T &image, VarOrRVar var, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:448
Halide::Func::gpu
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Halide::Stage::get_schedule
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:107
Halide::VarOrRVar::VarOrRVar
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31