Halide
Func.h
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "IR.h"
10 #include "Var.h"
11 #include "Function.h"
12 #include "Param.h"
13 #include "Argument.h"
14 #include "RDom.h"
15 #include "JITModule.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Module.h"
19 #include "Pipeline.h"
20 
21 #include <map>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 
27 /** A class that can represent Vars or RVars. Used for reorder calls
28  * which can accept a mix of either. */
29 struct VarOrRVar {
30  VarOrRVar(const std::string &n, bool r) : var(n), rvar(n), is_rvar(r) {}
31  VarOrRVar(const Var &v) : var(v), is_rvar(false) {}
32  VarOrRVar(const RVar &r) : rvar(r), is_rvar(true) {}
33  VarOrRVar(const RDom &r) : rvar(RVar(r)), is_rvar(true) {}
34 
35  const std::string &name() const {
36  if (is_rvar) return rvar.name();
37  else return var.name();
38  }
39 
40  Var var;
41  RVar rvar;
42  bool is_rvar;
43 };
44 
45 class ImageParam;
46 
47 namespace Internal {
48 struct Split;
49 struct StorageDim;
50 }
51 
52 /** A single definition of a Func. May be a pure or update definition. */
53 class Stage {
54  Internal::Definition definition;
55  std::string stage_name;
56  /** Pure Vars of the Function (from the init definition). */
57  std::vector<Var> dim_vars;
58  /** This is just a reference to the FuncSchedule owned by the Function
59  * associated with this Stage. */
60  Internal::FuncSchedule func_schedule;
61 
62  void set_dim_type(VarOrRVar var, Internal::ForType t);
63  void set_dim_device_api(VarOrRVar var, DeviceAPI device_api);
64  void split(const std::string &old, const std::string &outer, const std::string &inner,
65  Expr factor, bool exact, TailStrategy tail);
66  void remove(const std::string &var);
67  Stage &purify(VarOrRVar old_name, VarOrRVar new_name);
68 
69 public:
70  Stage(Internal::Definition d, const std::string &n, const std::vector<Var> &args,
71  const Internal::FuncSchedule &func_s)
72  : definition(d), stage_name(n), dim_vars(args), func_schedule(func_s) {
73  internal_assert(definition.args().size() == dim_vars.size());
74  definition.schedule().touched() = true;
75  }
76 
77  Stage(Internal::Definition d, const std::string &n, const std::vector<std::string> &args,
78  const Internal::FuncSchedule &func_s)
79  : definition(d), stage_name(n), func_schedule(func_s) {
80  definition.schedule().touched() = true;
81 
82  std::vector<Var> dim_vars(args.size());
83  for (size_t i = 0; i < args.size(); i++) {
84  dim_vars[i] = Var(args[i]);
85  }
86  internal_assert(definition.args().size() == dim_vars.size());
87  }
88 
89  /** Return the current StageSchedule associated with this Stage. For
90  * introspection only: to modify schedule, use the Func interface. */
91  const Internal::StageSchedule &get_schedule() const { return definition.schedule(); }
92 
93  /** Return a string describing the current var list taking into
94  * account all the splits, reorders, and tiles. */
95  EXPORT std::string dump_argument_list() const;
96 
97  /** Return the name of this stage, e.g. "f.update(2)" */
98  EXPORT const std::string &name() const;
99 
100  /** Calling rfactor() on an associative update definition of a Func splits
101  * the update into an intermediate which computes the partial results and
102  * replaces the current update definition with a new definition which merges
103  * the partial results. If called on an init/pure definition, this will
104  * throw an error. rfactor() will automatically infer the associative reduction
105  * operator and the identity of that operator. If it can't prove the operation
106  * is associative, or if it cannot find an identity for the operator, this
107  * will throw an error. In addition, the operator must be commutative if
108  * rfactor() is called on the inner dimension but not on the outer
109  * dimensions.
110  *
111  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
112  * The rvars not listed in 'preserved' are removed from the original Func and
113  * are lifted to the intermediate Func. The remaining rvars (the ones in
114  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
115  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
116  * applied to the original Func's update definition. The loop order of the
117  * intermediate Func's update definition is the same as the original, although
118  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
119  * intermediate Func's init definition from innermost to outermost is the args'
120  * order of the original Func's init definition followed by the new pure Vars.
121  *
122  * The intermediate Func also inherits storage order from the original Func
123  * with the new pure Vars added to the outermost.
124  *
125  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
126  \code
127  f(x, y) = 0;
128  f(x, y) += g(r.x, r.y);
129  \endcode
130  * into a pipeline like this:
131  \code
132  f_intm(x, y, u) = 0;
133  f_intm(x, y, u) += g(r.x, u);
134 
135  f(x, y) = 0;
136  f(x, y) += f_intm(x, y, r.y);
137  \endcode
138  *
139  * This has a variety of uses. You can use it to split computation of an associative reduction:
140  \code
141  f(x, y) = 10;
142  RDom r(0, 96);
143  f(x, y) = max(f(x, y), g(x, y, r.x));
144  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
145  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
146  \endcode
147  *
148  * which is equivalent to:
149  \code
150  parallel for u = 0 to 11:
151    for y:
152      for x:
153        f_intm(x, y, u) = -inf
154  parallel for x:
155    for y:
156      parallel for u = 0 to 11:
157        for rxi = 0 to 7:
158          f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
159  for y:
160    for x:
161      f(x, y) = 10
162  parallel for x:
163    for y:
164      for rxo = 0 to 11:
165        f(x, y) = max(f(x, y), f_intm(x, y, rxo))
166  \endcode
167  *
168  */
169  // @{
170  EXPORT Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
171  EXPORT Func rfactor(RVar r, Var v);
172  // @}
173 
174  /** Scheduling calls that control how the domain of this stage is
175  * traversed. See the documentation for Func for the meanings. */
176  // @{
177 
178  EXPORT Stage &split(VarOrRVar old, VarOrRVar outer, VarOrRVar inner, Expr factor, TailStrategy tail = TailStrategy::Auto);
179  EXPORT Stage &fuse(VarOrRVar inner, VarOrRVar outer, VarOrRVar fused);
180  EXPORT Stage &serial(VarOrRVar var);
181  EXPORT Stage &parallel(VarOrRVar var);
182  EXPORT Stage &vectorize(VarOrRVar var);
183  EXPORT Stage &unroll(VarOrRVar var);
184  EXPORT Stage &parallel(VarOrRVar var, Expr task_size, TailStrategy tail = TailStrategy::Auto);
185  EXPORT Stage &vectorize(VarOrRVar var, Expr factor, TailStrategy tail = TailStrategy::Auto);
186  EXPORT Stage &unroll(VarOrRVar var, Expr factor, TailStrategy tail = TailStrategy::Auto);
187  EXPORT Stage &tile(VarOrRVar x, VarOrRVar y,
188                     VarOrRVar xo, VarOrRVar yo,
189                     VarOrRVar xi, VarOrRVar yi,
190                     Expr xfactor, Expr yfactor,
191                     TailStrategy tail = TailStrategy::Auto);
192  EXPORT Stage &tile(VarOrRVar x, VarOrRVar y,
193                     VarOrRVar xi, VarOrRVar yi,
194                     Expr xfactor, Expr yfactor,
195                     TailStrategy tail = TailStrategy::Auto);
196  EXPORT Stage &reorder(const std::vector<VarOrRVar> &vars);
197 
198  template <typename... Args>
199  NO_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
200  reorder(VarOrRVar x, VarOrRVar y, Args&&... args) {
201  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
202  return reorder(collected_args);
203  }
204 
205  EXPORT Stage &rename(VarOrRVar old_name, VarOrRVar new_name);
206  EXPORT Stage specialize(Expr condition);
207  EXPORT void specialize_fail(const std::string &message);
208 
209  EXPORT Stage &gpu_threads(VarOrRVar thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
210  EXPORT Stage &gpu_threads(VarOrRVar thread_x, VarOrRVar thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
211  EXPORT Stage &gpu_threads(VarOrRVar thread_x, VarOrRVar thread_y, VarOrRVar thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
212  EXPORT Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
213 
214  EXPORT Stage &gpu_blocks(VarOrRVar block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
215  EXPORT Stage &gpu_blocks(VarOrRVar block_x, VarOrRVar block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
216  EXPORT Stage &gpu_blocks(VarOrRVar block_x, VarOrRVar block_y, VarOrRVar block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
217 
218  EXPORT Stage &gpu(VarOrRVar block_x, VarOrRVar thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
219  EXPORT Stage &gpu(VarOrRVar block_x, VarOrRVar block_y,
220  VarOrRVar thread_x, VarOrRVar thread_y,
221  DeviceAPI device_api = DeviceAPI::Default_GPU);
222  EXPORT Stage &gpu(VarOrRVar block_x, VarOrRVar block_y, VarOrRVar block_z,
223  VarOrRVar thread_x, VarOrRVar thread_y, VarOrRVar thread_z,
224  DeviceAPI device_api = DeviceAPI::Default_GPU);
225 
226  // TODO(psuriana): For now we need to expand "tx" into Var and RVar versions
227  // due to conflict with the deprecated interfaces since Var can be implicitly
228  // converted into either VarOrRVar or Expr. Merge this later once we remove
229  // the deprecated interfaces.
230  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar bx, Var tx, Expr x_size,
231                         TailStrategy tail = TailStrategy::Auto,
232                         DeviceAPI device_api = DeviceAPI::Default_GPU);
233  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar bx, RVar tx, Expr x_size,
234                         TailStrategy tail = TailStrategy::Auto,
235                         DeviceAPI device_api = DeviceAPI::Default_GPU);
236 
237  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar tx, Expr x_size,
238                         TailStrategy tail = TailStrategy::Auto,
239                         DeviceAPI device_api = DeviceAPI::Default_GPU);
240  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y,
241                         VarOrRVar bx, VarOrRVar by,
242                         VarOrRVar tx, VarOrRVar ty,
243                         Expr x_size, Expr y_size,
244                         TailStrategy tail = TailStrategy::Auto,
245                         DeviceAPI device_api = DeviceAPI::Default_GPU);
246 
247  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y,
248                         VarOrRVar tx, Var ty,
249                         Expr x_size, Expr y_size,
250                         TailStrategy tail = TailStrategy::Auto,
251                         DeviceAPI device_api = DeviceAPI::Default_GPU);
252  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y,
253                         VarOrRVar tx, RVar ty,
254                         Expr x_size, Expr y_size,
255                         TailStrategy tail = TailStrategy::Auto,
256                         DeviceAPI device_api = DeviceAPI::Default_GPU);
257 
258  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
259                         VarOrRVar bx, VarOrRVar by, VarOrRVar bz,
260                         VarOrRVar tx, VarOrRVar ty, VarOrRVar tz,
261                         Expr x_size, Expr y_size, Expr z_size,
262                         TailStrategy tail = TailStrategy::Auto,
263                         DeviceAPI device_api = DeviceAPI::Default_GPU);
264  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
265                         VarOrRVar tx, VarOrRVar ty, VarOrRVar tz,
266                         Expr x_size, Expr y_size, Expr z_size,
267                         TailStrategy tail = TailStrategy::Auto,
268                         DeviceAPI device_api = DeviceAPI::Default_GPU);
269 
270  // If we mark these as deprecated, some build environments will complain
271  // about the internal-only calls. Since these are rarely used outside
272  // Func itself, we'll just comment them as deprecated for now.
273  // HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
274  EXPORT Stage &gpu_tile(VarOrRVar x, Expr x_size,
275                         TailStrategy tail = TailStrategy::Auto,
276                         DeviceAPI device_api = DeviceAPI::Default_GPU);
277  // HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
278  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y,
279                         Expr x_size, Expr y_size,
280                         TailStrategy tail = TailStrategy::Auto,
281                         DeviceAPI device_api = DeviceAPI::Default_GPU);
282  // HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
283  EXPORT Stage &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
284                         Expr x_size, Expr y_size, Expr z_size,
285                         TailStrategy tail = TailStrategy::Auto,
286                         DeviceAPI device_api = DeviceAPI::Default_GPU);
287 
288  EXPORT Stage &allow_race_conditions();
289 
290  EXPORT Stage &hexagon(VarOrRVar x = Var::outermost());
291  EXPORT Stage &prefetch(const Func &f, VarOrRVar var, Expr offset = 1,
292                         PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
293  EXPORT Stage &prefetch(const Internal::Parameter &param, VarOrRVar var, Expr offset = 1,
294                         PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
295  template<typename T>
296  Stage &prefetch(const T &image, VarOrRVar var, Expr offset = 1,
297                  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
298  return prefetch(image.parameter(), var, offset, strategy);
299  }
300  // @}
301 };
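// Illustrative usage (editorial sketch, not part of the original Func.h): a Stage is
// what Func::update() returns, so update definitions are scheduled through it. The
// names f, g, x, y and r below are assumptions for the example.
//
//     Func f("f"), g("g"); Var x("x"), y("y"); RDom r(0, 100, "r");
//     g(x, y) = x * y;
//     f(x, y) = 0;
//     f(x, y) += g(x, r.x);
//     f.update(0)                 // the Stage for the first update definition
//      .reorder(x, r.x, y)        // Vars and RVars mix freely via VarOrRVar
//      .vectorize(x, 8)           // pure vars of an update can be vectorized...
//      .parallel(y);              // ...and parallelized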
302 
303 // For backwards compatibility, keep the ScheduleHandle name.
304 typedef Stage ScheduleHandle;
305 
306 
307 class FuncTupleElementRef;
308 
309 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
310  * z are Vars or Exprs. It could be the left-hand side of a definition or
311  * an update definition, or it could be a call to a function. We don't know
312  * until we see how this object gets used.
313  */
314 class FuncRef {
315  Internal::Function func;
316  int implicit_placeholder_pos;
317  int implicit_count;
318  std::vector<Expr> args;
319  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
320 
321  /** Helper for function update by Tuple. If the function does not
322  * already have a pure definition, init_val will be used as RHS of
323  * each tuple element in the initial function definition. */
324  template <typename BinaryOp>
325  Stage func_ref_update(const Tuple &e, int init_val);
326 
327  /** Helper for function update by Expr. If the function does not
328  * already have a pure definition, init_val will be used as RHS in
329  * the initial function definition. */
330  template <typename BinaryOp>
331  Stage func_ref_update(Expr e, int init_val);
332 
333 public:
334  FuncRef(Internal::Function, const std::vector<Expr> &,
335  int placeholder_pos = -1, int count = 0);
336  FuncRef(Internal::Function, const std::vector<Var> &,
337  int placeholder_pos = -1, int count = 0);
338 
339  /** Use this as the left-hand-side of a definition or an update definition
340  * (see \ref RDom).
341  */
342  EXPORT Stage operator=(Expr);
343 
344  /** Use this as the left-hand-side of a definition or an update definition
345  * for a Func with multiple outputs. */
346  EXPORT Stage operator=(const Tuple &);
347 
348  /** Define a stage that adds the given expression to this Func. If the
349  * expression refers to some RDom, this performs a sum reduction of the
350  * expression over the domain. If the function does not already have a
351  * pure definition, this sets it to zero.
352  */
353  // @{
354  EXPORT Stage operator+=(Expr);
355  EXPORT Stage operator+=(const Tuple &);
356  EXPORT Stage operator+=(const FuncRef &);
357  // @}
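// Illustrative usage (editorial sketch, not part of the original Func.h): operator+=
// as a sum reduction over an RDom, here building a histogram. The names hist, im,
// i and r are assumptions for the example.
//
//     Buffer<uint8_t> im(640, 480);
//     Func hist("hist"); Var i("i"); RDom r(im);
//     hist(i) = 0;                                 // pure definition (would default to 0 anyway)
//     hist(cast<int>(im(r.x, r.y))) += 1;          // sum-reduce 1 over the whole domain of im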
358 
359  /** Define a stage that adds the negative of the given expression to this
360  * Func. If the expression refers to some RDom, this performs a sum reduction
361  * of the negative of the expression over the domain. If the function does
362  * not already have a pure definition, this sets it to zero.
363  */
364  // @{
365  EXPORT Stage operator-=(Expr);
366  EXPORT Stage operator-=(const Tuple &);
367  EXPORT Stage operator-=(const FuncRef &);
368  // @}
369 
370  /** Define a stage that multiplies this Func by the given expression. If the
371  * expression refers to some RDom, this performs a product reduction of the
372  * expression over the domain. If the function does not already have a pure
373  * definition, this sets it to 1.
374  */
375  // @{
376  EXPORT Stage operator*=(Expr);
377  EXPORT Stage operator*=(const Tuple &);
378  EXPORT Stage operator*=(const FuncRef &);
379  // @}
380 
381  /** Define a stage that divides this Func by the given expression.
382  * If the expression refers to some RDom, this performs a product
383  * reduction of the inverse of the expression over the domain. If the
384  * function does not already have a pure definition, this sets it to 1.
385  */
386  // @{
387  EXPORT Stage operator/=(Expr);
388  EXPORT Stage operator/=(const Tuple &);
389  EXPORT Stage operator/=(const FuncRef &);
390  // @}
391 
392  /* Override the usual assignment operator, so that
393  * f(x, y) = g(x, y) defines f.
394  */
395  EXPORT Stage operator=(const FuncRef &);
396 
397  /** Use this as a call to the function, and not the left-hand-side
398  * of a definition. Only works for single-output Funcs. */
399  EXPORT operator Expr() const;
400 
401  /** When a FuncRef refers to a function that provides multiple
402  * outputs, you can access each output as an Expr using
403  * operator[].
404  */
405  EXPORT FuncTupleElementRef operator[](int) const;
406 
407  /** How many outputs does the function this refers to produce. */
408  EXPORT size_t size() const;
409 
410  /** What function is this calling? */
411  EXPORT Internal::Function function() const {return func;}
412 };
413 
414 /** Explicit overloads of min and max for FuncRef. These exist to
415  * disambiguate calls to min on FuncRefs when a user has pulled both
416  * Halide::min and std::min into their namespace. */
417 // @{
418 inline Expr min(FuncRef a, FuncRef b) {return min(Expr(std::move(a)), Expr(std::move(b)));}
419 inline Expr max(FuncRef a, FuncRef b) {return max(Expr(std::move(a)), Expr(std::move(b)));}
420 // @}
421 
422 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
423  * z are Vars or Exprs. It could be the left-hand side of an update
424  * definition, or it could be a call to a function. We don't know
425  * until we see how this object gets used.
426  */
427 class FuncTupleElementRef {
428  FuncRef func_ref;
429  std::vector<Expr> args; // args to the function
430  int idx; // Index to function outputs
431 
432  /** Helper function that generates a Tuple where element at 'idx' is set
433  * to 'e' and the rest are undef. */
434  Tuple values_with_undefs(Expr e) const;
435 
436 public:
437  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr>& args, int idx);
438 
439  /** Use this as the left-hand-side of an update definition of Tuple
440  * component 'idx' of a Func (see \ref RDom). The function must
441  * already have an initial definition.
442  */
443  EXPORT Stage operator=(Expr e);
444 
445 
446  /** Define a stage that adds the given expression to Tuple component 'idx'
447  * of this Func. The other Tuple components are unchanged. If the expression
448  * refers to some RDom, this performs a sum reduction of the expression over
449  * the domain. The function must already have an initial definition.
450  */
451  EXPORT Stage operator+=(Expr e);
452 
453  /** Define a stage that adds the negative of the given expression to Tuple
454  * component 'idx' of this Func. The other Tuple components are unchanged.
455  * If the expression refers to some RDom, this performs a sum reduction of
456  * the negative of the expression over the domain. The function must already
457  * have an initial definition.
458  */
459  EXPORT Stage operator-=(Expr e);
460 
461  /** Define a stage that multiplies Tuple component 'idx' of this Func by
462  * the given expression. The other Tuple components are unchanged. If the
463  * expression refers to some RDom, this performs a product reduction of
464  * the expression over the domain. The function must already have an
465  * initial definition.
466  */
467  EXPORT Stage operator*=(Expr e);
468 
469  /** Define a stage that divides Tuple component 'idx' of this Func by
470  * the given expression. The other Tuple components are unchanged.
471  * If the expression refers to some RDom, this performs a product
472  * reduction of the inverse of the expression over the domain. The function
473  * must already have an initial definition.
474  */
475  EXPORT Stage operator/=(Expr e);
476 
477  /* Override the usual assignment operator, so that
478  * f(x, y)[index] = g(x, y) defines f.
479  */
480  EXPORT Stage operator=(const FuncRef &e);
481 
482  /** Use this as a call to Tuple component 'idx' of a Func, and not the
483  * left-hand-side of a definition. */
484  EXPORT operator Expr() const;
485 
486  /** What function is this calling? */
487  EXPORT Internal::Function function() const {return func_ref.function();}
488 
489  /** Return index to the function outputs. */
490  EXPORT int index() const {return idx;}
491 };
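// Illustrative usage (editorial sketch, not part of the original Func.h): updating a
// single Tuple component through operator[]. The names f, x and r are assumptions.
//
//     Func f("f"); Var x("x"); RDom r(0, 10, "r");
//     f(x) = Tuple(0, 255);        // two outputs; both components get pure definitions
//     f(x)[0] += r.x;              // sum-reduce into component 0; component 1 is unchanged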
492 
493 namespace Internal {
494 struct ErrorBuffer;
495 class IRMutator2;
496 }
497 
498 /** A halide function. This class represents one stage in a Halide
499  * pipeline, and is the unit by which we schedule things. By default
500  * they are aggressively inlined, so you are encouraged to make lots
501  * of little functions, rather than storing things in Exprs. */
502 class Func {
503 
504  /** A handle on the internal halide function that this
505  * represents */
506  Internal::Function func;
507 
508  /** When you make a reference to this function with fewer
509  * arguments than it has dimensions, the argument list is bulked
510  * up with 'implicit' vars with canonical names. This lets you
511  * pass around partially applied Halide functions. */
512  // @{
513  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
514  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
515  // @}
516 
517  /** The imaging pipeline that outputs this Func alone. */
518  Pipeline pipeline_;
519 
520  /** Get the imaging pipeline that outputs this Func alone,
521  * creating it (and freezing the Func) if necessary. */
522  Pipeline pipeline();
523 
524  // Helper function for recursive reordering support
525  EXPORT Func &reorder_storage(const std::vector<Var> &dims, size_t start);
526 
527  EXPORT void invalidate_cache();
528 
529 public:
530 
531  /** Declare a new undefined function with the given name */
532  EXPORT explicit Func(const std::string &name);
533 
534  /** Declare a new undefined function with an
535  * automatically-generated unique name */
536  EXPORT Func();
537 
538  /** Declare a new function with an automatically-generated unique
539  * name, and define it to return the given expression (which may
540  * not contain free variables). */
541  EXPORT explicit Func(Expr e);
542 
543  /** Construct a new Func to wrap an existing, already-defined
544  * Function object. */
545  EXPORT explicit Func(Internal::Function f);
546 
547  /** Construct a new Func to wrap a Buffer. */
548  template<typename T>
549  NO_INLINE explicit Func(Buffer<T> &im) : Func() {
550  (*this)(_) = im(_);
551  }
552 
553  /** Evaluate this function over some rectangular domain and return
554  * the resulting buffer or buffers. Performs compilation if the
555  * Func has not previously been realized and jit_compile has not
556  * been called. If the final stage of the pipeline is on the GPU,
557  * data is copied back to the host before being returned. The
558  * returned Realization should probably be instantly converted to
559  * a Buffer class of the appropriate type. That is, do this:
560  *
561  \code
562  f(x) = sin(x);
563  Buffer<float> im = f.realize(...);
564  \endcode
565  *
566  * If your Func has multiple values, because you defined it using
567  * a Tuple, then casting the result of a realize call to a buffer
568  * or image will produce a run-time error. Instead you should do the
569  * following:
570  *
571  \code
572  f(x) = Tuple(x, sin(x));
573  Realization r = f.realize(...);
574  Buffer<int> im0 = r[0];
575  Buffer<float> im1 = r[1];
576  \endcode
577  *
578  */
579  // @{
580  EXPORT Realization realize(std::vector<int32_t> sizes, const Target &target = Target());
581  EXPORT Realization realize(int x_size, int y_size, int z_size, int w_size,
582  const Target &target = Target());
583  EXPORT Realization realize(int x_size, int y_size, int z_size,
584  const Target &target = Target());
585  EXPORT Realization realize(int x_size, int y_size,
586  const Target &target = Target());
587  EXPORT Realization realize(int x_size,
588  const Target &target = Target());
589  EXPORT Realization realize(const Target &target = Target());
590  // @}
591 
592  /** Evaluate this function into an existing allocated buffer or
593  * buffers. If the buffer is also one of the arguments to the
594  * function, strange things may happen, as the pipeline isn't
595  * necessarily safe to run in-place. If you pass multiple buffers,
596  * they must have matching sizes. This form of realize does *not*
597  * automatically copy data back from the GPU. */
598  EXPORT void realize(Realization dst, const Target &target = Target());
599 
600  /** For a given size of output, or a given output buffer,
601  * determine the bounds required of all unbound ImageParams
602  * referenced. Communicates the result by allocating new buffers
603  * of the appropriate size and binding them to the unbound
604  * ImageParams. */
605  // @{
606  EXPORT void infer_input_bounds(int x_size = 0, int y_size = 0, int z_size = 0, int w_size = 0);
607  EXPORT void infer_input_bounds(Realization dst);
608  // @}
609 
610  /** Statically compile this function to llvm bitcode, with the
611  * given filename (which should probably end in .bc), type
612  * signature, and C function name (which defaults to the same name
613  * as this halide function). */
614  //@{
615  EXPORT void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
616  const Target &target = get_target_from_environment());
617  EXPORT void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
618  const Target &target = get_target_from_environment());
619  // @}
620 
621  /** Statically compile this function to llvm assembly, with the
622  * given filename (which should probably end in .ll), type
623  * signature, and C function name (which defaults to the same name
624  * as this halide function). */
625  //@{
626  EXPORT void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
627  const Target &target = get_target_from_environment());
628  EXPORT void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
629  const Target &target = get_target_from_environment());
630  // @}
631 
632  /** Statically compile this function to an object file, with the
633  * given filename (which should probably end in .o or .obj), type
634  * signature, and C function name (which defaults to the same name
635  * as this halide function). You probably don't want to use this
636  * directly; call compile_to_static_library or compile_to_file instead. */
637  //@{
638  EXPORT void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
639  const Target &target = get_target_from_environment());
640  EXPORT void compile_to_object(const std::string &filename, const std::vector<Argument> &,
641  const Target &target = get_target_from_environment());
642  // @}
643 
644  /** Emit a header file with the given filename for this
645  * function. The header will define a function with the type
646  * signature given by the second argument, and a name given by the
647  * third. The name defaults to the same name as this halide
648  * function. You don't actually have to have defined this function
649  * yet to call this. You probably don't want to use this directly;
650  * call compile_to_static_library or compile_to_file instead. */
651  EXPORT void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
652  const Target &target = get_target_from_environment());
653 
654  /** Statically compile this function to text assembly equivalent
655  * to the object file generated by compile_to_object. This is
656  * useful for checking what Halide is producing without having to
657  * disassemble anything, or if you need to feed the assembly into
658  * some custom toolchain to produce an object file (e.g. iOS) */
659  //@{
660  EXPORT void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
661  const Target &target = get_target_from_environment());
662  EXPORT void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
663  const Target &target = get_target_from_environment());
664  // @}
665 
666  /** Statically compile this function to C source code. This is
667  * useful for providing fallback code paths that will compile on
668  * many platforms. Vectorization will fail, and parallelization
669  * will produce serial code. */
670  EXPORT void compile_to_c(const std::string &filename,
671  const std::vector<Argument> &,
672  const std::string &fn_name = "",
673  const Target &target = get_target_from_environment());
674 
675  /** Write out an internal representation of lowered code. Useful
676  * for analyzing and debugging scheduling. Can emit html or plain
677  * text. */
678  EXPORT void compile_to_lowered_stmt(const std::string &filename,
679  const std::vector<Argument> &args,
680  StmtOutputFormat fmt = Text,
681  const Target &target = get_target_from_environment());
682 
683  /** Write out the loop nests specified by the schedule for this
684  * Function. Helpful for understanding what a schedule is
685  * doing. */
686  EXPORT void print_loop_nest();
687 
688  /** Compile to object file and header pair, with the given
689  * arguments. The name defaults to the same name as this halide
690  * function.
691  */
692  EXPORT void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
693  const std::string &fn_name = "",
694  const Target &target = get_target_from_environment());
695 
696  /** Compile to static-library file and header pair, with the given
697  * arguments. The name defaults to the same name as this halide
698  * function.
699  */
700  EXPORT void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
701  const std::string &fn_name = "",
702  const Target &target = get_target_from_environment());
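// Illustrative AOT usage (editorial sketch, not part of the original Func.h): the
// names brighten, input, offset and the output prefix "brighten" are assumptions.
//
//     ImageParam input(UInt(8), 2, "input");
//     Param<uint8_t> offset("offset");
//     Var x("x"), y("y");
//     Func brighten("brighten");
//     brighten(x, y) = input(x, y) + offset;
//     // Emits brighten.a and brighten.h for linking into other programs.
//     brighten.compile_to_static_library("brighten", {input, offset}, "brighten");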
703 
704  /** Compile to static-library file and header pair once for each target;
705  * each resulting function will be considered (in order) via halide_can_use_target_features()
706  * at runtime, with the first appropriate match being selected for subsequent use.
707  * This is typically useful for specializations that may vary unpredictably by machine
708  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
709  * All targets must have identical arch-os-bits.
710  */
711  EXPORT void compile_to_multitarget_static_library(const std::string &filename_prefix,
712  const std::vector<Argument> &args,
713  const std::vector<Target> &targets);
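// Illustrative multitarget usage (editorial sketch, continuing the example above; the
// target strings are assumptions and must share the same arch-os-bits, listed from
// most to least specific so the best supported one is selected at runtime):
//
//     std::vector<Target> targets = {
//         Target("x86-64-linux-avx2"),
//         Target("x86-64-linux-sse41"),
//         Target("x86-64-linux"),
//     };
//     brighten.compile_to_multitarget_static_library("brighten", {input, offset}, targets);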
714 
715  /** Store an internal representation of lowered code as a self
716  * contained Module suitable for further compilation. */
717  EXPORT Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
718  const Target &target = get_target_from_environment());
719 
720  /** Compile and generate multiple target files with single call.
721  * Deduces target files based on filenames specified in
722  * output_files struct.
723  */
724  EXPORT void compile_to(const Outputs &output_files,
725  const std::vector<Argument> &args,
726  const std::string &fn_name,
727  const Target &target = get_target_from_environment());
728 
729  /** Eagerly jit compile the function to machine code. This
730  * normally happens on the first call to realize. If you're
731  * running your halide pipeline inside time-sensitive code and
732  * wish to avoid including the time taken to compile a pipeline,
733  * then you can call this ahead of time. Returns the raw function
734  * pointer to the compiled pipeline. Default is to use the Target
735  * returned from Halide::get_jit_target_from_environment()
736  */
737  EXPORT void *compile_jit(const Target &target = get_jit_target_from_environment());
738 
739  /** Set the error handler function that will be called in the case of
740  * runtime errors during halide pipelines. If you are compiling
741  * statically, you can also just define your own function with
742  * signature
743  \code
744  extern "C" void halide_error(void *user_context, const char *);
745  \endcode
746  * This will clobber Halide's version.
747  */
748  EXPORT void set_error_handler(void (*handler)(void *, const char *));
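// Illustrative usage (editorial sketch, not part of the original Func.h): a JIT error
// handler that logs the message instead of aborting. The names are assumptions.
//
//     void my_error_handler(void *user_context, const char *msg) {
//         fprintf(stderr, "Halide runtime error: %s\n", msg);   // requires <cstdio>
//     }
//
//     f.set_error_handler(my_error_handler);
//     f.realize(100);    // runtime errors now go through my_error_handler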
749 
750  /** Set a custom malloc and free for halide to use. Malloc should
751  * return 32-byte aligned chunks of memory, and it should be safe
752  * for Halide to read slightly out of bounds (up to 8 bytes before
753  * the start or beyond the end). If compiling statically, routines
754  * with appropriate signatures can be provided directly
755  \code
756  extern "C" void *halide_malloc(void *, size_t)
757  extern "C" void halide_free(void *, void *)
758  \endcode
759  * These will clobber Halide's versions. See \file HalideRuntime.h
760  * for declarations.
761  */
762  EXPORT void set_custom_allocator(void *(*malloc)(void *, size_t),
763  void (*free)(void *, void *));
764 
765  /** Set a custom task handler to be called by the parallel for
766  * loop. It is useful to set this if you want to do some
767  * additional bookkeeping at the granularity of parallel
768  * tasks. The default implementation does this:
769  \code
770  extern "C" int halide_do_task(void *user_context,
771  int (*f)(void *, int, uint8_t *),
772  int idx, uint8_t *state) {
773  return f(user_context, idx, state);
774  }
775  \endcode
776  * If you are statically compiling, you can also just define your
777  * own version of the above function, and it will clobber Halide's
778  * version.
779  *
780  * If you're trying to use a custom parallel runtime, you probably
781  * don't want to call this. See instead \ref Func::set_custom_do_par_for .
782  */
783  EXPORT void set_custom_do_task(
784  int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),
785  int, uint8_t *));
786 
787  /** Set a custom parallel for loop launcher. Useful if your app
788  * already manages a thread pool. The default implementation is
789  * equivalent to this:
790  \code
791  extern "C" int halide_do_par_for(void *user_context,
792  int (*f)(void *, int, uint8_t *),
793  int min, int extent, uint8_t *state) {
794  int exit_status = 0;
795  parallel for (int idx = min; idx < min+extent; idx++) {
796  int job_status = halide_do_task(user_context, f, idx, state);
797  if (job_status) exit_status = job_status;
798  }
799  return exit_status;
800  }
801  \endcode
802  *
803  * However, notwithstanding the above example code, if one task
804  * fails, we may skip over other tasks, and if two tasks return
805  * different error codes, we may select one arbitrarily to return.
806  *
807  * If you are statically compiling, you can also just define your
808  * own version of the above function, and it will clobber Halide's
809  * version.
810  */
811  EXPORT void set_custom_do_par_for(
812  int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,
813  int, uint8_t *));
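// Illustrative usage (editorial sketch, not part of the original Func.h): a custom
// parallel-for that simply runs the tasks serially, matching the required signature.
// The function and variable names are assumptions.
//
//     int serial_do_par_for(void *user_context, int (*task)(void *, int, uint8_t *),
//                           int min, int extent, uint8_t *closure) {
//         for (int i = min; i < min + extent; i++) {
//             if (int err = task(user_context, i, closure)) {
//                 return err;                  // propagate the first failing task's status
//             }
//         }
//         return 0;
//     }
//
//     f.set_custom_do_par_for(serial_do_par_for);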
814 
815  /** Set custom routines to call when tracing is enabled. Call this
816  * on the output Func of your pipeline. This then sets custom
817  * routines for the entire pipeline, not just calls to this
818  * Func.
819  *
820  * If you are statically compiling, you can also just define your
821  * own versions of the tracing functions (see HalideRuntime.h),
822  * and they will clobber Halide's versions. */
823  EXPORT void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
824 
825  /** Set the function called to print messages from the runtime.
826  * If you are compiling statically, you can also just define your
827  * own function with signature
828  \code
829  extern "C" void halide_print(void *user_context, const char *);
830  \endcode
831  * This will clobber Halide's version.
832  */
833  EXPORT void set_custom_print(void (*handler)(void *, const char *));
834 
835  /** Get a struct containing the currently set custom functions
836  * used by JIT. */
837  EXPORT const Internal::JITHandlers &jit_handlers();
838 
839  /** Add a custom pass to be used during lowering. It is run after
840  * all other lowering passes. Can be used to verify properties of
841  * the lowered Stmt, instrument it with extra code, or otherwise
842  * modify it. The Func takes ownership of the pass, and will call
843  * delete on it when the Func goes out of scope. So don't pass a
844  * stack object, or share pass instances between multiple
845  * Funcs. */
846  template<typename T>
847  void add_custom_lowering_pass(T *pass) {
848  // Template instantiate a custom deleter for this type, then
849  // cast it to a deleter that takes a IRMutator2 *. The custom
850  // deleter lives in user code, so that deletion is on the same
851  // heap as construction (I hate Windows).
852  void (*deleter)(Internal::IRMutator2 *) =
853  (void (*)(Internal::IRMutator2 *))(&delete_lowering_pass<T>);
854  add_custom_lowering_pass(pass, deleter);
855  }
856 
857  /** Add a custom pass to be used during lowering, with the
858  * function that will be called to delete it also passed in. Set
859  * it to nullptr if you wish to retain ownership of the object. */
860  EXPORT void add_custom_lowering_pass(Internal::IRMutator2 *pass, void (*deleter)(Internal::IRMutator2 *));
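// Illustrative usage (editorial sketch, not part of the original Func.h): a custom
// lowering pass that counts Select nodes in the lowered Stmt. The class and member
// names are assumptions.
//
//     class CountSelects : public Internal::IRMutator2 {
//         using Internal::IRMutator2::visit;
//         Expr visit(const Internal::Select *op) override {
//             count++;
//             return Internal::IRMutator2::visit(op);
//         }
//     public:
//         int count = 0;
//     };
//
//     f.add_custom_lowering_pass(new CountSelects);   // the Func takes ownership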
861 
862  /** Remove all previously-set custom lowering passes */
863  EXPORT void clear_custom_lowering_passes();
864 
865  /** Get the custom lowering passes. */
866  EXPORT const std::vector<CustomLoweringPass> &custom_lowering_passes();
867 
868  /** When this function is compiled, include code that dumps its
869  * values to a file after it is realized, for the purpose of
870  * debugging.
871  *
872  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
873  * is in TIFF format and can be read by standard tools. Otherwise, the
874  * file format is as follows:
875  *
876  * All data is in the byte-order of the target platform. First, a
877  * 20-byte header containing four 32-bit ints, giving the extents
878  * of the first four dimensions. Dimensions beyond four are
879  * folded into the fourth. Then, a fifth 32-bit int giving the
880  * data type of the function. The typecodes are given by: float =
881  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
882  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
883  * data follows the header, as a densely packed array of the given
884  * size and the given type. If given the extension .tmp, this file
885  * format can be natively read by the program ImageStack. */
886  EXPORT void debug_to_file(const std::string &filename);
887 
888  /** The name of this function, either given during construction,
889  * or automatically generated. */
890  EXPORT const std::string &name() const;
891 
892  /** Get the pure arguments. */
893  EXPORT std::vector<Var> args() const;
894 
895  /** The right-hand-side value of the pure definition of this
896  * function. Causes an error if there's no pure definition, or if
897  * the function is defined to return multiple values. */
898  EXPORT Expr value() const;
899 
900  /** The values returned by this function. An error if the function
901  * has not been defined. Returns a Tuple with one element for
902  * functions defined to return a single value. */
903  EXPORT Tuple values() const;
904 
905  /** Does this function have at least a pure definition. */
906  EXPORT bool defined() const;
907 
908  /** Get the left-hand-side of the update definition. An empty
909  * vector if there's no update definition. If there are
910  * multiple update definitions for this function, use the
911  * argument to select which one you want. */
912  EXPORT const std::vector<Expr> &update_args(int idx = 0) const;
913 
914  /** Get the right-hand-side of an update definition. An error if
915  * there's no update definition. If there are multiple
916  * update definitions for this function, use the argument to
917  * select which one you want. */
918  EXPORT Expr update_value(int idx = 0) const;
919 
920  /** Get the right-hand-side of an update definition for
921  * functions that return multiple values. An error if there's no
922  * update definition. Returns a Tuple with one element for
923  * functions that return a single value. */
924  EXPORT Tuple update_values(int idx = 0) const;
925 
926  /** Get the RVars of the reduction domain for an update definition, if there is
927  * one. */
928  EXPORT std::vector<RVar> rvars(int idx = 0) const;
929 
930  /** Does this function have at least one update definition? */
931  EXPORT bool has_update_definition() const;
932 
933  /** How many update definitions does this function have? */
934  EXPORT int num_update_definitions() const;
935 
936  /** Is this function an external stage? That is, was it defined
937  * using define_extern? */
938  EXPORT bool is_extern() const;
939 
940  /** Add an extern definition for this Func. This lets you define a
941  * Func that represents an external pipeline stage. You can, for
942  * example, use it to wrap a call to an extern library such as
943  * fftw. */
944  // @{
945  EXPORT void define_extern(const std::string &function_name,
946  const std::vector<ExternFuncArgument> &params,
947  Type t,
948  int dimensionality,
949  NameMangling mangling,
950  bool uses_old_buffer_t) {
951  define_extern(function_name, params, std::vector<Type>{t},
952  dimensionality, mangling, DeviceAPI::Host, uses_old_buffer_t);
953  }
954 
955  EXPORT void define_extern(const std::string &function_name,
956  const std::vector<ExternFuncArgument> &params,
957  Type t,
958  int dimensionality,
959  NameMangling mangling = NameMangling::Default,
960  DeviceAPI device_api = DeviceAPI::Host,
961  bool uses_old_buffer_t = false) {
962  define_extern(function_name, params, std::vector<Type>{t},
963  dimensionality, mangling, device_api, uses_old_buffer_t);
964  }
965 
966  EXPORT void define_extern(const std::string &function_name,
967  const std::vector<ExternFuncArgument> &params,
968  const std::vector<Type> &types,
969  int dimensionality,
970  NameMangling mangling,
971  bool uses_old_buffer_t) {
972  define_extern(function_name, params, types,
973  dimensionality, mangling, DeviceAPI::Host, uses_old_buffer_t);
974  }
975 
976  EXPORT void define_extern(const std::string &function_name,
977  const std::vector<ExternFuncArgument> &params,
978  const std::vector<Type> &types,
979  int dimensionality,
980  NameMangling mangling = NameMangling::Default,
981  DeviceAPI device_api = DeviceAPI::Host,
982  bool uses_old_buffer_t = false);
983  // @}
984 
985  /** Get the types of the outputs of this Func. */
986  EXPORT const std::vector<Type> &output_types() const;
987 
988  /** Get the number of outputs of this Func. Corresponds to the
989  * size of the Tuple this Func was defined to return. */
990  EXPORT int outputs() const;
991 
992  /** Get the name of the extern function called for an extern
993  * definition. */
994  EXPORT const std::string &extern_function_name() const;
995 
996  /** The dimensionality (number of arguments) of this
997  * function. Zero if the function is not yet defined. */
998  EXPORT int dimensions() const;
999 
1000  /** Construct either the left-hand-side of a definition, or a call
1001  * to a function that happens to only contain vars as
1002  * arguments. If the function has already been defined, and fewer
1003  * arguments are given than the function has dimensions, then
1004  * enough implicit vars are added to the end of the argument list
1005  * to make up the difference (see \ref Var::implicit) */
1006  // @{
1007  EXPORT FuncRef operator()(std::vector<Var>) const;
1008 
1009  template <typename... Args>
1010  NO_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1011  operator()(Args&&... args) const {
1012  std::vector<Var> collected_args{std::forward<Args>(args)...};
1013  return this->operator()(collected_args);
1014  }
1015  // @}
1016 
1017  /** Either calls to the function, or the left-hand-side of
1018  * an update definition (see \ref RDom). If the function has
1019  * already been defined, and fewer arguments are given than the
1020  * function has dimensions, then enough implicit vars are added to
1021  * the end of the argument list to make up the difference. (see
1022  * \ref Var::implicit)*/
1023  // @{
1024  EXPORT FuncRef operator()(std::vector<Expr>) const;
1025 
1026  template <typename... Args>
1027  NO_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1028  operator()(Expr x, Args&&... args) const {
1029  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1030  return (*this)(collected_args);
1031  }
1032  // @}
1033 
1034  /** Creates and returns a new identity Func that wraps this Func. During
1035  * compilation, Halide replaces all calls to this Func done by 'f'
1036  * with calls to the wrapper. If this Func is already wrapped for
1037  * use in 'f', this will return the existing wrapper.
1038  *
1039  * For example, g.in(f) would rewrite a pipeline like this:
1040  \code
1041  g(x, y) = ...
1042  f(x, y) = ... g(x, y) ...
1043  \endcode
1044  * into a pipeline like this:
1045  \code
1046  g(x, y) = ...
1047  g_wrap(x, y) = g(x, y)
1048  f(x, y) = ... g_wrap(x, y)
1049  \endcode
1050  *
1051  * This has a variety of uses. You can use it to schedule this
1052  * Func differently in the different places it is used:
1053  \code
1054  g(x, y) = ...
1055  f1(x, y) = ... g(x, y) ...
1056  f2(x, y) = ... g(x, y) ...
1057  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1058  g.in(f2).compute_at(f2, x).unroll(x);
1059  \endcode
1060  *
1061  * You can also use it to stage loads from this Func via some
1062  * intermediate buffer (perhaps on the stack as in
1063  * test/performance/block_transpose.cpp, or in shared GPU memory
1064  * as in test/performance/wrap.cpp). In this case we compute the
1065  * wrapper at tiles of the consuming Funcs like so:
1066  \code
1067  g.compute_root()...
1068  g.in(f).compute_at(f, tiles)...
1069  \endcode
1070  *
1071  * Func::in() can also be used to compute pieces of a Func into a
1072  * smaller scratch buffer (perhaps on the GPU) and then copy them
1073  * into a larger output buffer one tile at a time. See
1074  * apps/interpolate/interpolate.cpp for an example of this. In
1075  * this case we compute the Func at tiles of its own wrapper:
1076  \code
1077  f.in(g).compute_root().gpu_tile(...)...
1078  f.compute_at(f.in(g), tiles)...
1079  \endcode
1080  *
1081  * A similar use of Func::in() is to wrap Funcs with multiple update
1082  * stages in a pure wrapper. The following code:
1083  \code
1084  f(x, y) = x + y;
1085  f(x, y) += 5;
1086  g(x, y) = f(x, y);
1087  f.compute_root();
1088  \endcode
1089  *
1090  * Is equivalent to:
1091  \code
1092  for y:
1093  for x:
1094  f(x, y) = x + y;
1095  for y:
1096  for x:
1097  f(x, y) += 5
1098  for y:
1099  for x:
1100  g(x, y) = f(x, y)
1101  \endcode
1102  * Using Func::in(), we can write:
1103  \code
1104  f(x, y) = x + y;
1105  f(x, y) += 5;
1106  g(x, y) = f(x, y);
1107  f.in(g).compute_root();
1108  \endcode
1109  * which instead produces:
1110  \code
1111  for y:
1112  for x:
1113  f(x, y) = x + y;
1114  f(x, y) += 5
1115  f_wrap(x, y) = f(x, y)
1116  for y:
1117  for x:
1118  g(x, y) = f_wrap(x, y)
1119  \endcode
1120  */
1121  EXPORT Func in(const Func &f);
1122 
1123  /** Create and return an identity wrapper shared by all the Funcs in
1124  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1125  * this will throw an error. */
1126  EXPORT Func in(const std::vector<Func> &fs);
1127 
1128  /** Create and return a global identity wrapper, which wraps all calls to
1129  * this Func by any other Func. If a global wrapper already exists,
1130  * returns it. The global identity wrapper is only used by callers for
1131  * which no custom wrapper has been specified.
1132  */
1133  EXPORT Func in();
1134 
1135  /** Similar to \ref Func::in; however, instead of replacing the call to
1136  * this Func with an identity Func that refers to it, this replaces the
1137  * call with a clone of this Func.
1138  *
1139  * For example, f.clone_in(g) would rewrite a pipeline like this:
1140  \code
1141  f(x, y) = x + y;
1142  g(x, y) = f(x, y) + 2;
1143  h(x, y) = f(x, y) - 3;
1144  \endcode
1145  * into a pipeline like this:
1146  \code
1147  f(x, y) = x + y;
1148  f_clone(x, y) = x + y;
1149  g(x, y) = f_clone(x, y) + 2;
1150  h(x, y) = f(x, y) - 3;
1151  \endcode
1152  *
1153  */
1154  //@{
1155  EXPORT Func clone_in(const Func &f);
1156  EXPORT Func clone_in(const std::vector<Func> &fs);
1157  //@}
1158 
1159  /** Declare that this function should be implemented by a call to
1160  * halide_buffer_copy with the given target device API. Asserts
1161  * that the Func has a pure definition which is a simple call to a
1162  * single input, and no update definitions. The wrapper Funcs
1163  * returned by in() are suitable candidates. Consumes all pure
1164  * variables, and rewrites the Func to have an extern definition
1165  * that calls halide_buffer_copy. */
1166  EXPORT Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1167 
1168  /** Declare that this function should be implemented by a call to
1169  * halide_buffer_copy with a NULL target device API. Equivalent to
1170  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1171  * pure definition which is a simple call to a single input, and
1172  * no update definitions. The wrapper Funcs returned by in() are
1173  * suitable candidates. Consumes all pure variables, and rewrites
1174  * the Func to have an extern definition that calls
1175  * halide_buffer_copy.
1176  *
1177  * Note that if the source Func is already valid in host memory,
1178  * this compiles to code that does the minimum number of calls to
1179  * memcpy.
1180  */
1181  EXPORT Func copy_to_host();
1182 
1183  /** Split a dimension into inner and outer subdimensions with the
1184  * given names, where the inner dimension iterates from 0 to
1185  * factor-1. The inner and outer subdimensions can then be dealt
1186  * with using the other scheduling calls. It's ok to reuse the old
1187  * variable name as either the inner or outer variable. The final
1188  * argument specifies how the tail should be handled if the split
1189  * factor does not provably divide the extent. */
1190  EXPORT Func &split(VarOrRVar old, VarOrRVar outer, VarOrRVar inner, Expr factor, TailStrategy tail = TailStrategy::Auto);
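// Illustrative usage (editorial sketch, not part of the original Func.h): splitting a
// loop and handling the tail explicitly. The names f, x, xo and xi are assumptions.
//
//     Var x("x"), xo("xo"), xi("xi");
//     f.split(x, xo, xi, 8, TailStrategy::GuardWithIf);   // 8-wide inner loop, guarded tail
//     f.vectorize(xi).parallel(xo);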
1191 
1192  /** Join two dimensions into a single fused dimension. The fused
1193  * dimension covers the product of the extents of the inner and
1194  * outer dimensions given. */
1195  EXPORT Func &fuse(VarOrRVar inner, VarOrRVar outer, VarOrRVar fused);
1196 
1197  /** Mark a dimension to be traversed serially. This is the default. */
1198  EXPORT Func &serial(VarOrRVar var);
1199 
1200  /** Mark a dimension to be traversed in parallel */
1201  EXPORT Func &parallel(VarOrRVar var);
1202 
1203  /** Split a dimension by the given task_size, and then parallelize the
1204  * outer dimension. This creates parallel tasks that have size
1205  * task_size. After this call, var refers to the outer dimension of
1206  * the split. The inner dimension has a new anonymous name. If you
1207  * wish to mutate it, or schedule with respect to it, do the split
1208  * manually. */
1209  EXPORT Func &parallel(VarOrRVar var, Expr task_size, TailStrategy tail = TailStrategy::Auto);
1210 
1211  /** Mark a dimension to be computed all-at-once as a single
1212  * vector. The dimension should have constant extent -
1213  * e.g. because it is the inner dimension following a split by a
1214  * constant factor. For most uses of vectorize you want the two
1215  * argument form. The variable to be vectorized should be the
1216  * innermost one. */
1217  EXPORT Func &vectorize(VarOrRVar var);
1218 
1219  /** Mark a dimension to be completely unrolled. The dimension
1220  * should have constant extent - e.g. because it is the inner
1221  * dimension following a split by a constant factor. For most uses
1222  * of unroll you want the two-argument form. */
1223  EXPORT Func &unroll(VarOrRVar var);
1224 
1225  /** Split a dimension by the given factor, then vectorize the
1226  * inner dimension. This is how you vectorize a loop of unknown
1227  * size. The variable to be vectorized should be the innermost
1228  * one. After this call, var refers to the outer dimension of the
1229  * split. 'factor' must be an integer. */
1230  EXPORT Func &vectorize(VarOrRVar var, Expr factor, TailStrategy tail = TailStrategy::Auto);
1231 
1232  /** Split a dimension by the given factor, then unroll the inner
1233  * dimension. This is how you unroll a loop of unknown size by
1234  * some constant factor. After this call, var refers to the outer
1235  * dimension of the split. 'factor' must be an integer. */
1236  EXPORT Func &unroll(VarOrRVar var, Expr factor, TailStrategy tail = TailStrategy::Auto);
1237 
1238  /** Statically declare that the range over which a function should
1239  * be evaluated is given by the second and third arguments. This
1240  * can let Halide perform some optimizations. E.g. if you know
1241  * there are going to be 4 color channels, you can completely
1242  * vectorize the color channel dimension without the overhead of
1243  * splitting it up. If bounds inference decides that it requires
1244  * more of this function than the bounds you have stated, a
1245  * runtime error will occur when you try to run your pipeline. */
1246  EXPORT Func &bound(Var var, Expr min, Expr extent);
1247 
1248  /** Statically declare the range over which the function will be
1249  * evaluated in the general case. This provides a basis for the auto
1250  * scheduler to make trade-offs and scheduling decisions. The auto
1251  * generated schedules might break when the sizes of the dimensions are
1252  * very different from the estimates specified. These estimates are used
1253  * only by the auto scheduler if the function is a pipeline output. */
1254  EXPORT Func &estimate(Var var, Expr min, Expr extent);
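// Illustrative usage (editorial sketch, not part of the original Func.h): the names
// f, x, y, c and the sizes are assumptions for the example.
//
//     f.bound(c, 0, 3);                                // exactly three color channels
//     f.estimate(x, 0, 1920).estimate(y, 0, 1080);     // hints for the auto-scheduler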
1255 
1256  /** Expand the region computed so that the min coordinate is
1257  * congruent to 'remainder' modulo 'modulus', and the extent is a
1258  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1259  * the min and extent realized to be even, and calling
1260  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1261  * to be even. The region computed always contains the region that
1262  * would have been computed without this directive, so no
1263  * assertions are injected. */
1264  EXPORT Func &align_bounds(Var var, Expr modulus, Expr remainder = 0);
1265 
1266  /** Bound the extent of a Func's realization, but not its
1267  * min. This means the dimension can be unrolled or vectorized
1268  * even when its min is not fixed (for example because it is
1269  * compute_at tiles of another Func). This can also be useful for
1270  * forcing a function's allocation to be a fixed size, which often
1271  * means it can go on the stack. */
1272  EXPORT Func &bound_extent(Var var, Expr extent);
1273 
1274  /** Split two dimensions at once by the given factors, and then
1275  * reorder the resulting dimensions to be xi, yi, xo, yo from
1276  * innermost outwards. This gives a tiled traversal. */
1277  EXPORT Func &tile(VarOrRVar x, VarOrRVar y,
1278  VarOrRVar xo, VarOrRVar yo,
1279  VarOrRVar xi, VarOrRVar yi,
1280  Expr xfactor, Expr yfactor,
1281  TailStrategy tail = TailStrategy::Auto);
1282 
1283  /** A shorter form of tile, which reuses the old variable names as
1284  * the new outer dimensions */
1285  EXPORT Func &tile(VarOrRVar x, VarOrRVar y,
1286  VarOrRVar xi, VarOrRVar yi,
1287  Expr xfactor, Expr yfactor,
1288  TailStrategy tail = TailStrategy::Auto);
1289 
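// Illustrative usage (editorial sketch, not part of the original Func.h): a typical
// tiled schedule built from tile(), vectorize() and parallel(). Names are assumptions.
//
//     Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");
//     f.tile(x, y, xo, yo, xi, yi, 64, 64)
//      .vectorize(xi, 8)
//      .parallel(yo);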
1290  /** Reorder variables to have the given nesting order, from
1291  * innermost out */
1292  EXPORT Func &reorder(const std::vector<VarOrRVar> &vars);
1293 
1294  template <typename... Args>
1295  NO_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1296  reorder(VarOrRVar x, VarOrRVar y, Args&&... args) {
1297  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1298  return reorder(collected_args);
1299  }
1300 
1301  /** Rename a dimension. Equivalent to split with an inner size of one. */
1302  EXPORT Func &rename(VarOrRVar old_name, VarOrRVar new_name);
1303 
1304  /** Specify that race conditions are permitted for this Func,
1305  * which enables parallelizing over RVars even when Halide cannot
1306  * prove that it is safe to do so. Use this with great caution,
1307  * and only if you can prove to yourself that this is safe, as it
1308  * may result in a non-deterministic routine that returns
1309  * different values at different times or on different machines. */
1310  EXPORT Func &allow_race_conditions();
1311 
1312 
1313  /** Specialize a Func. This creates a special-case version of the
1314  * Func where the given condition is true. The most effective
1315  * conditions are those of the form param == value, and boolean
1316  * Params. Consider a simple example:
1317  \code
1318  f(x) = x + select(cond, 0, 1);
1319  f.compute_root();
1320  \endcode
1321  * This is equivalent to:
1322  \code
1323  for (int x = 0; x < width; x++) {
1324  f[x] = x + (cond ? 0 : 1);
1325  }
1326  \endcode
1327  * Adding the scheduling directive:
1328  \code
1329  f.specialize(cond)
1330  \endcode
1331  * makes it equivalent to:
1332  \code
1333  if (cond) {
1334  for (int x = 0; x < width; x++) {
1335  f[x] = x;
1336  }
1337  } else {
1338  for (int x = 0; x < width; x++) {
1339  f[x] = x + 1;
1340  }
1341  }
1342  \endcode
1343  * Note that the inner loops have been simplified. In the first
1344  * path Halide knows that cond is true, and in the second path
1345  * Halide knows that it is false.
1346  *
1347  * The specialized version gets its own schedule, which inherits
1348  * every directive made about the parent Func's schedule so far
1349  * except for its specializations. This method returns a handle to
1350  * the new schedule. If you wish to retrieve the specialized
1351  * sub-schedule again later, you can call this method with the
1352  * same condition. Consider the following example of scheduling
1353  * the specialized version:
1354  *
1355  \code
1356  f(x) = x;
1357  f.compute_root();
1358  f.specialize(width > 1).unroll(x, 2);
1359  \endcode
1360  * Assuming for simplicity that width is even, this is equivalent to:
1361  \code
1362  if (width > 1) {
1363  for (int x = 0; x < width/2; x++) {
1364  f[2*x] = 2*x;
1365  f[2*x + 1] = 2*x + 1;
1366  }
1367  } else {
1368  for (int x = 0; x < width; x++) {
1369  f[x] = x;
1370  }
1371  }
1372  \endcode
1373  * For this case, it may be better to schedule the un-specialized
1374  * case instead:
1375  \code
1376  f(x) = x;
1377  f.compute_root();
1378  f.specialize(width == 1); // Creates a copy of the schedule so far.
1379  f.unroll(x, 2); // Only applies to the unspecialized case.
1380  \endcode
1381  * This is equivalent to:
1382  \code
1383  if (width == 1) {
1384  f[0] = 0;
1385  } else {
1386  for (int x = 0; x < width/2; x++) {
1387  f[2*x] = 2*x;
1388  f[2*x + 1] = 2*x + 1;
1389  }
1390  }
1391  \endcode
1392  * This can be a good way to write a pipeline that splits,
1393  * vectorizes, or tiles, but can still handle small inputs.
1394  *
1395  * If a Func has several specializations, the first matching one
1396  * will be used, so the order in which you define specializations
1397  * is significant. For example:
1398  *
1399  \code
1400  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1401  f.specialize(cond1);
1402  f.specialize(cond2);
1403  \endcode
1404  * is equivalent to:
1405  \code
1406  if (cond1) {
1407  for (int x = 0; x < width; x++) {
1408  f[x] = x + a - (cond2 ? c : d);
1409  }
1410  } else if (cond2) {
1411  for (int x = 0; x < width; x++) {
1412  f[x] = x + b - c;
1413  }
1414  } else {
1415  for (int x = 0; x < width; x++) {
1416  f[x] = x + b - d;
1417  }
1418  }
1419  \endcode
1420  *
1421  * Specializations may in turn be specialized, which creates a
1422  * nested if statement in the generated code.
1423  *
1424  \code
1425  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1426  f.specialize(cond1).specialize(cond2);
1427  \endcode
1428  * This is equivalent to:
1429  \code
1430  if (cond1) {
1431  if (cond2) {
1432  for (int x = 0; x < width; x++) {
1433  f[x] = x + a - c;
1434  }
1435  } else {
1436  for (int x = 0; x < width; x++) {
1437  f[x] = x + a - d;
1438  }
1439  }
1440  } else {
1441  for (int x = 0; x < width; x++) {
1442  f[x] = x + b - (cond2 ? c : d);
1443  }
1444  }
1445  \endcode
1446  * To create a 4-way if statement that simplifies away all of the
1447  * ternary operators above, you could say:
1448  \code
1449  f.specialize(cond1).specialize(cond2);
1450  f.specialize(cond2);
1451  \endcode
1452  * or
1453  \code
1454  f.specialize(cond1 && cond2);
1455  f.specialize(cond1);
1456  f.specialize(cond2);
1457  \endcode
1458  *
1459  * Any prior Func which is compute_at some variable of this Func
1460  * gets separately included in all paths of the generated if
1461  * statement. The Var in the compute_at call must exist in all
1462  * paths, but it may have been generated via a different path of
1463  * splits, fuses, and renames. This can be used somewhat
1464  * creatively. Consider the following code:
1465  \code
1466  g(x, y) = 8*x;
1467  f(x, y) = g(x, y) + 1;
1468  f.compute_root().specialize(cond);
1469  Var g_loop;
1470  f.specialize(cond).rename(y, g_loop);
1471  f.rename(x, g_loop);
1472  g.compute_at(f, g_loop);
1473  \endcode
1474  * When cond is true, this is equivalent to g.compute_at(f,y).
1475  * When it is false, this is equivalent to g.compute_at(f,x).
1476  */
1477  EXPORT Stage specialize(Expr condition);
1478 
1479  /** Add a specialization to a Func that always terminates execution
1480  * with a call to halide_error(). By itself, this is of limited use,
1481  * but can be useful to terminate chains of specialize() calls where
1482  * no "default" case is expected (thus avoiding unnecessary code generation).
1483  *
1484  * For instance, say we want to optimize a pipeline to process images
1485  * in planar and interleaved format; we might typically do something like:
1486  \code
1487  ImageParam im(UInt(8), 3);
1488  Func f = do_something_with(im);
1489  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1490  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1491  \endcode
1492  * This code will vectorize along rows for the planar case, and across pixel
1493  * components for the interleaved case... but there is an implicit "else"
1494  * for the unhandled cases, which generates unoptimized code. If we never
1495  * anticipate passing any other sort of images to this, we can streamline
1496  * our code by adding specialize_fail():
1497  \code
1498  ImageParam im(UInt(8), 3);
1499  Func f = do_something_with(im);
1500  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1501  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1502  f.specialize_fail("Unhandled image format");
1503  \endcode
1504  * Conceptually, this produces code like:
1505  \code
1506  if (im.dim(0).stride() == 1) {
1507  do_something_planar();
1508  } else if (im.dim(2).stride() == 1) {
1509  do_something_interleaved();
1510  } else {
1511  halide_error("Unhandled image format");
1512  }
1513  \endcode
1514  *
1515  * Note that calling specialize_fail() terminates the specialization chain
1516  * for a given Func; you cannot create new specializations for the Func
1517  * afterwards (though you can retrieve handles to previous specializations).
1518  */
1519  EXPORT void specialize_fail(const std::string &message);
1520 
1521  /** Tell Halide that the following dimensions correspond to GPU
1522  * thread indices. This is useful if you compute a producer
1523  * function within the block indices of a consumer function, and
1524  * want to control how that function's dimensions map to GPU
1525  * threads. If the selected target is not an appropriate GPU, this
1526  * just marks those dimensions as parallel. */
1527  // @{
1528  EXPORT Func &gpu_threads(VarOrRVar thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1529  EXPORT Func &gpu_threads(VarOrRVar thread_x, VarOrRVar thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1530  EXPORT Func &gpu_threads(VarOrRVar thread_x, VarOrRVar thread_y, VarOrRVar thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1531  // @}
1532 
1533  /** Tell Halide to run this stage using a single gpu thread and
1534  * block. This is not an efficient use of your GPU, but it can be
1535  * useful to avoid copy-back for intermediate update stages that
1536  * touch a very small part of your Func. */
1537  EXPORT Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1538 
1539  /** Tell Halide that the following dimensions correspond to GPU
1540  * block indices. This is useful for scheduling stages that will
1541  * run serially within each GPU block. If the selected target is
1542  * not ptx, this just marks those dimensions as parallel. */
1543  // @{
1544  EXPORT Func &gpu_blocks(VarOrRVar block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1545  EXPORT Func &gpu_blocks(VarOrRVar block_x, VarOrRVar block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1546  EXPORT Func &gpu_blocks(VarOrRVar block_x, VarOrRVar block_y, VarOrRVar block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1547  // @}
1548 
1549  /** Tell Halide that the following dimensions correspond to GPU
1550  * block indices and thread indices. If the selected target is not
1551  * ptx, these just mark the given dimensions as parallel. The
1552  * dimensions are consumed by this call, so do all other
1553  * unrolling, reordering, etc first. */
1554  // @{
1555  EXPORT Func &gpu(VarOrRVar block_x, VarOrRVar thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1556  EXPORT Func &gpu(VarOrRVar block_x, VarOrRVar block_y,
1557  VarOrRVar thread_x, VarOrRVar thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1558  EXPORT Func &gpu(VarOrRVar block_x, VarOrRVar block_y, VarOrRVar block_z,
1559  VarOrRVar thread_x, VarOrRVar thread_y, VarOrRVar thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1560  // @}
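 /* A typical block/thread mapping sketch (illustrative names). The tile
  * indices become GPU blocks and the within-tile coordinates become threads:
  *
  *   Func f; Var x, y, bx, by, tx, ty;
  *   f(x, y) = x + y;
  *   f.tile(x, y, bx, by, tx, ty, 16, 16)
  *    .gpu(bx, by, tx, ty);
  */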
1561 
1562  /** Short-hand for tiling a domain and mapping the tile indices
1563  * to GPU block indices and the coordinates within each tile to
1564  * GPU thread indices. Consumes the variables given, so do all
1565  * other scheduling first. */
1566  // @{
1567  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar bx, Var tx, Expr x_size,
1568  TailStrategy tail = TailStrategy::Auto,
1569  DeviceAPI device_api = DeviceAPI::Default_GPU);
1570  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar bx, RVar tx, Expr x_size,
1571  TailStrategy tail = TailStrategy::Auto,
1572  DeviceAPI device_api = DeviceAPI::Default_GPU);
1573 
1574  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar tx, Expr x_size,
1575  TailStrategy tail = TailStrategy::Auto,
1576  DeviceAPI device_api = DeviceAPI::Default_GPU);
1577  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y,
1578  VarOrRVar bx, VarOrRVar by,
1579  VarOrRVar tx, VarOrRVar ty,
1580  Expr x_size, Expr y_size,
1581  TailStrategy tail = TailStrategy::Auto,
1582  DeviceAPI device_api = DeviceAPI::Default_GPU);
1583 
1584  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y,
1585  VarOrRVar tx, Var ty,
1586  Expr x_size, Expr y_size,
1587  TailStrategy tail = TailStrategy::Auto,
1588  DeviceAPI device_api = DeviceAPI::Default_GPU);
1589  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y,
1590  VarOrRVar tx, RVar ty,
1591  Expr x_size, Expr y_size,
1592  TailStrategy tail = TailStrategy::Auto,
1593  DeviceAPI device_api = DeviceAPI::Default_GPU);
1594 
1595  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
1596  VarOrRVar bx, VarOrRVar by, VarOrRVar bz,
1597  VarOrRVar tx, VarOrRVar ty, VarOrRVar tz,
1598  Expr x_size, Expr y_size, Expr z_size,
1599  TailStrategy tail = TailStrategy::Auto,
1600  DeviceAPI device_api = DeviceAPI::Default_GPU);
1601  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
1602  VarOrRVar tx, VarOrRVar ty, VarOrRVar tz,
1603  Expr x_size, Expr y_size, Expr z_size,
1604  TailStrategy tail = TailStrategy::Auto,
1605  DeviceAPI device_api = DeviceAPI::Default_GPU);
1606 
1607  HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
1608  EXPORT Func &gpu_tile(VarOrRVar x, Expr x_size,
1609  TailStrategy tail = TailStrategy::Auto,
1610  DeviceAPI device_api = DeviceAPI::Default_GPU);
1611  HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
1612  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y, Expr x_size, Expr y_size,
1613  TailStrategy tail = TailStrategy::Auto,
1614  DeviceAPI device_api = DeviceAPI::Default_GPU);
1615  HALIDE_ATTRIBUTE_DEPRECATED("This form of gpu_tile() is deprecated.")
1616  EXPORT Func &gpu_tile(VarOrRVar x, VarOrRVar y, VarOrRVar z,
1617  Expr x_size, Expr y_size, Expr z_size,
1618  TailStrategy tail = TailStrategy::Auto,
1619  DeviceAPI device_api = DeviceAPI::Default_GPU);
1620  // @}
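 /* A minimal sketch of the non-deprecated 2-D form (illustrative names):
  *
  *   Func f; Var x, y, bx, by, tx, ty;
  *   f(x, y) = x + y;
  *   // Tile by 16x16, mapping bx/by to GPU blocks and tx/ty to GPU threads:
  *   f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
  */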
1621 
1622  /** Schedule for execution using a coordinate-based hardware API.
1623  * GLSL is an example of this. Conceptually, this is
1624  * similar to parallelization over 'x' and 'y' (since GLSL shaders compute
1625  * individual output pixels in parallel) and vectorization over 'c'
1626  * (since GLSL/RS implicitly vectorizes the color channel). */
1627  EXPORT Func &shader(Var x, Var y, Var c, DeviceAPI device_api);
1628 
1629  /** Schedule for execution as GLSL kernel. */
1630  EXPORT Func &glsl(Var x, Var y, Var c);
1631 
1632  /** Schedule for execution on Hexagon. When a loop is marked with
1633  * Hexagon, that loop is executed on a Hexagon DSP. */
1634  EXPORT Func &hexagon(VarOrRVar x = Var::outermost());
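 /* For example (illustrative names; assumes an HVX-capable target and an
  * assumed uint8 source Func 'input'), offload the loop over y to the DSP and
  * vectorize x by a full HVX vector of bytes:
  *
  *   Func f; Var x, y;
  *   f(x, y) = input(x, y) * 2;
  *   f.compute_root().hexagon(y).vectorize(x, 128);
  */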
1635 
1636  /** Prefetch data written to or read from a Func or an ImageParam by a
1637  * subsequent loop iteration, at an optionally specified iteration offset.
1638  * 'var' specifies at which loop level the prefetch calls should be inserted.
1639  * The final argument specifies how prefetches of regions outside the
1640  * bounds should be handled.
1641  *
1642  * For example, consider this pipeline:
1643  \code
1644  Func f, g;
1645  Var x, y;
1646  f(x, y) = x + y;
1647  g(x, y) = 2 * f(x, y);
1648  \endcode
1649  *
1650  * The following schedule:
1651  \code
1652  f.compute_root();
1653  g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1654  \endcode
1655  *
1656  * will inject a prefetch call into the innermost loop of 'g' and generate
1657  * the following loop nest:
1658  * for y = ...
1659  * for x = ...
1660  * f(x, y) = x + y
1661  * for y = ...
1662  * for x = ...
1663  * prefetch(&f[x + 2, y], 1, 16);
1664  * g(x, y) = 2 * f(x, y)
1665  */
1666  // @{
1667  EXPORT Func &prefetch(const Func &f, VarOrRVar var, Expr offset = 1,
1668  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
1669  EXPORT Func &prefetch(const Internal::Parameter &param, VarOrRVar var, Expr offset = 1,
1670  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
1671  template<typename T>
1672  Func &prefetch(const T &image, VarOrRVar var, Expr offset = 1,
1673  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1674  return prefetch(image.parameter(), var, offset, strategy);
1675  }
1676  // @}
1677 
1678  /** Specify how the storage for the function is laid out. These
1679  * calls let you specify the nesting order of the dimensions. For
1680  * example, foo.reorder_storage(y, x) tells Halide to use
1681  * column-major storage for any realizations of foo, without
1682  * changing how you refer to foo in the code. You may want to do
1683  * this if you intend to vectorize across y. When representing
1684  * color images, foo.reorder_storage(c, x, y) specifies packed
1685  * storage (red, green, and blue values adjacent in memory), and
1686  * foo.reorder_storage(x, y, c) specifies planar storage (entire
1687  * red, green, and blue images one after the other in memory).
1688  *
1689  * If you leave out some dimensions, those remain in the same
1690  * positions in the nesting order while the specified variables
1691  * are reordered around them. */
1692  // @{
1693  EXPORT Func &reorder_storage(const std::vector<Var> &dims);
1694 
1695  EXPORT Func &reorder_storage(Var x, Var y);
1696  template <typename... Args>
1697  NO_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
1698  reorder_storage(Var x, Var y, Args&&... args) {
1699  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
1700  return reorder_storage(collected_args);
1701  }
1702  // @}
1703 
1704  /** Pad the storage extent of a particular dimension of
1705  * realizations of this function up to be a multiple of the
1706  * specified alignment. This guarantees that the strides for the
1707  * dimensions stored outside of dim will be multiples of the
1708  * specified alignment, where the strides and alignment are
1709  * measured in numbers of elements.
1710  *
1711  * For example, to guarantee that a function foo(x, y, c)
1712  * representing an image has scanlines starting on offsets
1713  * aligned to multiples of 16, use foo.align_storage(x, 16). */
1714  EXPORT Func &align_storage(Var dim, Expr alignment);
1715 
1716  /** Store realizations of this function in a circular buffer of a
1717  * given extent. This is more efficient when the extent of the
1718  * circular buffer is a power of 2. If the fold factor is too
1719  * small, or the dimension is not accessed monotonically, the
1720  * pipeline will generate an error at runtime.
1721  *
1722  * The fold_forward option indicates that the new values of the
1723  * producer are accessed by the consumer in a monotonically
1724  * increasing order. Folding storage of producers is also
1725  * supported if the new values are accessed in a monotonically
1726  * decreasing order by setting fold_forward to false.
1727  *
1728  * For example, consider the pipeline:
1729  \code
1730  Func f, g;
1731  Var x, y;
1732  g(x, y) = x*y;
1733  f(x, y) = g(x, y) + g(x, y+1);
1734  \endcode
1735  *
1736  * If we schedule f like so:
1737  *
1738  \code
1739  g.compute_at(f, y).store_root().fold_storage(y, 2);
1740  \endcode
1741  *
1742  * Then g will be computed at each row of f and stored in a buffer
1743  * with an extent in y of 2, alternately storing each computed row
1744  * of g in row y=0 or y=1.
1745  */
1746  EXPORT Func &fold_storage(Var dim, Expr extent, bool fold_forward = true);
1747 
1748  /** Compute this function as needed for each unique value of the
1749  * given var for the given calling function f.
1750  *
1751  * For example, consider the simple pipeline:
1752  \code
1753  Func f, g;
1754  Var x, y;
1755  g(x, y) = x*y;
1756  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
1757  \endcode
1758  *
1759  * If we schedule f like so:
1760  *
1761  \code
1762  g.compute_at(f, x);
1763  \endcode
1764  *
1765  * Then the C code equivalent to this pipeline will look like this
1766  *
1767  \code
1768 
1769  int f[height][width];
1770  for (int y = 0; y < height; y++) {
1771  for (int x = 0; x < width; x++) {
1772  int g[2][2];
1773  g[0][0] = x*y;
1774  g[0][1] = (x+1)*y;
1775  g[1][0] = x*(y+1);
1776  g[1][1] = (x+1)*(y+1);
1777  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
1778  }
1779  }
1780 
1781  \endcode
1782  *
1783  * The allocation and computation of g is within f's loop over x,
1784  * and enough of g is computed to satisfy all that f will need for
1785  * that iteration. This has excellent locality - values of g are
1786  * used as soon as they are computed, but it does redundant
1787  * work. Each value of g ends up getting computed four times. If
1788  * we instead schedule f like so:
1789  *
1790  \code
1791  g.compute_at(f, y);
1792  \endcode
1793  *
1794  * The equivalent C code is:
1795  *
1796  \code
1797  int f[height][width];
1798  for (int y = 0; y < height; y++) {
1799  int g[2][width+1];
1800  for (int x = 0; x < width; x++) {
1801  g[0][x] = x*y;
1802  g[1][x] = x*(y+1);
1803  }
1804  for (int x = 0; x < width; x++) {
1805  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
1806  }
1807  }
1808  \endcode
1809  *
1810  * The allocation and computation of g is within f's loop over y,
1811  * and enough of g is computed to satisfy all that f will need for
1812  * that iteration. This does less redundant work (each point in g
1813  * ends up being evaluated twice), but the locality is not quite
1814  * as good, and we have to allocate more temporary memory to store
1815  * g.
1816  */
1817  EXPORT Func &compute_at(Func f, Var var);
1818 
1819  /** Schedule a function to be computed within the iteration over
1820  * some dimension of an update domain. Produces equivalent code
1821  * to the version of compute_at that takes a Var. */
1822  EXPORT Func &compute_at(Func f, RVar var);
1823 
1824  /** Schedule a function to be computed within the iteration over
1825  * a given LoopLevel. */
1826  EXPORT Func &compute_at(LoopLevel loop_level);
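 /* A LoopLevel names a compute_at site as a first-class value, so the choice
  * of level can be made or passed around separately from the rest of the
  * schedule (illustrative names):
  *
  *   g.compute_at(LoopLevel(f, y));    // same effect as g.compute_at(f, y)
  *   h.compute_at(LoopLevel::root());  // same effect as h.compute_root()
  */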
1827 
1828  /** Compute all of this function once ahead of time. Reusing
1829  * the example in \ref Func::compute_at:
1830  *
1831  \code
1832  Func f, g;
1833  Var x, y;
1834  g(x, y) = x*y;
1835  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
1836 
1837  g.compute_root();
1838  \endcode
1839  *
1840  * is equivalent to
1841  *
1842  \code
1843  int f[height][width];
1844  int g[height+1][width+1];
1845  for (int y = 0; y < height+1; y++) {
1846  for (int x = 0; x < width+1; x++) {
1847  g[y][x] = x*y;
1848  }
1849  }
1850  for (int y = 0; y < height; y++) {
1851  for (int x = 0; x < width; x++) {
1852  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
1853  }
1854  }
1855  \endcode
1856  *
1857  * g is computed once ahead of time, and enough is computed to
1858  * satisfy all uses of it. This does no redundant work (each point
1859  * in g is evaluated once), but has poor locality (values of g are
1860  * probably not still in cache when they are used by f), and
1861  * allocates lots of temporary memory to store g.
1862  */
1863  EXPORT Func &compute_root();
1864 
1865  /** Use the halide_memoization_cache_... interface to store a
1866  * computed version of this function across invocations of the
1867  * Func.
1868  */
1869  EXPORT Func &memoize();
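 /* A minimal sketch (illustrative names): cache an intermediate whose value
  * depends only on a scalar parameter, so repeated realizations with the same
  * parameter value can reuse the stored result:
  *
  *   Param<float> gain;
  *   Func lut; Var i;
  *   lut(i) = pow(i / 255.0f, gain);
  *   lut.compute_root().memoize();
  */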
1870 
1871 
1872  /** Allocate storage for this function within f's loop over
1873  * var. Scheduling storage is optional, and can be used to
1874  * separate the loop level at which storage occurs from the loop
1875  * level at which computation occurs to trade off between locality
1876  * and redundant work. This can open the door for two types of
1877  * optimization.
1878  *
1879  * Consider again the pipeline from \ref Func::compute_at :
1880  \code
1881  Func f, g;
1882  Var x, y;
1883  g(x, y) = x*y;
1884  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
1885  \endcode
1886  *
1887  * If we schedule it like so:
1888  *
1889  \code
1890  g.compute_at(f, x).store_at(f, y);
1891  \endcode
1892  *
1893  * Then the computation of g takes place within the loop over x,
1894  * but the storage takes place within the loop over y:
1895  *
1896  \code
1897  int f[height][width];
1898  for (int y = 0; y < height; y++) {
1899  int g[2][width+1];
1900  for (int x = 0; x < width; x++) {
1901  g[0][x] = x*y;
1902  g[0][x+1] = (x+1)*y;
1903  g[1][x] = x*(y+1);
1904  g[1][x+1] = (x+1)*(y+1);
1905  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
1906  }
1907  }
1908  \endcode
1909  *
1910  * Provided the for loop over x is serial, Halide then
1911  * automatically performs the following sliding window
1912  * optimization:
1913  *
1914  \code
1915  int f[height][width];
1916  for (int y = 0; y < height; y++) {
1917  int g[2][width+1];
1918  for (int x = 0; x < width; x++) {
1919  if (x == 0) {
1920  g[0][x] = x*y;
1921  g[1][x] = x*(y+1);
1922  }
1923  g[0][x+1] = (x+1)*y;
1924  g[1][x+1] = (x+1)*(y+1);
1925  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
1926  }
1927  }
1928  \endcode
1929  *
1930  * Two of the assignments to g only need to be done when x is
1931  * zero. The rest of the time, those sites have already been
1932  * filled in by a previous iteration. This version has the
1933  * locality of compute_at(f, x), but allocates more memory and
1934  * does much less redundant work.
1935  *
1936  * Halide then further optimizes this pipeline like so:
1937  *
1938  \code
1939  int f[height][width];
1940  for (int y = 0; y < height; y++) {
1941  int g[2][2];
1942  for (int x = 0; x < width; x++) {
1943  if (x == 0) {
1944  g[0][0] = x*y;
1945  g[1][0] = x*(y+1);
1946  }
1947  g[0][(x+1)%2] = (x+1)*y;
1948  g[1][(x+1)%2] = (x+1)*(y+1);
1949  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
1950  }
1951  }
1952  \endcode
1953  *
1954  * Halide has detected that it's possible to use a circular buffer
1955  * to represent g, and has reduced all accesses to g modulo 2 in
1956  * the x dimension. This optimization only triggers if the for
1957  * loop over x is serial, and if Halide can statically determine
1958  * some power of two large enough to cover the range needed. For
1959  * powers of two, the modulo operator compiles to more efficient
1960  * bit-masking. This optimization reduces memory usage, and also
1961  * improves locality by reusing recently-accessed memory instead
1962  * of pulling new memory into cache.
1963  *
1964  */
1965  EXPORT Func &store_at(Func f, Var var);
1966 
1967  /** Equivalent to the version of store_at that takes a Var, but
1968  * schedules storage within the loop over a dimension of a
1969  * reduction domain */
1970  EXPORT Func &store_at(Func f, RVar var);
1971 
1972 
1973  /** Equivalent to the version of store_at that takes a Var, but
1974  * schedules storage at a given LoopLevel. */
1975  EXPORT Func &store_at(LoopLevel loop_level);
1976 
1977  /** Equivalent to \ref Func::store_at, but schedules storage
1978  * outside the outermost loop. */
1979  EXPORT Func &store_root();
1980 
1981  /** Aggressively inline all uses of this function. This is the
1982  * default schedule, so you're unlikely to need to call this. For
1983  * a Func with an update definition, that means it gets computed
1984  * as close to the innermost loop as possible.
1985  *
1986  * Consider once more the pipeline from \ref Func::compute_at :
1987  *
1988  \code
1989  Func f, g;
1990  Var x, y;
1991  g(x, y) = x*y;
1992  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
1993  \endcode
1994  *
1995  * Leaving g as inline, this compiles to code equivalent to the following C:
1996  *
1997  \code
1998  int f[height][width];
1999  for (int y = 0; y < height; y++) {
2000  for (int x = 0; x < width; x++) {
2001  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2002  }
2003  }
2004  \endcode
2005  */
2006  EXPORT Func &compute_inline();
2007 
2008  /** Get a handle on an update step for the purposes of scheduling
2009  * it. */
2010  EXPORT Stage update(int idx = 0);
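 /* For example (illustrative names), scheduling the pure step and an update
  * step independently:
  *
  *   Func f; Var x; RDom r(0, 100);
  *   f(x) = x;                  // pure definition
  *   f(x) += r;                 // update definition 0
  *   f.vectorize(x, 8);         // applies to the pure step
  *   f.update(0).unroll(r, 2);  // applies to the update step
  */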
2011 
2012  /** Trace all loads from this Func by emitting calls to
2013  * halide_trace. If the Func is inlined, this has no
2014  * effect. */
2015  EXPORT Func &trace_loads();
2016 
2017  /** Trace all stores to the buffer backing this Func by emitting
2018  * calls to halide_trace. If the Func is inlined, this call
2019  * has no effect. */
2020  EXPORT Func &trace_stores();
2021 
2022  /** Trace all realizations of this Func by emitting calls to
2023  * halide_trace. */
2024  EXPORT Func &trace_realizations();
2025 
2026  /** Get a handle on the internal halide function that this Func
2027  * represents. Useful if you want to do introspection on Halide
2028  * functions */
2029  Internal::Function function() const {
2030  return func;
2031  }
2032 
2033  /** You can cast a Func to its pure stage for the purposes of
2034  * scheduling it. */
2035  EXPORT operator Stage() const;
2036 
2037  /** Get a handle on the output buffer for this Func. Only relevant
2038  * if this is the output Func in a pipeline. Useful for making
2039  * static promises about strides, mins, and extents. */
2040  // @{
2041  EXPORT OutputImageParam output_buffer() const;
2042  EXPORT std::vector<OutputImageParam> output_buffers() const;
2043  // @}
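 /* For example (illustrative; assumes this Func is a pipeline output and that
  * the usual OutputImageParam::dim() accessors are available), promise that
  * the output is densely packed along its innermost dimension:
  *
  *   f.output_buffer().dim(0).set_stride(1);
  */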
2044 
2045  /** Use a Func as an argument to an external stage. */
2046  operator ExternFuncArgument() const {
2047  return ExternFuncArgument(func);
2048  }
2049 
2050  /** Infer the arguments to the Func, sorted into a canonical order:
2051  * all buffers (sorted alphabetically by name), followed by all non-buffers
2052  * (sorted alphabetically by name).
2053  This lets you write things like:
2054  \code
2055  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2056  \endcode
2057  */
2058  EXPORT std::vector<Argument> infer_arguments() const;
2059 };
2060 
2061 namespace Internal {
2062 
2063 template <typename Last>
2064 inline void check_types(const Tuple &t, int idx) {
2065  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2066  user_assert(t[idx].type() == type_of<T>())
2067  << "Can't evaluate expression "
2068  << t[idx] << " of type " << t[idx].type()
2069  << " as a scalar of type " << type_of<T>() << "\n";
2070 }
2071 
2072 template <typename First, typename Second, typename... Rest>
2073 inline void check_types(const Tuple &t, int idx) {
2074  check_types<First>(t, idx);
2075  check_types<Second, Rest...>(t, idx+1);
2076 }
2077 
2078 template <typename Last>
2079 inline void assign_results(Realization &r, int idx, Last last) {
2080  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2081  *last = Buffer<T>(r[idx])();
2082 }
2083 
2084 template <typename First, typename Second, typename... Rest>
2085 inline void assign_results(Realization &r, int idx, First first, Second second, Rest&&... rest) {
2086  assign_results<First>(r, idx, first);
2087  assign_results<Second, Rest...>(r, idx+1, second, rest...);
2088 }
2089 
2090 } // namespace Internal
2091 
2092 /** JIT-Compile and run enough code to evaluate a Halide
2093  * expression. This can be thought of as a scalar version of
2094  * \ref Func::realize */
2095 template<typename T>
2096 NO_INLINE T evaluate(Expr e) {
2097  user_assert(e.type() == type_of<T>())
2098  << "Can't evaluate expression "
2099  << e << " of type " << e.type()
2100  << " as a scalar of type " << type_of<T>() << "\n";
2101  Func f;
2102  f() = e;
2103  Buffer<T> im = f.realize();
2104  return im();
2105 }
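 /* For example:
  *
  *   int sum = evaluate<int>(Expr(3) + Expr(4));        // sum == 7
  *   float root = evaluate<float>(sqrt(Expr(2.0f)));    // root ~= 1.414f
  */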
2106 
2107 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2108 template <typename First, typename... Rest>
2109 NO_INLINE void evaluate(Tuple t, First first, Rest&&... rest) {
2110  Internal::check_types<First, Rest...>(t, 0);
2111 
2112  Func f;
2113  f() = t;
2114  Realization r = f.realize();
2115  Internal::assign_results(r, 0, first, rest...);
2116 }
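 /* For example (results are written through the trailing pointers):
  *
  *   int a;
  *   float b;
  *   evaluate(Tuple(Expr(1) + 2, Expr(2.0f) * 3.0f), &a, &b);  // a == 3, b == 6.0f
  */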
2117 
2118 
2119 namespace Internal {
2120 
2121 inline void schedule_scalar(Func f) {
2122  Target t = get_jit_target_from_environment();
2123  if (t.has_gpu_feature()) {
2124  f.gpu_single_thread();
2125  }
2126  if (t.has_feature(Target::HVX_64) || t.has_feature(Target::HVX_128)) {
2127  f.hexagon();
2128  }
2129 }
2130 
2131 } // namespace Internal
2132 
2133 /** JIT-Compile and run enough code to evaluate a Halide
2134  * expression. This can be thought of as a scalar version of
2135  * \ref Func::realize. Can use GPU if jit target from environment
2136  * specifies one.
2137  */
2138 template<typename T>
2139 NO_INLINE T evaluate_may_gpu(Expr e) {
2140  user_assert(e.type() == type_of<T>())
2141  << "Can't evaluate expression "
2142  << e << " of type " << e.type()
2143  << " as a scalar of type " << type_of<T>() << "\n";
2144  Func f;
2145  f() = e;
2146  Internal::schedule_scalar(f);
2147  Buffer<T> im = f.realize();
2148  return im();
2149 }
2150 
2151 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2152  * use GPU if jit target from environment specifies one. */
2153 // @{
2154 template <typename First, typename... Rest>
2155 NO_INLINE void evaluate_may_gpu(Tuple t, First first, Rest&&... rest) {
2156  Internal::check_types<First, Rest...>(t, 0);
2157 
2158  Func f;
2159  f() = t;
2160  Internal::schedule_scalar(f);
2161  Realization r = f.realize();
2162  Internal::assign_results(r, 0, first, rest...);
2163 }
2164 // @}
2165 
2166 }
2167 
2168 
2169 #endif