Halide 14.0.0
Halide compiler and libraries
Func.h
1#ifndef HALIDE_FUNC_H
2#define HALIDE_FUNC_H
3
4/** \file
5 *
6 * Defines Func - the front-end handle on a halide function, and related classes.
7 */
8
9#include "Argument.h"
10#include "Expr.h"
11#include "JITModule.h"
12#include "Module.h"
13#include "Param.h"
14#include "Pipeline.h"
15#include "RDom.h"
16#include "Target.h"
17#include "Tuple.h"
18#include "Var.h"
19
20#include <map>
21#include <utility>
22
23namespace Halide {
24
25class OutputImageParam;
26class ParamMap;
27
28/** A class that can represent Vars or RVars. Used for reorder calls
29 * which can accept a mix of either. */
30struct VarOrRVar {
31 VarOrRVar(const std::string &n, bool r)
32 : var(n), rvar(n), is_rvar(r) {
33 }
34 VarOrRVar(const Var &v)
35 : var(v), is_rvar(false) {
36 }
37 VarOrRVar(const RVar &r)
38 : rvar(r), is_rvar(true) {
39 }
40 VarOrRVar(const RDom &r)
41 : rvar(RVar(r)), is_rvar(true) {
42 }
43 template<int N>
44 VarOrRVar(const ImplicitVar<N> &u)
45 : var(u), is_rvar(false) {
46 }
47
48 const std::string &name() const {
49 if (is_rvar) {
50 return rvar.name();
51 } else {
52 return var.name();
53 }
54 }
55
56 Var var;
57 RVar rvar;
58 bool is_rvar;
59};
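/* For illustration: a minimal sketch of passing a mix of Vars and RVars to a
 * reorder() call, assuming an input Func g is already defined:
 \code
 Func f("f");
 Var x("x"), y("y");
 RDom r(0, 16);
 f(x, y) = 0;
 f(x, y) += g(x, y, r.x);
 f.update().reorder(r.x, x, y);  // RVar and Vars mixed in one call
 \endcode
 */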
60
61class ImageParam;
62
63namespace Internal {
64class Function;
65struct Split;
66struct StorageDim;
67} // namespace Internal
68
69/** A single definition of a Func. May be a pure or update definition. */
70class Stage {
71 /** Reference to the Function this stage (or definition) belongs to. */
72 Internal::Function function;
73 Internal::Definition definition;
74 /** Indicate which stage the definition belongs to (0 for initial
75 * definition, 1 for first update, etc.). */
76 size_t stage_index;
77 /** Pure Vars of the Function (from the init definition). */
78 std::vector<Var> dim_vars;
79
80 void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81 void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82 void split(const std::string &old, const std::string &outer, const std::string &inner,
83 const Expr &factor, bool exact, TailStrategy tail);
84 void remove(const std::string &var);
85 Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
86
87 const std::vector<Internal::StorageDim> &storage_dims() const {
88 return function.schedule().storage_dims();
89 }
90
91 Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
92
93public:
94 Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
95 : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96 internal_assert(definition.defined());
97
98 dim_vars.reserve(function.args().size());
99 for (const auto &arg : function.args()) {
100 dim_vars.emplace_back(arg);
101 }
102 internal_assert(definition.args().size() == dim_vars.size());
103 }
104
105 /** Return the current StageSchedule associated with this Stage. For
106 * introspection only: to modify schedule, use the Func interface. */
107 const Internal::StageSchedule &get_schedule() const {
108 return definition.schedule();
109 }
110
111 /** Return a string describing the current var list taking into
112 * account all the splits, reorders, and tiles. */
113 std::string dump_argument_list() const;
114
115 /** Return the name of this stage, e.g. "f.update(2)" */
116 std::string name() const;
117
118 /** Calling rfactor() on an associative update definition of a Func will split
119 * the update into an intermediate which computes the partial results and
120 * replaces the current update definition with a new definition which merges
121 * the partial results. If called on an init/pure definition, this will
122 * throw an error. rfactor() will automatically infer the associative reduction
123 * operator and identity of the operator. If it can't prove the operation
124 * is associative or if it cannot find an identity for that operator, this
125 * will throw an error. In addition, the operator must be commutative if
126 * rfactor() is called on an inner dimension while excluding the outer
127 * dimensions.
128 *
129 * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
130 * The rvars not listed in 'preserved' are removed from the original Func and
131 * are lifted to the intermediate Func. The remaining rvars (the ones in
132 * 'preserved') are made pure in the intermediate Func. The intermediate Func's
133 * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
134 * applied to the original Func's update definition. The loop order of the
135 * intermediate Func's update definition is the same as the original, although
136 * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
137 * intermediate Func's init definition from innermost to outermost is the args'
138 * order of the original Func's init definition followed by the new pure Vars.
139 *
140 * The intermediate Func also inherits storage order from the original Func
141 * with the new pure Vars added to the outermost.
142 *
143 * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
144 \code
145 f(x, y) = 0;
146 f(x, y) += g(r.x, r.y);
147 \endcode
148 * into a pipeline like this:
149 \code
150 f_intm(x, y, u) = 0;
151 f_intm(x, y, u) += g(r.x, u);
152
153 f(x, y) = 0;
154 f(x, y) += f_intm(x, y, r.y);
155 \endcode
156 *
157 * This has a variety of uses. You can use it to split computation of an associative reduction:
158 \code
159 f(x, y) = 10;
160 RDom r(0, 96);
161 f(x, y) = max(f(x, y), g(x, y, r.x));
162 f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
163 f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
164 \endcode
165 *
166 * which is equivalent to:
167 \code
168 parallel for u = 0 to 11:
169 for y:
170 for x:
171 f_intm(x, y, u) = -inf
172 parallel for x:
173 for y:
174 parallel for u = 0 to 11:
175 for rxi = 0 to 7:
176 f_intm(x, y, u) = max(f_intm(x, y, u), g(8*u + rxi))
177 for y:
178 for x:
179 f(x, y) = 10
180 parallel for x:
181 for y:
182 for rxo = 0 to 11:
183 f(x, y) = max(f(x, y), f_intm(x, y, rxo))
184 \endcode
185 *
186 */
187 // @{
188 Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
189 Func rfactor(const RVar &r, const Var &v);
190 // @}
191
192 /** Schedule the iteration over this stage to be fused with another
193 * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
194 * be computed AFTER 's' in the innermost fused dimension. There should not
195 * be any dependencies between those two fused stages. If either of the
196 * stages being fused is a stage of an extern Func, this will throw an error.
197 *
198 * Note that the two stages that are fused together should have the same
199 * exact schedule from the outermost to the innermost fused dimension, and
200 * the stage we are calling compute_with on should not have specializations,
201 * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
202 *
203 * Also, if a producer is desired to be computed at the fused loop level,
204 * the function passed to the compute_at() needs to be the "parent". Consider
205 * the following code:
206 \code
207 input(x, y) = x + y;
208 f(x, y) = input(x, y);
209 f(x, y) += 5;
210 g(x, y) = x - y;
211 g(x, y) += 10;
212 f.compute_with(g, y);
213 f.update().compute_with(g.update(), y);
214 \endcode
215 *
216 * To compute 'input' at the fused loop level at dimension y, we specify
217 * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
218 * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
219 * is computed). On the other hand, to compute 'input' at the innermost
220 * dimension of 'f', we specify input.compute_at(f, x) instead of
221 * input.compute_at(g, x) since the x dimension of 'f' is not fused
222 * (only the y dimension is).
223 *
224 * Given the constraints, this has a variety of uses. Consider the
225 * following code:
226 \code
227 f(x, y) = x + y;
228 g(x, y) = x - y;
229 h(x, y) = f(x, y) + g(x, y);
230 f.compute_root();
231 g.compute_root();
232 f.split(x, xo, xi, 8);
233 g.split(x, xo, xi, 8);
234 g.compute_with(f, xo);
235 \endcode
236 *
237 * This is equivalent to:
238 \code
239 for y:
240 for xo:
241 for xi:
242 f(8*xo + xi) = (8*xo + xi) + y
243 for xi:
244 g(8*xo + xi) = (8*xo + xi) - y
245 for y:
246 for x:
247 h(x, y) = f(x, y) + g(x, y)
248 \endcode
249 *
250 * The sizes of the dimensions of the stages computed with each other do not have
251 * to match. Consider the following code where 'g' is half the size of 'f':
252 \code
253 Image<int> f_im(size, size), g_im(size/2, size/2);
254 input(x, y) = x + y;
255 f(x, y) = input(x, y);
256 g(x, y) = input(2*x, 2*y);
257 g.compute_with(f, y);
258 input.compute_at(f, y);
259 Pipeline({f, g}).realize({f_im, g_im});
260 \endcode
261 *
262 * This is equivalent to:
263 \code
264 for y = 0 to size-1:
265 for x = 0 to size-1:
266 input(x, y) = x + y;
267 for x = 0 to size-1:
268 f(x, y) = input(x, y)
269 for x = 0 to size/2-1:
270 if (y < size/2-1):
271 g(x, y) = input(2*x, 2*y)
272 \endcode
273 *
274 * 'align' specifies how the loop iteration of each dimension of the
275 * two stages being fused should be aligned in the fused loop nests
276 * (see LoopAlignStrategy for options). Consider the following loop nests:
277 \code
278 for z = f_min_z to f_max_z:
279 for y = f_min_y to f_max_y:
280 for x = f_min_x to f_max_x:
281 f(x, y, z) = x + y + z
282 for z = g_min_z to g_max_z:
283 for y = g_min_y to g_max_y:
284 for x = g_min_x to g_max_x:
285 g(x, y, z) = x - y - z
286 \endcode
287 *
288 * If no alignment strategy is specified, the following loop nest will be
289 * generated:
290 \code
291 for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
292 for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
293 for x = f_min_x to f_max_x:
294 if (f_min_z <= z <= f_max_z):
295 if (f_min_y <= y <= f_max_y):
296 f(x, y, z) = x + y + z
297 for x = g_min_x to g_max_x:
298 if (g_min_z <= z <= g_max_z):
299 if (g_min_y <= y <= g_max_y):
300 g(x, y, z) = x - y - z
301 \endcode
302 *
303 * Instead, these alignment strategies:
304 \code
305 g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
306 \endcode
307 * will produce the following loop nest:
308 \code
309 f_loop_min_z = f_min_z
310 f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
311 for z = f_min_z to f_loop_max_z:
312 f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
313 f_loop_max_y = f_max_y
314 for y = f_loop_min_y to f_loop_max_y:
315 for x = f_min_x to f_max_x:
316 if (f_loop_min_z <= z <= f_loop_max_z):
317 if (f_loop_min_y <= y <= f_loop_max_y):
318 f(x, y, z) = x + y + z
319 for x = g_min_x to g_max_x:
320 g_shift_z = g_min_z - f_loop_min_z
321 g_shift_y = g_max_y - f_loop_max_y
322 if (g_min_z <= (z + g_shift_z) <= g_max_z):
323 if (g_min_y <= (y + g_shift_y) <= g_max_y):
324 g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
325 \endcode
326 *
327 * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
328 * of 'g' at dimension z so that its starting value matches that of 'f'.
329 * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
330 * iteration of 'g' at dimension y so that its end value matches that of 'f'.
331 */
332 // @{
333 Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
334 Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
335 Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
336 Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
337 // @}
338
339 /** Scheduling calls that control how the domain of this stage is
340 * traversed. See the documentation for Func for the meanings. */
341 // @{
342
343 Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
344 Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
345 Stage &serial(const VarOrRVar &var);
346 Stage &parallel(const VarOrRVar &var);
347 Stage &vectorize(const VarOrRVar &var);
348 Stage &unroll(const VarOrRVar &var);
349 Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
350 Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351 Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
353 const VarOrRVar &xo, const VarOrRVar &yo,
354 const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
355 TailStrategy tail = TailStrategy::Auto);
356 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
357 const VarOrRVar &xi, const VarOrRVar &yi,
358 const Expr &xfactor, const Expr &yfactor,
359 TailStrategy tail = TailStrategy::Auto);
360 Stage &tile(const std::vector<VarOrRVar> &previous,
361 const std::vector<VarOrRVar> &outers,
362 const std::vector<VarOrRVar> &inners,
363 const std::vector<Expr> &factors,
364 const std::vector<TailStrategy> &tails);
365 Stage &tile(const std::vector<VarOrRVar> &previous,
366 const std::vector<VarOrRVar> &outers,
367 const std::vector<VarOrRVar> &inners,
368 const std::vector<Expr> &factors,
369 TailStrategy tail = TailStrategy::Auto);
370 Stage &tile(const std::vector<VarOrRVar> &previous,
371 const std::vector<VarOrRVar> &inners,
372 const std::vector<Expr> &factors,
373 TailStrategy tail = TailStrategy::Auto);
374 Stage &reorder(const std::vector<VarOrRVar> &vars);
375
376 template<typename... Args>
377 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
378 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
379 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
380 return reorder(collected_args);
381 }
382
383 Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
384 Stage specialize(const Expr &condition);
385 void specialize_fail(const std::string &message);
386
387 Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
388 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
389 Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
390
391 Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
392
393 Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
394
395 Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
396 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
397 Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
398
399 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
400 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
401 const VarOrRVar &thread_x, const VarOrRVar &thread_y,
402 DeviceAPI device_api = DeviceAPI::Default_GPU);
403 Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
404 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
405 DeviceAPI device_api = DeviceAPI::Default_GPU);
406
407 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
408 TailStrategy tail = TailStrategy::Auto,
409 DeviceAPI device_api = DeviceAPI::Default_GPU);
410
411 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
412 TailStrategy tail = TailStrategy::Auto,
413 DeviceAPI device_api = DeviceAPI::Default_GPU);
414 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
415 const VarOrRVar &bx, const VarOrRVar &by,
416 const VarOrRVar &tx, const VarOrRVar &ty,
417 const Expr &x_size, const Expr &y_size,
418 TailStrategy tail = TailStrategy::Auto,
419 DeviceAPI device_api = DeviceAPI::Default_GPU);
420
421 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
422 const VarOrRVar &tx, const VarOrRVar &ty,
423 const Expr &x_size, const Expr &y_size,
424 TailStrategy tail = TailStrategy::Auto,
425 DeviceAPI device_api = DeviceAPI::Default_GPU);
426
427 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
428 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
429 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
430 const Expr &x_size, const Expr &y_size, const Expr &z_size,
431 TailStrategy tail = TailStrategy::Auto,
432 DeviceAPI device_api = DeviceAPI::Default_GPU);
433 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
434 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
435 const Expr &x_size, const Expr &y_size, const Expr &z_size,
436 TailStrategy tail = TailStrategy::Auto,
437 DeviceAPI device_api = DeviceAPI::Default_GPU);
438
440 Stage &atomic(bool override_associativity_test = false);
441
443
444 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
445 Stage &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
446 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
447 return prefetch(f, var, var, offset, strategy);
448 }
449 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
450 Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
451 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
452 return prefetch(param, var, var, offset, strategy);
453 }
454 template<typename T>
455 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
456 Stage &prefetch(const T &image, VarOrRVar var, int offset = 1,
457 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
458 return prefetch(image.parameter(), var, var, offset, strategy);
459 }
460 Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
461 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
462 Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
463 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
464 template<typename T>
465 Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
466 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
467 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
468 }
469 // @}
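    /* For illustration: a minimal sketch of the non-deprecated two-var prefetch
     * form (shown on a Func; Stage::prefetch is analogous), assuming an
     * ImageParam 'in' and Vars x, y are already defined:
     \code
     Func f("f");
     f(x, y) = in(x, y);
     // Issue prefetches of 'in' at loop level y, two iterations of y ahead.
     f.prefetch(in, y, y, 2);
     \endcode
     */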
470
471 /** Attempt to get the source file and line where this stage was
472 * defined by parsing the process's own debug symbols. Returns an
473 * empty string if no debug symbols were found or the debug
474 * symbols were not understood. Works on OS X and Linux only. */
475 std::string source_location() const;
476
477 /** Assert that this stage has intentionally been given no schedule, and
478 * suppress the warning about unscheduled update definitions that would
479 * otherwise fire. This counts as a schedule, so calling this twice on the
480 * same Stage will fail the assertion. */
482};
483
484// For backwards compatibility, keep the ScheduleHandle name.
485 typedef Stage ScheduleHandle;
486
487class FuncTupleElementRef;
488
489/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
490 * z are Vars or Exprs. It could be the left-hand side of a definition or
491 * an update definition, or it could be a call to a function. We don't know
492 * until we see how this object gets used.
493 */
494class FuncRef {
495 Internal::Function func;
496 int implicit_placeholder_pos;
497 int implicit_count;
498 std::vector<Expr> args;
499 std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
500
501 /** Helper for function update by Tuple. If the function does not
502 * already have a pure definition, init_val will be used as RHS of
503 * each tuple element in the initial function definition. */
504 template<typename BinaryOp>
505 Stage func_ref_update(const Tuple &e, int init_val);
506
507 /** Helper for function update by Expr. If the function does not
508 * already have a pure definition, init_val will be used as RHS in
509 * the initial function definition. */
510 template<typename BinaryOp>
511 Stage func_ref_update(Expr e, int init_val);
512
513public:
514 FuncRef(const Internal::Function &, const std::vector<Expr> &,
515 int placeholder_pos = -1, int count = 0);
516 FuncRef(Internal::Function, const std::vector<Var> &,
517 int placeholder_pos = -1, int count = 0);
518
519 /** Use this as the left-hand-side of a definition or an update definition
520 * (see \ref RDom).
521 */
522 Stage operator=(const Expr &);
523
524 /** Use this as the left-hand-side of a definition or an update definition
525 * for a Func with multiple outputs. */
526 Stage operator=(const Tuple &);
527
528 /** Define a stage that adds the given expression to this Func. If the
529 * expression refers to some RDom, this performs a sum reduction of the
530 * expression over the domain. If the function does not already have a
531 * pure definition, this sets it to zero.
532 */
533 // @{
534 Stage operator+=(Expr);
535 Stage operator+=(const Tuple &);
536 Stage operator+=(const FuncRef &);
537 // @}
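    /* For illustration: a minimal sketch of a sum reduction via operator+=,
     * assuming an integer-valued input Func g is already defined:
     \code
     Func f("f");
     Var x("x");
     RDom r(0, 100);
     f(x) += g(x, r.x);  // the pure definition is implicitly initialized to zero
     \endcode
     */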
538
539 /** Define a stage that adds the negative of the given expression to this
540 * Func. If the expression refers to some RDom, this performs a sum reduction
541 * of the negative of the expression over the domain. If the function does
542 * not already have a pure definition, this sets it to zero.
543 */
544 // @{
545 Stage operator-=(Expr);
546 Stage operator-=(const Tuple &);
547 Stage operator-=(const FuncRef &);
548 // @}
549
550 /** Define a stage that multiplies this Func by the given expression. If the
551 * expression refers to some RDom, this performs a product reduction of the
552 * expression over the domain. If the function does not already have a pure
553 * definition, this sets it to 1.
554 */
555 // @{
556 Stage operator*=(Expr);
557 Stage operator*=(const Tuple &);
558 Stage operator*=(const FuncRef &);
559 // @}
560
561 /** Define a stage that divides this Func by the given expression.
562 * If the expression refers to some RDom, this performs a product
563 * reduction of the inverse of the expression over the domain. If the
564 * function does not already have a pure definition, this sets it to 1.
565 */
566 // @{
567 Stage operator/=(Expr);
568 Stage operator/=(const Tuple &);
569 Stage operator/=(const FuncRef &);
570 // @}
571
572 /* Override the usual assignment operator, so that
573 * f(x, y) = g(x, y) defines f.
574 */
575 Stage operator=(const FuncRef &);
576
577 /** Use this as a call to the function, and not the left-hand-side
578 * of a definition. Only works for single-output Funcs. */
579 operator Expr() const;
580
581 /** When a FuncRef refers to a function that provides multiple
582 * outputs, you can access each output as an Expr using
583 * operator[].
584 */
585 FuncTupleElementRef operator[](int) const;
586
587 /** How many outputs does the function this refers to produce? */
588 size_t size() const;
589
590 /** What function is this calling? */
591 Internal::Function function() const {
592 return func;
593 }
594};
595
596/** Explicit overloads of min and max for FuncRef. These exist to
597 * disambiguate calls to min on FuncRefs when a user has pulled both
598 * Halide::min and std::min into their namespace. */
599// @{
600inline Expr min(const FuncRef &a, const FuncRef &b) {
601 return min(Expr(a), Expr(b));
602}
603inline Expr max(const FuncRef &a, const FuncRef &b) {
604 return max(Expr(a), Expr(b));
605}
606// @}
607
608/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
609 * z are Vars or Exprs. It could be the left-hand side of an update
610 * definition, or it could be a call to a function. We don't know
611 * until we see how this object gets used.
612 */
613class FuncTupleElementRef {
614 FuncRef func_ref;
615 std::vector<Expr> args; // args to the function
616 int idx; // Index to function outputs
617
618 /** Helper function that generates a Tuple where element at 'idx' is set
619 * to 'e' and the rests are undef. */
620 Tuple values_with_undefs(const Expr &e) const;
621
622public:
623 FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
624
625 /** Use this as the left-hand-side of an update definition of Tuple
626 * component 'idx' of a Func (see \ref RDom). The function must
627 * already have an initial definition.
628 */
629 Stage operator=(const Expr &e);
630
631 /** Define a stage that adds the given expression to Tuple component 'idx'
632 * of this Func. The other Tuple components are unchanged. If the expression
633 * refers to some RDom, this performs a sum reduction of the expression over
634 * the domain. The function must already have an initial definition.
635 */
636 Stage operator+=(const Expr &e);
637
638 /** Define a stage that adds the negative of the given expression to Tuple
639 * component 'idx' of this Func. The other Tuple components are unchanged.
640 * If the expression refers to some RDom, this performs a sum reduction of
641 * the negative of the expression over the domain. The function must already
642 * have an initial definition.
643 */
644 Stage operator-=(const Expr &e);
645
646 /** Define a stage that multiplies Tuple component 'idx' of this Func by
647 * the given expression. The other Tuple components are unchanged. If the
648 * expression refers to some RDom, this performs a product reduction of
649 * the expression over the domain. The function must already have an
650 * initial definition.
651 */
652 Stage operator*=(const Expr &e);
653
654 /** Define a stage that divides Tuple component 'idx' of this Func by
655 * the given expression. The other Tuple components are unchanged.
656 * If the expression refers to some RDom, this performs a product
657 * reduction of the inverse of the expression over the domain. The function
658 * must already have an initial definition.
659 */
660 Stage operator/=(const Expr &e);
661
662 /* Override the usual assignment operator, so that
663 * f(x, y)[index] = g(x, y) defines f.
664 */
665 Stage operator=(const FuncRef &e);
666
667 /** Use this as a call to Tuple component 'idx' of a Func, and not the
668 * left-hand-side of a definition. */
669 operator Expr() const;
670
671 /** What function is this calling? */
672 Internal::Function function() const {
673 return func_ref.function();
674 }
675
676 /** Return index to the function outputs. */
677 int index() const {
678 return idx;
679 }
680};
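/* For illustration: a minimal sketch of updating a single Tuple component,
 * assuming a float-valued input Func g is already defined:
 \code
 Func f("f");
 Var x("x");
 RDom r(0, 100);
 f(x) = Tuple(0, 0.f);   // an initial definition is required
 f(x)[1] += g(x, r.x);   // sum-reduce into the second component only
 \endcode
 */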
681
682namespace Internal {
683class IRMutator;
684} // namespace Internal
685
686/** Helper class for identifying purpose of an Expr passed to memoize.
687 */
688class EvictionKey {
689protected:
690 Expr key;
691 friend class Func;
692
693public:
694 explicit EvictionKey(const Expr &expr = Expr())
695 : key(expr) {
696 }
697};
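/* For illustration: a hedged sketch of how an EvictionKey is typically passed to
 * Func::memoize(), assuming a Param<uint64_t> 'generation' is already defined:
 \code
 f.compute_root().memoize(EvictionKey(generation));
 // Cached realizations tagged with this key can later be evicted via the
 // halide_memoization_cache_evict runtime call.
 \endcode
 */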
698
699/** A halide function. This class represents one stage in a Halide
700 * pipeline, and is the unit by which we schedule things. By default
701 * they are aggressively inlined, so you are encouraged to make lots
702 * of little functions, rather than storing things in Exprs. */
703class Func {
704
705 /** A handle on the internal halide function that this
706 * represents */
707 Internal::Function func;
708
709 /** When you make a reference to this function with fewer
710 * arguments than it has dimensions, the argument list is bulked
711 * up with 'implicit' vars with canonical names. This lets you
712 * pass around partially applied Halide functions. */
713 // @{
714 std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
715 std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
716 // @}
717
718 /** The imaging pipeline that outputs this Func alone. */
719 Pipeline pipeline_;
720
721 /** Get the imaging pipeline that outputs this Func alone,
722 * creating it (and freezing the Func) if necessary. */
723 Pipeline pipeline();
724
725 // Helper function for recursive reordering support
726 Func &reorder_storage(const std::vector<Var> &dims, size_t start);
727
728 void invalidate_cache();
729
730public:
731 /** Declare a new undefined function with the given name */
732 explicit Func(const std::string &name);
733
734 /** Declare a new undefined function with an
735 * automatically-generated unique name */
736 Func();
737
738 /** Declare a new function with an automatically-generated unique
739 * name, and define it to return the given expression (which may
740 * not contain free variables). */
741 explicit Func(const Expr &e);
742
743 /** Construct a new Func to wrap an existing, already-defined
744 * Function object. */
745 explicit Func(Internal::Function f);
746
747 /** Construct a new Func to wrap a Buffer. */
748 template<typename T, int Dims>
749 HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
750 : Func() {
751 (*this)(_) = im(_);
752 }
753
754 /** Evaluate this function over some rectangular domain and return
755 * the resulting buffer or buffers. Performs compilation if the
756 * Func has not previously been realized and compile_jit has not
757 * been called. If the final stage of the pipeline is on the GPU,
758 * data is copied back to the host before being returned. The
759 * returned Realization should probably be instantly converted to
760 * a Buffer class of the appropriate type. That is, do this:
761 *
762 \code
763 f(x) = sin(x);
764 Buffer<float> im = f.realize(...);
765 \endcode
766 *
767 * If your Func has multiple values, because you defined it using
768 * a Tuple, then casting the result of a realize call to a buffer
769 * or image will produce a run-time error. Instead you should do the
770 * following:
771 *
772 \code
773 f(x) = Tuple(x, sin(x));
774 Realization r = f.realize(...);
775 Buffer<int> im0 = r[0];
776 Buffer<float> im1 = r[1];
777 \endcode
778 *
779 * In Halide formal arguments of a computation are specified using
780 * Param<T> and ImageParam objects in the expressions defining the
781 * computation. The param_map argument to realize allows
782 * specifying a set of per-call parameters to be used for a
783 * specific computation. This method is thread-safe where the
784 * globals used by Param<T> and ImageParam are not. Any parameters
785 * that are not in the param_map are taken from the global values,
786 * so those can continue to be used if they are not changing
787 * per-thread.
788 *
789 * One can explicitly construct a ParamMap and
790 * use its set method to insert Parameter to scalar or Buffer
791 * value mappings:
792 *
793 \code
794 Param<int32_t> p(42);
795 ImageParam img(Int(32), 1);
796 f(x) = img(x) + p;
797
798 Buffer<int32_t> arg_img(10, 10);
799 <fill in arg_img...>
800 ParamMap params;
801 params.set(p, 17);
802 params.set(img, arg_img);
803
804 Target t = get_jit_target_from_environment();
805 Buffer<int32_t> result = f.realize({10, 10}, t, params);
806 \endcode
807 *
808 * Alternatively, an initializer list can be used
809 * directly in the realize call to pass this information:
810 *
811 \code
812 Param<int32_t> p(42);
813 ImageParam img(Int(32), 1);
814 f(x) = img(x) + p;
815
816 Buffer<int32_t> arg_img(10, 10);
817 <fill in arg_img...>
818
819 Target t = get_jit_target_from_environment();
820 Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
821 \endcode
822 *
823 * If the Func cannot be realized into a buffer of the given size
824 * due to scheduling constraints on scattering update definitions,
825 * it will be realized into a larger buffer of the minimum size
826 * possible, and a cropped view at the requested size will be
827 * returned. It is thus not safe to assume the returned buffers
828 * are contiguous in memory. This behavior can be disabled with
829 * the NoBoundsQuery target flag, in which case an error about
830 * writing out of bounds on the output buffer will trigger
831 * instead.
832 *
833 */
834 Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
835 const ParamMap &param_map = ParamMap::empty_map());
836
837 /** Same as above, but takes a custom user-provided context to be
838 * passed to runtime functions. This can be used to pass state to
839 * runtime overrides in a thread-safe manner. A nullptr context is
840 * legal, and is equivalent to calling the variant of realize
841 * that does not take a context. */
842 Realization realize(JITUserContext *context,
843 std::vector<int32_t> sizes = {},
844 const Target &target = Target(),
845 const ParamMap &param_map = ParamMap::empty_map());
846
847 /** Evaluate this function into an existing allocated buffer or
848 * buffers. If the buffer is also one of the arguments to the
849 * function, strange things may happen, as the pipeline isn't
850 * necessarily safe to run in-place. If you pass multiple buffers,
851 * they must have matching sizes. This form of realize does *not*
852 * automatically copy data back from the GPU. */
853 void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
854 const ParamMap &param_map = ParamMap::empty_map());
855
856 /** Same as above, but takes a custom user-provided context to be
857 * passed to runtime functions. This can be used to pass state to
858 * runtime overrides in a thread-safe manner. A nullptr context is
859 * legal, and is equivalent to calling the variant of realize
860 * that does not take a context. */
861 void realize(JITUserContext *context,
862 Pipeline::RealizationArg outputs,
863 const Target &target = Target(),
864 const ParamMap &param_map = ParamMap::empty_map());
865
866 /** For a given size of output, or a given output buffer,
867 * determine the bounds required of all unbound ImageParams
868 * referenced. Communicates the result by allocating new buffers
869 * of the appropriate size and binding them to the unbound
870 * ImageParams.
871 *
872 * See the documentation for Func::realize regarding the
873 * ParamMap. There is one difference in that input Buffer<>
874 * arguments that are being inferred are specified as a pointer to
875 * the Buffer<> in the ParamMap. E.g.
876 *
877 \code
878 Param<int32_t> p(42);
879 ImageParam img(Int(32), 1);
880 f(x) = img(x) + p;
881
882 Target t = get_jit_target_from_environment();
883 Buffer<> in;
884 f.infer_input_bounds({10, 10}, t, { { img, &in } });
885 \endcode
886 * On return, in will be an allocated buffer of the correct size
887 * to evaluate f over a 10x10 region.
888 */
889 // @{
890 void infer_input_bounds(const std::vector<int32_t> &sizes,
891 const Target &target = get_jit_target_from_environment(),
892 const ParamMap &param_map = ParamMap::empty_map());
893 void infer_input_bounds(Pipeline::RealizationArg outputs,
894 const Target &target = get_jit_target_from_environment(),
895 const ParamMap &param_map = ParamMap::empty_map());
896 // @}
897
898 /** Versions of infer_input_bounds that take a custom user context
899 * to pass to runtime functions. */
900 // @{
901 void infer_input_bounds(JITUserContext *context,
902 const std::vector<int32_t> &sizes,
903 const Target &target = get_jit_target_from_environment(),
904 const ParamMap &param_map = ParamMap::empty_map());
905 void infer_input_bounds(JITUserContext *context,
906 Pipeline::RealizationArg outputs,
907 const Target &target = get_jit_target_from_environment(),
908 const ParamMap &param_map = ParamMap::empty_map());
909 // @}
910 /** Statically compile this function to llvm bitcode, with the
911 * given filename (which should probably end in .bc), type
912 * signature, and C function name (which defaults to the same name
913 * as this halide function). */
914 //@{
915 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
916 const Target &target = get_target_from_environment());
917 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
918 const Target &target = get_target_from_environment());
919 // @}
920
921 /** Statically compile this function to llvm assembly, with the
922 * given filename (which should probably end in .ll), type
923 * signature, and C function name (which defaults to the same name
924 * as this halide function). */
925 //@{
926 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
927 const Target &target = get_target_from_environment());
928 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
929 const Target &target = get_target_from_environment());
930 // @}
931
932 /** Statically compile this function to an object file, with the
933 * given filename (which should probably end in .o or .obj), type
934 * signature, and C function name (which defaults to the same name
935 * as this halide function). You probably don't want to use this
936 * directly; call compile_to_static_library or compile_to_file instead. */
937 //@{
938 void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
939 const Target &target = get_target_from_environment());
940 void compile_to_object(const std::string &filename, const std::vector<Argument> &,
941 const Target &target = get_target_from_environment());
942 // @}
943
944 /** Emit a header file with the given filename for this
945 * function. The header will define a function with the type
946 * signature given by the second argument, and a name given by the
947 * third. The name defaults to the same name as this halide
948 * function. You don't actually have to have defined this function
949 * yet to call this. You probably don't want to use this directly;
950 * call compile_to_static_library or compile_to_file instead. */
951 void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
952 const Target &target = get_target_from_environment());
953
954 /** Statically compile this function to text assembly equivalent
955 * to the object file generated by compile_to_object. This is
956 * useful for checking what Halide is producing without having to
957 * disassemble anything, or if you need to feed the assembly into
958 * some custom toolchain to produce an object file (e.g. iOS) */
959 //@{
960 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
961 const Target &target = get_target_from_environment());
962 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
963 const Target &target = get_target_from_environment());
964 // @}
965
966 /** Statically compile this function to C source code. This is
967 * useful for providing fallback code paths that will compile on
968 * many platforms. Vectorization will fail, and parallelization
969 * will produce serial code. */
970 void compile_to_c(const std::string &filename,
971 const std::vector<Argument> &,
972 const std::string &fn_name = "",
973 const Target &target = get_target_from_environment());
974
975 /** Write out an internal representation of lowered code. Useful
976 * for analyzing and debugging scheduling. Can emit html or plain
977 * text. */
978 void compile_to_lowered_stmt(const std::string &filename,
979 const std::vector<Argument> &args,
980 StmtOutputFormat fmt = Text,
981 const Target &target = get_target_from_environment());
982
983 /** Write out the loop nests specified by the schedule for this
984 * Function. Helpful for understanding what a schedule is
985 * doing. */
986 void print_loop_nest();
987
988 /** Compile to object file and header pair, with the given
989 * arguments. The name defaults to the same name as this halide
990 * function.
991 */
992 void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
993 const std::string &fn_name = "",
994 const Target &target = get_target_from_environment());
995
996 /** Compile to static-library file and header pair, with the given
997 * arguments. The name defaults to the same name as this halide
998 * function.
999 */
1000 void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
1001 const std::string &fn_name = "",
1002 const Target &target = get_target_from_environment());
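    /* For illustration: a minimal ahead-of-time compilation sketch (the "brighter"
     * pipeline and its parameters are hypothetical):
     \code
     ImageParam input(UInt(8), 2);
     Param<uint8_t> offset;
     Var x("x"), y("y");
     Func brighter("brighter");
     brighter(x, y) = input(x, y) + offset;
     brighter.compile_to_static_library("brighter", {input, offset}, "brighter");
     \endcode
     */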
1003
1004 /** Compile to static-library file and header pair once for each target;
1005 * each resulting function will be considered (in order) via halide_can_use_target_features()
1006 * at runtime, with the first appropriate match being selected for subsequent use.
1007 * This is typically useful for specializations that may vary unpredictably by machine
1008 * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
1009 * All targets must have identical arch-os-bits.
1010 */
1011 void compile_to_multitarget_static_library(const std::string &filename_prefix,
1012 const std::vector<Argument> &args,
1013 const std::vector<Target> &targets);
1014
1015 /** Like compile_to_multitarget_static_library(), except that the object files
1016 * are all output as object files (rather than bundled into a static library).
1017 *
1018 * `suffixes` is an optional list of strings to use for as the suffix for each object
1019 * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1020 * will be used for each suffix.)
1021 *
1022 * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1023 * will be generated with the filename `${filename_prefix}_wrapper.o`
1024 *
1025 * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1026 * will be generated with the filename `${filename_prefix}_runtime.o`
1027 */
1028 void compile_to_multitarget_object_files(const std::string &filename_prefix,
1029 const std::vector<Argument> &args,
1030 const std::vector<Target> &targets,
1031 const std::vector<std::string> &suffixes);
1032
1033 /** Store an internal representation of lowered code as a self
1034 * contained Module suitable for further compilation. */
1035 Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1036 const Target &target = get_target_from_environment());
1037
1038 /** Compile and generate multiple target files with single call.
1039 * Deduces target files based on filenames specified in
1040 * output_files map.
1041 */
1042 void compile_to(const std::map<OutputFileType, std::string> &output_files,
1043 const std::vector<Argument> &args,
1044 const std::string &fn_name,
1045 const Target &target = get_target_from_environment());
1046
1047 /** Eagerly jit compile the function to machine code. This
1048 * normally happens on the first call to realize. If you're
1049 * running your halide pipeline inside time-sensitive code and
1050 * wish to avoid including the time taken to compile a pipeline,
1051 * then you can call this ahead of time. Default is to use the Target
1052 * returned from Halide::get_jit_target_from_environment()
1053 */
1054 void compile_jit(const Target &target = get_jit_target_from_environment());
1055
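    /* For illustration: a minimal sketch of paying the JIT cost up front, before
     * entering time-sensitive code:
     \code
     Func f("f");
     Var x("x");
     f(x) = x * 2;
     f.compile_jit();                       // compile now
     Buffer<int> out = f.realize({1024});   // later calls reuse the compiled code
     \endcode
     */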
1056 /** Deprecated variants of the above that use a void pointer
1057 * instead of a JITUserContext pointer. */
1058 // @{
1059 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1060 void set_error_handler(void (*handler)(void *, const char *));
1061 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1062 void set_custom_allocator(void *(*malloc)(void *, size_t),
1063 void (*free)(void *, void *));
1064 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1065 void set_custom_do_task(
1066 int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),
1067 int, uint8_t *));
1068 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1069 void set_custom_do_par_for(
1070 int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,
1071 int, uint8_t *));
1072 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1073 void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
1074
1075 HALIDE_ATTRIBUTE_DEPRECATED("Custom handlers should by set by modifying the struct returned by jit_handlers()")
1076 void set_custom_print(void (*handler)(void *, const char *));
1077 // @}
1078
1079 /** Get a struct containing the currently set custom functions
1080 * used by JIT. This can be mutated. Changes will take effect the
1081 * next time this Func is realized. */
1082 JITHandlers &jit_handlers();
1083
1084 /** Add a custom pass to be used during lowering. It is run after
1085 * all other lowering passes. Can be used to verify properties of
1086 * the lowered Stmt, instrument it with extra code, or otherwise
1087 * modify it. The Func takes ownership of the pass, and will call
1088 * delete on it when the Func goes out of scope. So don't pass a
1089 * stack object, or share pass instances between multiple
1090 * Funcs. */
1091 template<typename T>
1092 void add_custom_lowering_pass(T *pass) {
1093 // Template instantiate a custom deleter for this type, then
1094 // wrap in a lambda. The custom deleter lives in user code, so
1095 // that deletion is on the same heap as construction (I hate Windows).
1096 add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1097 }
1098
1099 /** Add a custom pass to be used during lowering, with the
1100 * function that will be called to delete it also passed in. Set
1101 * it to nullptr if you wish to retain ownership of the object. */
1102 void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
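    /* For illustration: a hedged sketch of registering a custom lowering pass
     * (the pass type "MyCheckPass" is hypothetical; it would override the
     * IRMutator visit methods it cares about):
     \code
     class MyCheckPass : public Halide::Internal::IRMutator {
         // ... override visit()/mutate() overloads as required ...
     };
     f.add_custom_lowering_pass(new MyCheckPass);  // the Func takes ownership and deletes it
     \endcode
     */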
1103
1104 /** Remove all previously-set custom lowering passes */
1105 void clear_custom_lowering_passes();
1106
1107 /** Get the custom lowering passes. */
1108 const std::vector<CustomLoweringPass> &custom_lowering_passes();
1109
1110 /** When this function is compiled, include code that dumps its
1111 * values to a file after it is realized, for the purpose of
1112 * debugging.
1113 *
1114 * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1115 * is in TIFF format and can be read by standard tools. Otherwise, the
1116 * file format is as follows:
1117 *
1118 * All data is in the byte-order of the target platform. First, a
1119 * 20-byte header containing four 32-bit ints, giving the extents
1120 * of the first four dimensions. Dimensions beyond four are
1121 * folded into the fourth. Then, a fifth 32-bit int giving the
1122 * data type of the function. The typecodes are given by: float =
1123 * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1124 * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1125 * data follows the header, as a densely packed array of the given
1126 * size and the given type. If given the extension .tmp, this file
1127 * format can be natively read by the program ImageStack. */
1128 void debug_to_file(const std::string &filename);
1129
1130 /** The name of this function, either given during construction,
1131 * or automatically generated. */
1132 const std::string &name() const;
1133
1134 /** Get the pure arguments. */
1135 std::vector<Var> args() const;
1136
1137 /** The right-hand-side value of the pure definition of this
1138 * function. Causes an error if there's no pure definition, or if
1139 * the function is defined to return multiple values. */
1140 Expr value() const;
1141
1142 /** The values returned by this function. An error if the function
1143 * has not been defined. Returns a Tuple with one element for
1144 * functions defined to return a single value. */
1145 Tuple values() const;
1146
1147 /** Does this function have at least a pure definition. */
1148 bool defined() const;
1149
1150 /** Get the left-hand-side of the update definition. An empty
1151 * vector if there's no update definition. If there are
1152 * multiple update definitions for this function, use the
1153 * argument to select which one you want. */
1154 const std::vector<Expr> &update_args(int idx = 0) const;
1155
1156 /** Get the right-hand-side of an update definition. An error if
1157 * there's no update definition. If there are multiple
1158 * update definitions for this function, use the argument to
1159 * select which one you want. */
1160 Expr update_value(int idx = 0) const;
1161
1162 /** Get the right-hand-side of an update definition for
1163 * functions that return multiple values. An error if there's no
1164 * update definition. Returns a Tuple with one element for
1165 * functions that return a single value. */
1166 Tuple update_values(int idx = 0) const;
1167
1168 /** Get the RVars of the reduction domain for an update definition, if there is
1169 * one. */
1170 std::vector<RVar> rvars(int idx = 0) const;
1171
1172 /** Does this function have at least one update definition? */
1173 bool has_update_definition() const;
1174
1175 /** How many update definitions does this function have? */
1176 int num_update_definitions() const;
1177
1178 /** Is this function an external stage? That is, was it defined
1179 * using define_extern? */
1180 bool is_extern() const;
1181
1182 /** Add an extern definition for this Func. This lets you define a
1183 * Func that represents an external pipeline stage. You can, for
1184 * example, use it to wrap a call to an extern library such as
1185 * fftw. */
1186 // @{
1187 void define_extern(const std::string &function_name,
1188 const std::vector<ExternFuncArgument> &params, Type t,
1189 int dimensionality,
1190 NameMangling mangling = NameMangling::Default,
1191 DeviceAPI device_api = DeviceAPI::Host) {
1192 define_extern(function_name, params, t,
1193 Internal::make_argument_list(dimensionality), mangling,
1194 device_api);
1195 }
1196
1197 void define_extern(const std::string &function_name,
1198 const std::vector<ExternFuncArgument> &params,
1199 const std::vector<Type> &types, int dimensionality,
1200 NameMangling mangling) {
1201 define_extern(function_name, params, types,
1202 Internal::make_argument_list(dimensionality), mangling);
1203 }
1204
1205 void define_extern(const std::string &function_name,
1206 const std::vector<ExternFuncArgument> &params,
1207 const std::vector<Type> &types, int dimensionality,
1208 NameMangling mangling = NameMangling::Default,
1209 DeviceAPI device_api = DeviceAPI::Host) {
1210 define_extern(function_name, params, types,
1211 Internal::make_argument_list(dimensionality), mangling,
1212 device_api);
1213 }
1214
1215 void define_extern(const std::string &function_name,
1216 const std::vector<ExternFuncArgument> &params, Type t,
1217 const std::vector<Var> &arguments,
1218 NameMangling mangling = NameMangling::Default,
1219 DeviceAPI device_api = DeviceAPI::Host) {
1220 define_extern(function_name, params, std::vector<Type>{t}, arguments,
1221 mangling, device_api);
1222 }
1223
1224 void define_extern(const std::string &function_name,
1225 const std::vector<ExternFuncArgument> &params,
1226 const std::vector<Type> &types,
1227 const std::vector<Var> &arguments,
1228 NameMangling mangling = NameMangling::Default,
1229 DeviceAPI device_api = DeviceAPI::Host);
1230 // @}
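    /* For illustration: a hedged sketch of declaring an extern stage (the C
     * function name "blur_extern" is hypothetical):
     \code
     ImageParam input(UInt(16), 2);
     Func blur("blur");
     // Implemented elsewhere as:
     //   extern "C" int blur_extern(halide_buffer_t *in, halide_buffer_t *out);
     blur.define_extern("blur_extern", {input}, UInt(16), 2);
     \endcode
     */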
1231
1232 /** Get the types of the outputs of this Func. */
1233 const std::vector<Type> &output_types() const;
1234
1235 /** Get the number of outputs of this Func. Corresponds to the
1236 * size of the Tuple this Func was defined to return. */
1237 int outputs() const;
1238
1239 /** Get the name of the extern function called for an extern
1240 * definition. */
1241 const std::string &extern_function_name() const;
1242
1243 /** The dimensionality (number of arguments) of this
1244 * function. Zero if the function is not yet defined. */
1245 int dimensions() const;
1246
1247 /** Construct either the left-hand-side of a definition, or a call
1248 * to a function that happens to only contain vars as
1249 * arguments. If the function has already been defined, and fewer
1250 * arguments are given than the function has dimensions, then
1251 * enough implicit vars are added to the end of the argument list
1252 * to make up the difference (see \ref Var::implicit) */
1253 // @{
1254 FuncRef operator()(std::vector<Var>) const;
1255
1256 template<typename... Args>
1257 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1258 operator()(Args &&...args) const {
1259 std::vector<Var> collected_args{std::forward<Args>(args)...};
1260 return this->operator()(collected_args);
1261 }
1262 // @}
1263
1264 /** Either calls to the function, or the left-hand-side of
1265 * an update definition (see \ref RDom). If the function has
1266 * already been defined, and fewer arguments are given than the
1267 * function has dimensions, then enough implicit vars are added to
1268 * the end of the argument list to make up the difference. (see
1269 * \ref Var::implicit)*/
1270 // @{
1271 FuncRef operator()(std::vector<Expr>) const;
1272
1273 template<typename... Args>
1274 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1275 operator()(const Expr &x, Args &&...args) const {
1276 std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1277 return (*this)(collected_args);
1278 }
1279 // @}
1280
1281 /** Creates and returns a new identity Func that wraps this Func. During
1282 * compilation, Halide replaces all calls to this Func done by 'f'
1283 * with calls to the wrapper. If this Func is already wrapped for
1284 * use in 'f', this will return the existing wrapper.
1285 *
1286 * For example, g.in(f) would rewrite a pipeline like this:
1287 \code
1288 g(x, y) = ...
1289 f(x, y) = ... g(x, y) ...
1290 \endcode
1291 * into a pipeline like this:
1292 \code
1293 g(x, y) = ...
1294 g_wrap(x, y) = g(x, y)
1295 f(x, y) = ... g_wrap(x, y)
1296 \endcode
1297 *
1298 * This has a variety of uses. You can use it to schedule this
1299 * Func differently in the different places it is used:
1300 \code
1301 g(x, y) = ...
1302 f1(x, y) = ... g(x, y) ...
1303 f2(x, y) = ... g(x, y) ...
1304 g.in(f1).compute_at(f1, y).vectorize(x, 8);
1305 g.in(f2).compute_at(f2, x).unroll(x);
1306 \endcode
1307 *
1308 * You can also use it to stage loads from this Func via some
1309 * intermediate buffer (perhaps on the stack as in
1310 * test/performance/block_transpose.cpp, or in shared GPU memory
1311 * as in test/performance/wrap.cpp). In this case we compute the
1312 * wrapper at tiles of the consuming Funcs like so:
1313 \code
1314 g.compute_root()...
1315 g.in(f).compute_at(f, tiles)...
1316 \endcode
1317 *
1318 * Func::in() can also be used to compute pieces of a Func into a
1319 * smaller scratch buffer (perhaps on the GPU) and then copy them
1320 * into a larger output buffer one tile at a time. See
1321 * apps/interpolate/interpolate.cpp for an example of this. In
1322 * this case we compute the Func at tiles of its own wrapper:
1323 \code
1324 f.in(g).compute_root().gpu_tile(...)...
1325 f.compute_at(f.in(g), tiles)...
1326 \endcode
1327 *
1328 * A similar use of Func::in() is wrapping Funcs with multiple update
1329 * stages in a pure wrapper. The following code:
1330 \code
1331 f(x, y) = x + y;
1332 f(x, y) += 5;
1333 g(x, y) = f(x, y);
1334 f.compute_root();
1335 \endcode
1336 *
1337 * Is equivalent to:
1338 \code
1339 for y:
1340 for x:
1341 f(x, y) = x + y;
1342 for y:
1343 for x:
1344 f(x, y) += 5
1345 for y:
1346 for x:
1347 g(x, y) = f(x, y)
1348 \endcode
1349 * Using Func::in(), we can write:
1350 \code
1351 f(x, y) = x + y;
1352 f(x, y) += 5;
1353 g(x, y) = f(x, y);
1354 f.in(g).compute_root();
1355 \endcode
1356 * which instead produces:
1357 \code
1358 for y:
1359 for x:
1360 f(x, y) = x + y;
1361 f(x, y) += 5
1362 f_wrap(x, y) = f(x, y)
1363 for y:
1364 for x:
1365 g(x, y) = f_wrap(x, y)
1366 \endcode
1367 */
1368 Func in(const Func &f);
1369
1370 /** Create and return an identity wrapper shared by all the Funcs in
1371 * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1372 * this will throw an error. */
1373 Func in(const std::vector<Func> &fs);
1374
1375 /** Create and return a global identity wrapper, which wraps all calls to
1376 * this Func by any other Func. If a global wrapper already exists,
1377 * returns it. The global identity wrapper is only used by callers for
1378 * which no custom wrapper has been specified.
1379 */
1380 Func in();
1381
1382 /** Similar to \ref Func::in; however, instead of replacing the call to
1383 * this Func with an identity Func that refers to it, this replaces the
1384 * call with a clone of this Func.
1385 *
1386 * For example, f.clone_in(g) would rewrite a pipeline like this:
1387 \code
1388 f(x, y) = x + y;
1389 g(x, y) = f(x, y) + 2;
1390 h(x, y) = f(x, y) - 3;
1391 \endcode
1392 * into a pipeline like this:
1393 \code
1394 f(x, y) = x + y;
1395 f_clone(x, y) = x + y;
1396 g(x, y) = f_clone(x, y) + 2;
1397 h(x, y) = f(x, y) - 3;
1398 \endcode
1399 *
1400 */
1401 //@{
1402 Func clone_in(const Func &f);
1403 Func clone_in(const std::vector<Func> &fs);
1404 //@}
1405
1406 /** Declare that this function should be implemented by a call to
1407 * halide_buffer_copy with the given target device API. Asserts
1408 * that the Func has a pure definition which is a simple call to a
1409 * single input, and no update definitions. The wrapper Funcs
1410 * returned by in() are suitable candidates. Consumes all pure
1411 * variables, and rewrites the Func to have an extern definition
1412 * that calls halide_buffer_copy. */
1413 Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1414
1415 /** Declare that this function should be implemented by a call to
1416 * halide_buffer_copy with a NULL target device API. Equivalent to
1417 * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1418 * pure definition which is a simple call to a single input, and
1419 * no update definitions. The wrapper Funcs returned by in() are
1420 * suitable candidates. Consumes all pure variables, and rewrites
1421 * the Func to have an extern definition that calls
1422 * halide_buffer_copy.
1423 *
1424 * Note that if the source Func is already valid in host memory,
1425 * this compiles to code that does the minimum number of calls to
1426 * memcpy.
1427 */
1428 Func copy_to_host();
1429
1430 /** Split a dimension into inner and outer subdimensions with the
1431 * given names, where the inner dimension iterates from 0 to
1432 * factor-1. The inner and outer subdimensions can then be dealt
1433 * with using the other scheduling calls. It's ok to reuse the old
1434 * variable name as either the inner or outer variable. The final
1435 * argument specifies how the tail should be handled if the split
1436 * factor does not provably divide the extent. */
1437 Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1438
1439 /** Join two dimensions into a single fused dimension. The fused
1440 * dimension covers the product of the extents of the inner and
1441 * outer dimensions given. */
1442 Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1443
1444 /** Mark a dimension to be traversed serially. This is the default. */
1445 Func &serial(const VarOrRVar &var);
1446
1447 /** Mark a dimension to be traversed in parallel */
1448 Func &parallel(const VarOrRVar &var);
1449
1450 /** Split a dimension by the given task_size, and then parallelize the
1451 * outer dimension. This creates parallel tasks that have size
1452 * task_size. After this call, var refers to the outer dimension of
1453 * the split. The inner dimension has a new anonymous name. If you
1454 * wish to mutate it, or schedule with respect to it, do the split
1455 * manually. */
1456 Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
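    /* For illustration: a minimal sketch, assuming a Func f and Var y are
     * already defined:
     \code
     f.parallel(y, 16);  // roughly: split y by 16 (anonymous inner var), then parallelize the outer loop
     \endcode
     */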
1457
1458 /** Mark a dimension to be computed all-at-once as a single
1459 * vector. The dimension should have constant extent -
1460 * e.g. because it is the inner dimension following a split by a
1461 * constant factor. For most uses of vectorize you want the two
1462 * argument form. The variable to be vectorized should be the
1463 * innermost one. */
1464 Func &vectorize(const VarOrRVar &var);
1465
1466 /** Mark a dimension to be completely unrolled. The dimension
1467 * should have constant extent - e.g. because it is the inner
1468 * dimension following a split by a constant factor. For most uses
1469 * of unroll you want the two-argument form. */
1470 Func &unroll(const VarOrRVar &var);
1471
1472 /** Split a dimension by the given factor, then vectorize the
1473 * inner dimension. This is how you vectorize a loop of unknown
1474 * size. The variable to be vectorized should be the innermost
1475 * one. After this call, var refers to the outer dimension of the
1476 * split. 'factor' must be an integer. */
1477 Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
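 /* For illustration, a hypothetical schedule that vectorizes the
  * innermost x dimension in groups of 8 lanes.
 \code
 Func f;
 Var x, y;
 f(x, y) = x + y;
 // Splits x by 8 and vectorizes the new inner loop.
 f.vectorize(x, 8);
 \endcode
  */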
1478
1479 /** Split a dimension by the given factor, then unroll the inner
1480 * dimension. This is how you unroll a loop of unknown size by
1481 * some constant factor. After this call, var refers to the outer
1482 * dimension of the split. 'factor' must be an integer. */
1483 Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1484
1485 /** Statically declare that the range over which a function should
1486 * be evaluated is given by the second and third arguments. This
1487 * can let Halide perform some optimizations. E.g. if you know
1488 * there are going to be 4 color channels, you can completely
1489 * vectorize the color channel dimension without the overhead of
1490 * splitting it up. If bounds inference decides that it requires
1491 * more of this function than the bounds you have stated, a
1492 * runtime error will occur when you try to run your pipeline. */
1493 Func &bound(const Var &var, Expr min, Expr extent);
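 /* For illustration, a hypothetical promise that the channel dimension c
  * is always evaluated over exactly [0, 4), so it can be unrolled without
  * splitting.
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.bound(c, 0, 4).unroll(c);
 \endcode
  */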
1494
1495 /** Statically declare the range over which the function will be
1496 * evaluated in the general case. This provides a basis for the auto
1497 * scheduler to make trade-offs and scheduling decisions. The auto
1498 * generated schedules might break when the sizes of the dimensions are
1499 * very different from the estimates specified. These estimates are used
1500 * only by the auto scheduler if the function is a pipeline output. */
1501 Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1502
1503 /** Set (min, extent) estimates for all dimensions in the Func
1504 * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1505 * repeatedly, but slightly terser. The size of the estimates vector
1506 * must match the dimensionality of the Func. */
1507 Func &set_estimates(const Region &estimates);
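 /* For illustration, hypothetical (min, extent) estimates for a
  * 1920x1080 three-channel output, for the benefit of the auto-scheduler.
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});
 \endcode
  */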
1508
1509 /** Expand the region computed so that the min coordinate is
1510 * congruent to 'remainder' modulo 'modulus', and the extent is a
1511 * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1512 * the min and extent realized to be even, and calling
1513 * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1514 * to be even. The region computed always contains the region that
1515 * would have been computed without this directive, so no
1516 * assertions are injected.
1517 */
1518 Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1519
1520 /** Expand the region computed so that the extent is a
1521 * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1522 * the extent realized to be even. The region computed always contains the
1523 * region that would have been computed without this directive, so no
1524 * assertions are injected. (This is essentially equivalent to align_bounds(),
1525 * but always leaving the min untouched.)
1526 */
1527 Func &align_extent(const Var &var, Expr modulus);
1528
1529 /** Bound the extent of a Func's realization, but not its
1530 * min. This means the dimension can be unrolled or vectorized
1531 * even when its min is not fixed (for example because it is
1532 * compute_at tiles of another Func). This can also be useful for
1533 * forcing a function's allocation to be a fixed size, which often
1534 * means it can go on the stack. */
1535 Func &bound_extent(const Var &var, Expr extent);
1536
1537 /** Split two dimensions at once by the given factors, and then
1538 * reorder the resulting dimensions to be xi, yi, xo, yo from
1539 * innermost outwards. This gives a tiled traversal. */
1540 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1541 const VarOrRVar &xo, const VarOrRVar &yo,
1542 const VarOrRVar &xi, const VarOrRVar &yi,
1543 const Expr &xfactor, const Expr &yfactor,
1544 TailStrategy tail = TailStrategy::Auto);
1545
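 /* For illustration, a hypothetical 64x64 tiled traversal of a
  * two-dimensional Func.
 \code
 Func f;
 Var x, y, xo, yo, xi, yi;
 f(x, y) = x + y;
 // Loop order after this call is xi, yi, xo, yo (innermost outwards).
 f.tile(x, y, xo, yo, xi, yi, 64, 64);
 \endcode
  */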
1546 /** A shorter form of tile, which reuses the old variable names as
1547 * the new outer dimensions */
1548 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1549 const VarOrRVar &xi, const VarOrRVar &yi,
1550 const Expr &xfactor, const Expr &yfactor,
1551 TailStrategy tail = TailStrategy::Auto);
1552
1553 /** A more general form of tile, which defines tiles of any dimensionality. */
1554 Func &tile(const std::vector<VarOrRVar> &previous,
1555 const std::vector<VarOrRVar> &outers,
1556 const std::vector<VarOrRVar> &inners,
1557 const std::vector<Expr> &factors,
1558 const std::vector<TailStrategy> &tails);
1559
1560 /** The generalized tile, with a single tail strategy to apply to all vars. */
1561 Func &tile(const std::vector<VarOrRVar> &previous,
1562 const std::vector<VarOrRVar> &outers,
1563 const std::vector<VarOrRVar> &inners,
1564 const std::vector<Expr> &factors,
1565 TailStrategy tail = TailStrategy::Auto);
1566
1567 /** Generalized tiling, reusing the previous names as the outer names. */
1568 Func &tile(const std::vector<VarOrRVar> &previous,
1569 const std::vector<VarOrRVar> &inners,
1570 const std::vector<Expr> &factors,
1571 TailStrategy tail = TailStrategy::Auto);
1572
1573 /** Reorder variables to have the given nesting order, from
1574 * innermost out */
1575 Func &reorder(const std::vector<VarOrRVar> &vars);
1576
1577 template<typename... Args>
1578 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1579 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1580 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1581 return reorder(collected_args);
1582 }
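 /* For illustration, a hypothetical traversal with c innermost and y
  * outermost.
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.reorder(c, x, y);
 \endcode
  */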
1583
1584 /** Rename a dimension. Equivalent to split with an inner size of one. */
1585 Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1586
1587 /** Specify that race conditions are permitted for this Func,
1588 * which enables parallelizing over RVars even when Halide cannot
1589 * prove that it is safe to do so. Use this with great caution,
1590 * and only if you can prove to yourself that this is safe, as it
1591 * may result in a non-deterministic routine that returns
1592 * different values at different times or on different machines. */
1593 Func &allow_race_conditions();
1594
1595 /** Issue atomic updates for this Func. This allows parallelization
1596 * on associative RVars. The function throws a compile error when
1597 * Halide fails to prove associativity. Use override_associativity_test
1598 * to disable the associativity test if you believe the function is
1599 * associative or the order of reduction variable execution does not
1600 * matter.
1601 * Halide compiles this into hardware atomic operations whenever possible,
1602 * and falls back to a mutex lock per storage element if it is impossible
1603 * to atomically update.
1604 * There are three possible outcomes of the compiled code:
1605 * atomic add, compare-and-swap loop, and mutex lock.
1606 * For example:
1607 *
1608 * hist(x) = 0;
1609 * hist(im(r)) += 1;
1610 * hist.compute_root();
1611 * hist.update().atomic().parallel();
1612 *
1613 * will be compiled to atomic add operations.
1614 *
1615 * hist(x) = 0;
1616 * hist(im(r)) = min(hist(im(r)) + 1, 100);
1617 * hist.compute_root();
1618 * hist.update().atomic().parallel();
1619 *
1620 * will be compiled to compare-and-swap loops.
1621 *
1622 * arg_max() = {0, im(0)};
1623 * Expr old_index = arg_max()[0];
1624 * Expr old_max = arg_max()[1];
1625 * Expr new_index = select(old_max < im(r), r, old_index);
1626 * Expr new_max = max(im(r), old_max);
1627 * arg_max() = {new_index, new_max};
1628 * arg_max.compute_root();
1629 * arg_max.update().atomic().parallel();
1630 *
1631 * will be compiled to updates guarded by a mutex lock,
1632 * since it is impossible to atomically update two different locations.
1633 *
1634 * Atomic updates are currently supported by the x86, CUDA, and OpenCL backends.
1635 * Compiling to other backends results in a compile error.
1636 * If an operation is compiled into a mutex lock, and is vectorized or is
1637 * compiled to CUDA or OpenCL, it also results in a compile error,
1638 * since a per-element mutex lock on a vectorized operation leads to
1639 * deadlock.
1640 * Vectorization of predicated RVars (through rdom.where()) on the CPU
1641 * is not yet supported (see https://github.com/halide/Halide/issues/4298).
1642 * 8-bit and 16-bit atomics on the GPU are also not supported. */
1643 Func &atomic(bool override_associativity_test = false);
1644
1645 /** Specialize a Func. This creates a special-case version of the
1646 * Func where the given condition is true. The most effective
1647 * conditions are those of the form param == value, and boolean
1648 * Params. Consider a simple example:
1649 \code
1650 f(x) = x + select(cond, 0, 1);
1651 f.compute_root();
1652 \endcode
1653 * This is equivalent to:
1654 \code
1655 for (int x = 0; x < width; x++) {
1656 f[x] = x + (cond ? 0 : 1);
1657 }
1658 \endcode
1659 * Adding the scheduling directive:
1660 \code
1661 f.specialize(cond)
1662 \endcode
1663 * makes it equivalent to:
1664 \code
1665 if (cond) {
1666 for (int x = 0; x < width; x++) {
1667 f[x] = x;
1668 }
1669 } else {
1670 for (int x = 0; x < width; x++) {
1671 f[x] = x + 1;
1672 }
1673 }
1674 \endcode
1675 * Note that the inner loops have been simplified. In the first
1676 * path Halide knows that cond is true, and in the second path
1677 * Halide knows that it is false.
1678 *
1679 * The specialized version gets its own schedule, which inherits
1680 * every directive made about the parent Func's schedule so far
1681 * except for its specializations. This method returns a handle to
1682 * the new schedule. If you wish to retrieve the specialized
1683 * sub-schedule again later, you can call this method with the
1684 * same condition. Consider the following example of scheduling
1685 * the specialized version:
1686 *
1687 \code
1688 f(x) = x;
1689 f.compute_root();
1690 f.specialize(width > 1).unroll(x, 2);
1691 \endcode
1692 * Assuming for simplicity that width is even, this is equivalent to:
1693 \code
1694 if (width > 1) {
1695 for (int x = 0; x < width/2; x++) {
1696 f[2*x] = 2*x;
1697 f[2*x + 1] = 2*x + 1;
1698 }
1699 } else {
1700 for (int x = 0; x < width; x++) {
1701 f[x] = x;
1702 }
1703 }
1704 \endcode
1705 * For this case, it may be better to schedule the un-specialized
1706 * case instead:
1707 \code
1708 f(x) = x;
1709 f.compute_root();
1710 f.specialize(width == 1); // Creates a copy of the schedule so far.
1711 f.unroll(x, 2); // Only applies to the unspecialized case.
1712 \endcode
1713 * This is equivalent to:
1714 \code
1715 if (width == 1) {
1716 f[0] = 0;
1717 } else {
1718 for (int x = 0; x < width/2; x++) {
1719 f[2*x] = 2*x;
1720 f[2*x + 1] = 2*x + 1;
1721 }
1722 }
1723 \endcode
1724 * This can be a good way to write a pipeline that splits,
1725 * vectorizes, or tiles, but can still handle small inputs.
1726 *
1727 * If a Func has several specializations, the first matching one
1728 * will be used, so the order in which you define specializations
1729 * is significant. For example:
1730 *
1731 \code
1732 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1733 f.specialize(cond1);
1734 f.specialize(cond2);
1735 \endcode
1736 * is equivalent to:
1737 \code
1738 if (cond1) {
1739 for (int x = 0; x < width; x++) {
1740 f[x] = x + a - (cond2 ? c : d);
1741 }
1742 } else if (cond2) {
1743 for (int x = 0; x < width; x++) {
1744 f[x] = x + b - c;
1745 }
1746 } else {
1747 for (int x = 0; x < width; x++) {
1748 f[x] = x + b - d;
1749 }
1750 }
1751 \endcode
1752 *
1753 * Specializations may in turn be specialized, which creates a
1754 * nested if statement in the generated code.
1755 *
1756 \code
1757 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1758 f.specialize(cond1).specialize(cond2);
1759 \endcode
1760 * This is equivalent to:
1761 \code
1762 if (cond1) {
1763 if (cond2) {
1764 for (int x = 0; x < width; x++) {
1765 f[x] = x + a - c;
1766 }
1767 } else {
1768 for (int x = 0; x < width; x++) {
1769 f[x] = x + a - d;
1770 }
1771 }
1772 } else {
1773 for (int x = 0; x < width; x++) {
1774 f[x] = x + b - (cond2 ? c : d);
1775 }
1776 }
1777 \endcode
1778 * To create a 4-way if statement that simplifies away all of the
1779 * ternary operators above, you could say:
1780 \code
1781 f.specialize(cond1).specialize(cond2);
1782 f.specialize(cond2);
1783 \endcode
1784 * or
1785 \code
1786 f.specialize(cond1 && cond2);
1787 f.specialize(cond1);
1788 f.specialize(cond2);
1789 \endcode
1790 *
1791 * Any prior Func which is compute_at some variable of this Func
1792 * gets separately included in all paths of the generated if
1793 * statement. The Var in the compute_at call must exist in all
1794 * paths, but it may have been generated via a different path of
1795 * splits, fuses, and renames. This can be used somewhat
1796 * creatively. Consider the following code:
1797 \code
1798 g(x, y) = 8*x;
1799 f(x, y) = g(x, y) + 1;
1800 f.compute_root().specialize(cond);
1801 Var g_loop;
1802 f.specialize(cond).rename(y, g_loop);
1803 f.rename(x, g_loop);
1804 g.compute_at(f, g_loop);
1805 \endcode
1806 * When cond is true, this is equivalent to g.compute_at(f,y).
1807 * When it is false, this is equivalent to g.compute_at(f,x).
1808 */
1809 Stage specialize(const Expr &condition);
1810
1811 /** Add a specialization to a Func that always terminates execution
1812 * with a call to halide_error(). By itself, this is of limited use,
1813 * but can be useful to terminate chains of specialize() calls where
1814 * no "default" case is expected (thus avoiding unnecessary code generation).
1815 *
1816 * For instance, say we want to optimize a pipeline to process images
1817 * in planar and interleaved format; we might typically do something like:
1818 \code
1819 ImageParam im(UInt(8), 3);
1820 Func f = do_something_with(im);
1821 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1822 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1823 \endcode
1824 * This code will vectorize along rows for the planar case, and across pixel
1825 * components for the interleaved case... but there is an implicit "else"
1826 * for the unhandled cases, which generates unoptimized code. If we never
1827 * anticipate passing any other sort of image to this, we can streamline
1828 * our code by adding specialize_fail():
1829 \code
1830 ImageParam im(UInt(8), 3);
1831 Func f = do_something(im);
1832 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1833 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1834 f.specialize_fail("Unhandled image format");
1835 \endcode
1836 * Conceptually, this produces code like:
1837 \code
1838 if (im.dim(0).stride() == 1) {
1839 do_something_planar();
1840 } else if (im.dim(2).stride() == 1) {
1841 do_something_interleaved();
1842 } else {
1843 halide_error("Unhandled image format");
1844 }
1845 \endcode
1846 *
1847 * Note that calling specialize_fail() terminates the specialization chain
1848 * for a given Func; you cannot create new specializations for the Func
1849 * afterwards (though you can retrieve handles to previous specializations).
1850 */
1851 void specialize_fail(const std::string &message);
1852
1853 /** Tell Halide that the following dimensions correspond to GPU
1854 * thread indices. This is useful if you compute a producer
1855 * function within the block indices of a consumer function, and
1856 * want to control how that function's dimensions map to GPU
1857 * threads. If the selected target is not an appropriate GPU, this
1858 * just marks those dimensions as parallel. */
1859 // @{
1860 Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1861 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1862 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1863 // @}
1864
1865 /** The given dimension corresponds to the lanes in a GPU
1866 * warp. GPU warp lanes are distinguished from GPU threads by the
1867 * fact that all warp lanes run together in lockstep, which
1868 * permits lightweight communication of data from one lane to
1869 * another. */
1870 Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1871
1872 /** Tell Halide to run this stage using a single gpu thread and
1873 * block. This is not an efficient use of your GPU, but it can be
1874 * useful to avoid copy-back for intermediate update stages that
1875 * touch a very small part of your Func. */
1876 Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1877
1878 /** Tell Halide that the following dimensions correspond to GPU
1879 * block indices. This is useful for scheduling stages that will
1880 * run serially within each GPU block. If the selected target is
1881 * not ptx, this just marks those dimensions as parallel. */
1882 // @{
1883 Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1885 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1886 // @}
1887
1888 /** Tell Halide that the following dimensions correspond to GPU
1889 * block indices and thread indices. If the selected target is not
1890 * ptx, these just mark the given dimensions as parallel. The
1891 * dimensions are consumed by this call, so do all other
1892 * unrolling, reordering, etc first. */
1893 // @{
1894 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1895 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1896 const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1898 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1899 // @}
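 /* For illustration, a hypothetical manual block/thread decomposition
  * mapped onto the GPU; the 16x16 tile size is arbitrary.
 \code
 Func f;
 Var x, y, bx, by, tx, ty;
 f(x, y) = x + y;
 f.split(x, bx, tx, 16)
  .split(y, by, ty, 16)
  .reorder(tx, ty, bx, by)
  .gpu(bx, by, tx, ty);
 \endcode
  */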
1900
1901 /** Short-hand for tiling a domain and mapping the tile indices
1902 * to GPU block indices and the coordinates within each tile to
1903 * GPU thread indices. Consumes the variables given, so do all
1904 * other scheduling first. */
1905 // @{
1906 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1907 TailStrategy tail = TailStrategy::Auto,
1908 DeviceAPI device_api = DeviceAPI::Default_GPU);
1909
1910 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1911 TailStrategy tail = TailStrategy::Auto,
1912 DeviceAPI device_api = DeviceAPI::Default_GPU);
1913 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1914 const VarOrRVar &bx, const VarOrRVar &by,
1915 const VarOrRVar &tx, const VarOrRVar &ty,
1916 const Expr &x_size, const Expr &y_size,
1917 TailStrategy tail = TailStrategy::Auto,
1918 DeviceAPI device_api = DeviceAPI::Default_GPU);
1919
1920 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1921 const VarOrRVar &tx, const VarOrRVar &ty,
1922 const Expr &x_size, const Expr &y_size,
1923 TailStrategy tail = TailStrategy::Auto,
1924 DeviceAPI device_api = DeviceAPI::Default_GPU);
1925
1926 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1927 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1928 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1929 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1930 TailStrategy tail = TailStrategy::Auto,
1931 DeviceAPI device_api = DeviceAPI::Default_GPU);
1932 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1933 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1934 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1935 TailStrategy tail = TailStrategy::Auto,
1936 DeviceAPI device_api = DeviceAPI::Default_GPU);
1937 // @}
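 /* For illustration, the same kind of mapping expressed with gpu_tile();
  * the block/thread Vars and the 16x16 tile size are hypothetical.
 \code
 Func f;
 Var x, y, bx, by, tx, ty;
 f(x, y) = x + y;
 f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
 \endcode
  */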
1938
1939 /** Schedule for execution on Hexagon. When a loop is marked with
1940 * Hexagon, that loop is executed on a Hexagon DSP. */
1941 Func &hexagon(const VarOrRVar &x = Var::outermost());
1942
1943 /** Prefetch data written to or read from a Func or an ImageParam by a
1944 * subsequent loop iteration, at an optionally specified iteration offset.
1945 * 'var' specifies at which loop level the prefetch calls should be inserted.
1946 * The final argument specifies how prefetching of regions outside the
1947 * bounds should be handled.
1948 *
1949 * For example, consider this pipeline:
1950 \code
1951 Func f, g;
1952 Var x, y;
1953 f(x, y) = x + y;
1954 g(x, y) = 2 * f(x, y);
1955 \endcode
1956 *
1957 * The following schedule:
1958 \code
1959 f.compute_root();
1960 g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1961 \endcode
1962 *
1963 * will inject a prefetch call at the innermost loop of 'g' and generate
1964 * the following loop nest:
1965 * for y = ...
1966 * for x = ...
1967 * f(x, y) = x + y
1968 * for y = ..
1969 * for x = ...
1970 * prefetch(&f[x + 2, y], 1, 16);
1971 * g(x, y) = 2 * f(x, y)
1972 */
1973 // @{
1974 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1975 Func &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
1976 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1977 return prefetch(f, var, var, offset, strategy);
1978 }
1979 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1980 Func &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
1981 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1982 return prefetch(param, var, var, offset, strategy);
1983 }
1984 template<typename T>
1985 HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
1986 Func &prefetch(const T &image, VarOrRVar var, int offset = 1,
1987 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
1988 return prefetch<T>(image, var, var, offset, strategy);
1989 }
1990 // @}
1991
1992 /** This form of prefetch() is more fine-grained than the one above: it allows
1993 * specification of different vars for the location of the prefetch() instruction
1994 * vs. the location that is being prefetched:
1995 *
1996 * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1997 * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1998 * (in conjunction with 'offset')
1999 *
2000 * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at'.
2001 * Note that the value for 'offset' applies only to 'from', not 'at'.
2002 *
2003 * For example, consider this pipeline:
2004 \code
2005 Func f, g;
2006 Var x, y, z;
2007 f(x, y) = x + y;
2008 g(x, y) = 2 * f(x, y);
2009 h(x, y) = 3 * f(x, y);
2010 \endcode
2011 *
2012 * The following schedule:
2013 \code
2014 f.compute_root();
2015 g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
2016 h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
2017 \endcode
2018 *
2019 * will inject prefetch calls at the innermost loops of 'g' and 'h' and generate
2020 * the following loop nest:
2021 \code
2022 for y = ...
2023 for x = ...
2024 f(x, y) = x + y
2025 for y = ..
2026 for x = ...
2027 prefetch(&f[x + 2, y], 1, 16);
2028 g(x, y) = 2 * f(x, y)
2029 for y = ..
2030 for x = ...
2031 prefetch(&f[x, y + 2], 1, 16);
2032 h(x, y) = 3 * f(x, y)
2033 \endcode
2034 *
2035 * Note that the 'from' nesting level need not be adjacent to 'at':
2036 \code
2037 Func f, g;
2038 Var x, y, z, w;
2039 f(x, y, z, w) = x + y + z + w;
2040 g(x, y, z, w) = 2 * f(x, y, z, w);
2041 \endcode
2042 *
2043 * The following schedule:
2044 \code
2045 f.compute_root();
2046 g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2047 \endcode
2048 *
2049 * will produce code that prefetches a tile of data:
2050 \code
2051 for w = ...
2052 for z = ...
2053 for y = ...
2054 for x = ...
2055 f(x, y, z, w) = x + y + z + w
2056 for w = ...
2057 for z = ...
2058 for y = ...
2059 for x0 = ...
2060 prefetch(&f[x0, y, z, w + 2], 1, 16);
2061 for x = ...
2062 g(x, y, z, w) = 2 * f(x, y, z, w)
2063 \endcode
2064 *
2065 * Note that calling prefetch() with the same var for both 'at' and 'from'
2066 * is equivalent to calling prefetch() with that var.
2067 */
2068 // @{
2069 Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2070 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2071 Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2072 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2073 template<typename T>
2074 Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2075 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2076 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2077 }
2078 // @}
2079
2080 /** Specify how the storage for the function is laid out. These
2081 * calls let you specify the nesting order of the dimensions. For
2082 * example, foo.reorder_storage(y, x) tells Halide to use
2083 * column-major storage for any realizations of foo, without
2084 * changing how you refer to foo in the code. You may want to do
2085 * this if you intend to vectorize across y. When representing
2086 * color images, foo.reorder_storage(c, x, y) specifies packed
2087 * storage (red, green, and blue values adjacent in memory), and
2088 * foo.reorder_storage(x, y, c) specifies planar storage (entire
2089 * red, green, and blue images one after the other in memory).
2090 *
2091 * If you leave out some dimensions, those remain in the same
2092 * positions in the nesting order while the specified variables
2093 * are reordered around them. */
2094 // @{
2095 Func &reorder_storage(const std::vector<Var> &dims);
2096
2097 Func &reorder_storage(const Var &x, const Var &y);
2098 template<typename... Args>
2099 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2100 reorder_storage(const Var &x, const Var &y, Args &&...args) {
2101 std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2102 return reorder_storage(collected_args);
2103 }
2104 // @}
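 /* For illustration, a hypothetical Func computed in the usual
  * x-innermost order but stored interleaved (c innermost in memory).
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.reorder_storage(c, x, y);
 \endcode
  */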
2105
2106 /** Pad the storage extent of a particular dimension of
2107 * realizations of this function up to be a multiple of the
2108 * specified alignment. This guarantees that the strides for the
2109 * dimensions stored outside of dim will be multiples of the
2110 * specified alignment, where the strides and alignment are
2111 * measured in numbers of elements.
2112 *
2113 * For example, to guarantee that a function foo(x, y, c)
2114 * representing an image has scanlines starting on offsets
2115 * aligned to multiples of 16, use foo.align_storage(x, 16). */
2116 Func &align_storage(const Var &dim, const Expr &alignment);
2117
2118 /** Store realizations of this function in a circular buffer of a
2119 * given extent. This is more efficient when the extent of the
2120 * circular buffer is a power of 2. If the fold factor is too
2121 * small, or the dimension is not accessed monotonically, the
2122 * pipeline will generate an error at runtime.
2123 *
2124 * The fold_forward option indicates that the new values of the
2125 * producer are accessed by the consumer in a monotonically
2126 * increasing order. Folding storage of producers is also
2127 * supported if the new values are accessed in a monotonically
2128 * decreasing order by setting fold_forward to false.
2129 *
2130 * For example, consider the pipeline:
2131 \code
2132 Func f, g;
2133 Var x, y;
2134 g(x, y) = x*y;
2135 f(x, y) = g(x, y) + g(x, y+1);
2136 \endcode
2137 *
2138 * If we schedule f like so:
2139 *
2140 \code
2141 g.compute_at(f, y).store_root().fold_storage(y, 2);
2142 \endcode
2143 *
2144 * Then g will be computed at each row of f and stored in a buffer
2145 * with an extent in y of 2, alternately storing each computed row
2146 * of g in row y=0 or y=1.
2147 */
2148 Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2149
2150 /** Compute this function as needed for each unique value of the
2151 * given var for the given calling function f.
2152 *
2153 * For example, consider the simple pipeline:
2154 \code
2155 Func f, g;
2156 Var x, y;
2157 g(x, y) = x*y;
2158 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2159 \endcode
2160 *
2161 * If we schedule f like so:
2162 *
2163 \code
2164 g.compute_at(f, x);
2165 \endcode
2166 *
2167 * Then the C code equivalent to this pipeline will look like this
2168 *
2169 \code
2170
2171 int f[height][width];
2172 for (int y = 0; y < height; y++) {
2173 for (int x = 0; x < width; x++) {
2174 int g[2][2];
2175 g[0][0] = x*y;
2176 g[0][1] = (x+1)*y;
2177 g[1][0] = x*(y+1);
2178 g[1][1] = (x+1)*(y+1);
2179 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2180 }
2181 }
2182
2183 \endcode
2184 *
2185 * The allocation and computation of g is within f's loop over x,
2186 * and enough of g is computed to satisfy all that f will need for
2187 * that iteration. This has excellent locality - values of g are
2188 * used as soon as they are computed, but it does redundant
2189 * work. Each value of g ends up getting computed four times. If
2190 * we instead schedule f like so:
2191 *
2192 \code
2193 g.compute_at(f, y);
2194 \endcode
2195 *
2196 * The equivalent C code is:
2197 *
2198 \code
2199 int f[height][width];
2200 for (int y = 0; y < height; y++) {
2201 int g[2][width+1];
2202 for (int x = 0; x < width; x++) {
2203 g[0][x] = x*y;
2204 g[1][x] = x*(y+1);
2205 }
2206 for (int x = 0; x < width; x++) {
2207 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2208 }
2209 }
2210 \endcode
2211 *
2212 * The allocation and computation of g is within f's loop over y,
2213 * and enough of g is computed to satisfy all that f will need for
2214 * that iteration. This does less redundant work (each point in g
2215 * ends up being evaluated twice), but the locality is not quite
2216 * as good, and we have to allocate more temporary memory to store
2217 * g.
2218 */
2219 Func &compute_at(const Func &f, const Var &var);
2220
2221 /** Schedule a function to be computed within the iteration over
2222 * some dimension of an update domain. Produces equivalent code
2223 * to the version of compute_at that takes a Var. */
2224 Func &compute_at(const Func &f, const RVar &var);
2225
2226 /** Schedule a function to be computed within the iteration over
2227 * a given LoopLevel. */
2228 Func &compute_at(LoopLevel loop_level);
2229
2230 /** Schedule the iteration over the initial definition of this function
2231 * to be fused with another stage 's' from outermost loop to a
2232 * given LoopLevel. */
2233 // @{
2234 Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2235 Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2236 Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2237 Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2238
2239 /** Compute all of this function once ahead of time. Reusing
2240 * the example in \ref Func::compute_at :
2241 *
2242 \code
2243 Func f, g;
2244 Var x, y;
2245 g(x, y) = x*y;
2246 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2247
2248 g.compute_root();
2249 \endcode
2250 *
2251 * is equivalent to
2252 *
2253 \code
2254 int f[height][width];
2255 int g[height+1][width+1];
2256 for (int y = 0; y < height+1; y++) {
2257 for (int x = 0; x < width+1; x++) {
2258 g[y][x] = x*y;
2259 }
2260 }
2261 for (int y = 0; y < height; y++) {
2262 for (int x = 0; x < width; x++) {
2263 f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2264 }
2265 }
2266 \endcode
2267 *
2268 * g is computed once ahead of time, and enough is computed to
2269 * satisfy all uses of it. This does no redundant work (each point
2270 * in g is evaluated once), but has poor locality (values of g are
2271 * probably not still in cache when they are used by f), and
2272 * allocates lots of temporary memory to store g.
2273 */
2274 Func &compute_root();
2275
2276 /** Use the halide_memoization_cache_... interface to store a
2277 * computed version of this function across invocations of the
2278 * Func.
2279 *
2280 * If an eviction_key is provided, it must be constructed with
2281 * Expr of integer or handle type. The key Expr will be promoted
2282 * to a uint64_t and can be used with halide_memoization_cache_evict
2283 * to remove memoized entries using this eviction key from the
2284 * cache. Memoized computations that do not provide an eviction
2285 * key will never be evicted by this mechanism.
2286 */
2287 Func &memoize(const EvictionKey &eviction_key = EvictionKey());
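 /* For illustration, a hypothetical memoized Func whose cache entries are
  * tagged with a user-supplied eviction key.
 \code
 Func expensive;
 Var x, y;
 Param<uint64_t> key;
 expensive(x, y) = x * y;
 expensive.compute_root().memoize(EvictionKey(key));
 \endcode
  */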
2288
2289 /** Produce this Func asynchronously in a separate
2290 * thread. Consumers will be run by the task system when the
2291 * production is complete. If this Func's store level is different
2292 * to its compute level, consumers will be run concurrently,
2293 * blocking as necessary to prevent reading ahead of what the
2294 * producer has computed. If storage is folded, then the producer
2295 * will additionally not be permitted to run too far ahead of the
2296 * consumer, to avoid clobbering data that has not yet been
2297 * used.
2298 *
2299 * Take special care when combining this with custom thread pool
2300 * implementations, as avoiding deadlock with producer-consumer
2301 * parallelism requires a much more sophisticated parallel runtime
2302 * than with data parallelism alone. It is strongly recommended
2303 * you just use Halide's default thread pool, which guarantees no
2304 * deadlock and a bound on the number of threads launched.
2305 */
2306 Func &async();
2307
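 /* For illustration, a hypothetical producer g computed per row of f in
  * its own thread; distinct store and compute levels let the consumer
  * overlap with the producer.
 \code
 Func f, g;
 Var x, y;
 g(x, y) = x * y;
 f(x, y) = g(x, y) * 2;
 g.store_root().compute_at(f, y).async();
 \endcode
  */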
2308 /** Bound the extent of a Func's storage, but not extent of its
2309 * compute. This can be useful for forcing a function's allocation
2310 * to be a fixed size, which often means it can go on the stack.
2311 * If bounds inference decides that it requires more storage for
2312 * this function than the allocation size you have stated, a runtime
2313 * error will occur when you try to run the pipeline. */
2314 Func &bound_storage(const Var &dim, const Expr &bound);
2315
2316 /** Allocate storage for this function within f's loop over
2317 * var. Scheduling storage is optional, and can be used to
2318 * separate the loop level at which storage occurs from the loop
2319 * level at which computation occurs to trade off between locality
2320 * and redundant work. This can open the door for two types of
2321 * optimization.
2322 *
2323 * Consider again the pipeline from \ref Func::compute_at :
2324 \code
2325 Func f, g;
2326 Var x, y;
2327 g(x, y) = x*y;
2328 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2329 \endcode
2330 *
2331 * If we schedule it like so:
2332 *
2333 \code
2334 g.compute_at(f, x).store_at(f, y);
2335 \endcode
2336 *
2337 * Then the computation of g takes place within the loop over x,
2338 * but the storage takes place within the loop over y:
2339 *
2340 \code
2341 int f[height][width];
2342 for (int y = 0; y < height; y++) {
2343 int g[2][width+1];
2344 for (int x = 0; x < width; x++) {
2345 g[0][x] = x*y;
2346 g[0][x+1] = (x+1)*y;
2347 g[1][x] = x*(y+1);
2348 g[1][x+1] = (x+1)*(y+1);
2349 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2350 }
2351 }
2352 \endcode
2353 *
2354 * Provided the for loop over x is serial, Halide then
2355 * automatically performs the following sliding window
2356 * optimization:
2357 *
2358 \code
2359 int f[height][width];
2360 for (int y = 0; y < height; y++) {
2361 int g[2][width+1];
2362 for (int x = 0; x < width; x++) {
2363 if (x == 0) {
2364 g[0][x] = x*y;
2365 g[1][x] = x*(y+1);
2366 }
2367 g[0][x+1] = (x+1)*y;
2368 g[1][x+1] = (x+1)*(y+1);
2369 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2370 }
2371 }
2372 \endcode
2373 *
2374 * Two of the assignments to g only need to be done when x is
2375 * zero. The rest of the time, those sites have already been
2376 * filled in by a previous iteration. This version has the
2377 * locality of compute_at(f, x), but allocates more memory and
2378 * does much less redundant work.
2379 *
2380 * Halide then further optimizes this pipeline like so:
2381 *
2382 \code
2383 int f[height][width];
2384 for (int y = 0; y < height; y++) {
2385 int g[2][2];
2386 for (int x = 0; x < width; x++) {
2387 if (x == 0) {
2388 g[0][0] = x*y;
2389 g[1][0] = x*(y+1);
2390 }
2391 g[0][(x+1)%2] = (x+1)*y;
2392 g[1][(x+1)%2] = (x+1)*(y+1);
2393 f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2394 }
2395 }
2396 \endcode
2397 *
2398 * Halide has detected that it's possible to use a circular buffer
2399 * to represent g, and has reduced all accesses to g modulo 2 in
2400 * the x dimension. This optimization only triggers if the for
2401 * loop over x is serial, and if Halide can statically determine
2402 * some power of two large enough to cover the range needed. For
2403 * powers of two, the modulo operator compiles to more efficient
2404 * bit-masking. This optimization reduces memory usage, and also
2405 * improves locality by reusing recently-accessed memory instead
2406 * of pulling new memory into cache.
2407 *
2408 */
2409 Func &store_at(const Func &f, const Var &var);
2410
2411 /** Equivalent to the version of store_at that takes a Var, but
2412 * schedules storage within the loop over a dimension of a
2413 * reduction domain */
2414 Func &store_at(const Func &f, const RVar &var);
2415
2416 /** Equivalent to the version of store_at that takes a Var, but
2417 * schedules storage at a given LoopLevel. */
2418 Func &store_at(LoopLevel loop_level);
2419
2420 /** Equivalent to \ref Func::store_at, but schedules storage
2421 * outside the outermost loop. */
2422 Func &store_root();
2423
2424 /** Aggressively inline all uses of this function. This is the
2425 * default schedule, so you're unlikely to need to call this. For
2426 * a Func with an update definition, that means it gets computed
2427 * as close to the innermost loop as possible.
2428 *
2429 * Consider once more the pipeline from \ref Func::compute_at :
2430 *
2431 \code
2432 Func f, g;
2433 Var x, y;
2434 g(x, y) = x*y;
2435 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2436 \endcode
2437 *
2438 * Leaving g as inline, this compiles to code equivalent to the following C:
2439 *
2440 \code
2441 int f[height][width];
2442 for (int y = 0; y < height; y++) {
2443 for (int x = 0; x < width; x++) {
2444 f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2445 }
2446 }
2447 \endcode
2448 */
2449 Func &compute_inline();
2450
2451 /** Get a handle on an update step for the purposes of scheduling
2452 * it. */
2453 Stage update(int idx = 0);
2454
2455 /** Set the type of memory this Func should be stored in. Controls
2456 * whether allocations go on the stack or the heap on the CPU, and
2457 * in global vs shared vs local on the GPU. See the documentation
2458 * on MemoryType for more detail. */
2459 Func &store_in(MemoryType memory_type);
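 /* For illustration, a hypothetical small intermediate whose allocation is
  * requested in stack-like memory.
 \code
 Func f, g;
 Var x;
 g(x) = x * 2;
 f(x) = g(x) + g(x + 1);
 // The region of g needed per x of f has constant extent 2, so a stack
 // allocation is feasible.
 g.compute_at(f, x).store_in(MemoryType::Stack);
 \endcode
  */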
2460
2461 /** Trace all loads from this Func by emitting calls to
2462 * halide_trace. If the Func is inlined, this has no
2463 * effect. */
2464 Func &trace_loads();
2465
2466 /** Trace all stores to the buffer backing this Func by emitting
2467 * calls to halide_trace. If the Func is inlined, this call
2468 * has no effect. */
2469 Func &trace_stores();
2470
2471 /** Trace all realizations of this Func by emitting calls to
2472 * halide_trace. */
2473 Func &trace_realizations();
2474
2475 /** Add a string of arbitrary text that will be passed through to trace
2476 * inspection code if the Func is realized in trace mode. (Funcs that are
2477 * inlined won't have their tags emitted.) Ignored entirely if
2478 * tracing is not enabled for the Func (or globally).
2479 */
2480 Func &add_trace_tag(const std::string &trace_tag);
2481
2482 /** Get a handle on the internal halide function that this Func
2483 * represents. Useful if you want to do introspection on Halide
2484 * functions */
2485 Internal::Function function() const {
2486 return func;
2487 }
2488
2489 /** You can cast a Func to its pure stage for the purposes of
2490 * scheduling it. */
2491 operator Stage() const;
2492
2493 /** Get a handle on the output buffer for this Func. Only relevant
2494 * if this is the output Func in a pipeline. Useful for making
2495 * static promises about strides, mins, and extents. */
2496 // @{
2497 OutputImageParam output_buffer() const;
2498 std::vector<OutputImageParam> output_buffers() const;
2499 // @}
2500
2501 /** Use a Func as an argument to an external stage. */
2502 operator ExternFuncArgument() const;
2503
2504 /** Infer the arguments to the Func, sorted into a canonical order:
2505 * all buffers (sorted alphabetically by name), followed by all non-buffers
2506 * (sorted alphabetically by name).
2507 This lets you write things like:
2508 \code
2509 func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2510 \endcode
2511 */
2512 std::vector<Argument> infer_arguments() const;
2513
2514 /** Get the source location of the pure definition of this
2515 * Func. See Stage::source_location() */
2516 std::string source_location() const;
2517
2518 /** Return the current StageSchedule associated with this initial
2519 * Stage of this Func. For introspection only: to modify schedule,
2520 * use the Func interface. */
2521 const Internal::StageSchedule &get_schedule() const {
2522 return Stage(*this).get_schedule();
2523 }
2524};
2525
2526namespace Internal {
2527
2528template<typename Last>
2529inline void check_types(const Tuple &t, int idx) {
2530 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2531 user_assert(t[idx].type() == type_of<T>())
2532 << "Can't evaluate expression "
2533 << t[idx] << " of type " << t[idx].type()
2534 << " as a scalar of type " << type_of<T>() << "\n";
2535}
2536
2537template<typename First, typename Second, typename... Rest>
2538inline void check_types(const Tuple &t, int idx) {
2539 check_types<First>(t, idx);
2540 check_types<Second, Rest...>(t, idx + 1);
2541}
2542
2543template<typename Last>
2544inline void assign_results(Realization &r, int idx, Last last) {
2545 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2546 *last = Buffer<T>(r[idx])();
2547}
2548
2549template<typename First, typename Second, typename... Rest>
2550inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2551 assign_results<First>(r, idx, first);
2552 assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2553}
2554
2555} // namespace Internal
2556
2557/** JIT-Compile and run enough code to evaluate a Halide
2558 * expression. This can be thought of as a scalar version of
2559 * \ref Func::realize */
2560template<typename T>
2561 HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
2562 user_assert(e.type() == type_of<T>())
2563 << "Can't evaluate expression "
2564 << e << " of type " << e.type()
2565 << " as a scalar of type " << type_of<T>() << "\n";
2566 Func f;
2567 f() = e;
2568 Buffer<T, 0> im = f.realize(ctx);
2569 return im();
2570}
2571
2572/** evaluate with a default user context */
2573template<typename T>
2574 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2575 return evaluate<T>(nullptr, e);
2576}
2577
2578/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2579template<typename First, typename... Rest>
2580HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2581 Internal::check_types<First, Rest...>(t, 0);
2582
2583 Func f;
2584 f() = t;
2585 Realization r = f.realize(ctx);
2586 Internal::assign_results(r, 0, first, rest...);
2587}
2588
2589/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2590template<typename First, typename... Rest>
2591HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2592 evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
2593}
2594
2595namespace Internal {
2596
2597 inline void schedule_scalar(Func f) {
2598 Target t = get_jit_target_from_environment();
2599 if (t.has_gpu_feature()) {
2600 f.gpu_single_thread();
2601 }
2602 if (t.has_feature(Target::HVX)) {
2603 f.hexagon();
2604 }
2605}
2606
2607} // namespace Internal
2608
2609/** JIT-Compile and run enough code to evaluate a Halide
2610 * expression. This can be thought of as a scalar version of
2611 * \ref Func::realize. Can use GPU if jit target from environment
2612 * specifies one.
2613 */
2614template<typename T>
2615 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2616 user_assert(e.type() == type_of<T>())
2617 << "Can't evaluate expression "
2618 << e << " of type " << e.type()
2619 << " as a scalar of type " << type_of<T>() << "\n";
2620 Func f;
2621 f() = e;
2622 Internal::schedule_scalar(f);
2623 Buffer<T, 0> im = f.realize();
2624 return im();
2625}
2626
2627/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2628 * use GPU if jit target from environment specifies one. */
2629// @{
2630template<typename First, typename... Rest>
2631HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2632 Internal::check_types<First, Rest...>(t, 0);
2633
2634 Func f;
2635 f() = t;
2636 Internal::schedule_scalar(f);
2637 Realization r = f.realize();
2638 Internal::assign_results(r, 0, first, rest...);
2639}
2640// @}
2641
2642} // namespace Halide
2643
2644#endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Errors.h:19
#define user_assert(c)
Definition: Errors.h:15
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:45
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:120
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:688
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:694
A halide function.
Definition: Func.h:703
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_... interface to store a computed version of this function across in...
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Internal::Function function() const
Get a handle on the internal halide function that this Func represents.
Definition: Func.h:2485
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void realize(Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function into an existing allocated buffer or buffers.
Func & async()
Produce this Func asynchronously in a separate thread.
void set_custom_trace(int(*trace_fn)(void *, const halide_trace_event_t *))
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1275
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
bool defined() const
Does this function have at least a pure definition.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinates is congruent to 'remainder' modulo 'modulus',...
Func & reorder_storage(const Var &x, const Var &y)
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
const std::vector< Type > & output_types() const
Get the types of the outputs of this Func.
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
int dimensions() const
The dimensionality (number of arguments) of this function.
void realize(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
void set_custom_do_par_for(int(*custom_do_par_for)(void *, int(*)(void *, int, uint8_t *), int, int, uint8_t *))
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string source_location() const
Get the source location of the pure definition of this Func.
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
int outputs() const
Get the number of outputs of this Func.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2100
Func & compute_root()
Compute all of this function once ahead of time.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void set_custom_allocator(void *(*malloc)(void *, size_t), void(*free)(void *, void *))
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
std::vector< Var > args() const
Get the pure arguments.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that returns multiple values.
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1579
int num_update_definitions() const
How many update definitions does this function have?
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
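A minimal sketch of the short tile() form (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), y("y"), xi("xi"), yi("yi");
f(x, y) = x * y;
// 64x64 tiles: x and y become the tile indices, xi and yi the within-tile coordinates.
f.tile(x, y, xi, yi, 64, 64).vectorize(xi, 8).parallel(y);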
Stage specialize(const Expr &condition)
Specialize a Func.
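A minimal sketch of specialize() guarding an aggressive schedule (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
ImageParam input(UInt(8), 2);
Func f("f");
Var x("x"), y("y");
f(x, y) = cast<int>(input(x, y)) * 2;
// Use a wide vectorized code path only when the input is known to be wide enough;
// otherwise the default (unspecialized) schedule is used.
f.specialize(input.width() >= 128).vectorize(x, 32);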
void set_custom_do_task(int(*custom_do_task)(void *, int(*)(void *, int, uint8_t *), int, uint8_t *))
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
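A minimal, self-contained JIT sketch using realize() (hypothetical one-stage pipeline):
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y");
    f(x, y) = x + y;
    // JIT-compile f and evaluate it over an 800x600 rectangular domain.
    Buffer<int> out = f.realize({800, 600});
    return 0;
}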
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:749
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
void set_error_handler(void(*handler)(void *, const char *))
Deprecated variants of the above that use a void pointer instead of a JITUserContext pointer.
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
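A minimal atomic() sketch for a data-dependent scatter, in the spirit of a histogram (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
ImageParam im(UInt(8), 1);
Func hist("hist");
Var i("i");
RDom r(0, im.width());
hist(i) = 0;
hist(cast<int>(im(r))) += 1;
// The update scatters to data-dependent locations, so parallelizing over the
// RDom is only safe if the += is performed atomically.
hist.compute_root();
hist.update().atomic().parallel(r);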
void infer_input_bounds(JITUserContext *context, const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Versions of infer_input_bounds that take a custom user context to pass to runtime functions.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1205
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
void set_custom_print(void(*handler)(void *, const char *))
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
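A minimal set_estimates() sketch for autoscheduling (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), y("y");
f(x, y) = x + y;
// Give the autoscheduler a rough output size to plan for: x in [0, 1920), y in [0, 1080).
f.set_estimates({{0, 1920}, {0, 1080}});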
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with a single call.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1092
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
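A minimal sketch combining store_root(), compute_at() and fold_storage() for sliding-window reuse (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f"), g("g");
Var x("x"), y("y");
f(x, y) = x * y;
g(x, y) = f(x, y) + f(x, y + 1);
// Allocate f once outside g's loops, compute scanlines of f as g needs them,
// and keep only the last two scanlines alive in a circular buffer.
f.store_root().compute_at(g, y).fold_storage(y, 2);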
void infer_input_bounds(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1187
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
Definition: Func.h:1975
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
A more fine-grained variant of prefetch(), which allows specification of different vars...
Realization realize(JITUserContext *context, std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1258
Func & compute_inline()
Aggressively inline all uses of this function.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not the extent of its compute.
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Tuple values() const
The values returned by this function.
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1215
Func & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
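A minimal sketch of the task-sized parallel() form (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), y("y");
f(x, y) = x + y;
// Split y into tasks of 16 scanlines each and run the tasks on the thread pool.
f.parallel(y, 16);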
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
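A minimal fuse() sketch (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), y("y"), xy("xy");
f(x, y) = x + y;
// Flatten the 2-D loop nest into a single loop, then parallelize it as a whole.
f.fuse(x, y, xy).parallel(xy);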
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
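A minimal sketch of in() staging a producer for one particular consumer (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f"), g("g");
Var x("x"), y("y");
f(x, y) = x + y;
g(x, y) = f(x, y) * 2;
// Stage f's values through a wrapper that is computed per scanline of g,
// without affecting how any other consumer of f is scheduled.
f.in(g).compute_at(g, y);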
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
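A minimal bound() sketch (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), c("c");
f(x, c) = x + c;
// Promise that f is only ever realized over exactly three channels,
// which gives the channel loop a constant extent that can then be unrolled.
f.bound(c, 0, 3).unroll(c);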
std::vector< OutputImageParam > output_buffers() const
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2074
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2521
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1197
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
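A minimal gpu_tile() sketch guarded by a target check (hypothetical names; assumes "Halide.h" is included and using namespace Halide):
Func f("f");
Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");
f(x, y) = x + y;
Target t = get_jit_target_from_environment();
if (t.has_gpu_feature()) {
    // Map 16x16 tiles to GPU blocks and the within-tile coordinates to GPU threads.
    f.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
}
Buffer<int> out = f.realize({1024, 1024}, t);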
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:494
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce?
Internal::Function function() const
What function is this calling?
Definition: Func.h:591
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:613
int index() const
Return the index into the function's outputs.
Definition: Func.h:677
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Internal::Function function() const
What function is this calling?
Definition: Func.h:672
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
bool defined() const
Definition objects are nullable.
const std::vector< StorageDim > & storage_dims() const
The list and order of dimensions used to store this function.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
FuncSchedule & schedule()
Get a handle to the function-specific schedule for the purpose of modifying it.
const std::vector< std::string > & args() const
Get the pure arguments.
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:28
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:646
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
A halide module.
Definition: Module.h:172
A handle on the output buffer of a pipeline.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:104
A class representing a Halide pipeline.
Definition: Pipeline.h:99
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
A single definition of a Func.
Definition: Func.h:70
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
std::string name() const
Return the name of this stage, e.g.
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:378
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func rfactor(const RVar &r, const Var &v)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:445
Func rfactor(std::vector< std::pair< RVar, Var > > preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
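A minimal sketch of the split-then-rfactor pattern for a parallel sum (hypothetical names 'input', 'total', 'partial'; assumes "Halide.h" is included and using namespace Halide):
ImageParam input(Int(32), 1);
Func total("total");
RDom r(0, 1024);
total() = 0;
total() += input(r.x);
// Factor the associative sum into per-chunk partial sums that can run in
// parallel, plus a short serial reduction that merges the chunks.
RVar ro("ro"), ri("ri");
Var u("u");
Func partial = total.update().split(r.x, ro, ri, 64).rfactor(ro, u);
partial.compute_root().update().parallel(u);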
Stage & allow_race_conditions()
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage specialize(const Expr &condition)
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:465
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:94
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & parallel(const VarOrRVar &var)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:107
Stage & serial(const VarOrRVar &var)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
void schedule_scalar(Func f)
Definition: Func.h:2597
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2544
void check_types(const Tuple &t, int idx)
Definition: Func.h:2529
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:399
WEAK halide_do_task_t custom_do_task
WEAK halide_do_par_for_t custom_do_par_for
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
class HALIDE_ATTRIBUTE_DEPRECATED("Use OutputFileType instead of Output") Output
Definition: Module.h:46
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignores the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2615
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:110
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:600
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:24
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:63
@ Text
Definition: Pipeline.h:64
Stage ScheduleHandle
Definition: Func.h:485
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:343
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:603
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:346
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2561
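A small sketch of evaluating a scalar Expr directly, using the no-context overload (assumes "Halide.h" is included and using namespace Halide):
// Handy for quick scalar checks without building a full pipeline.
float root_two = evaluate<float>(sqrt(2.0f));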
void * malloc(size_t)
unsigned __INT8_TYPE__ uint8_t
void free(void *)
A fragment of Halide syntax.
Definition: Expr.h:256
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:320
An argument to an extern-defined Func.
A set of custom overrides of runtime functions.
Definition: JITModule.h:33
A context to be passed to Pipeline::realize.
Definition: JITModule.h:134
A struct representing a target machine and OS to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:266
A class that can represent Vars or RVars.
Definition: Func.h:30
bool is_rvar
Definition: Func.h:58
VarOrRVar(const Var &v)
Definition: Func.h:34
VarOrRVar(const RVar &r)
Definition: Func.h:37
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
const std::string & name() const
Definition: Func.h:48
VarOrRVar(const RDom &r)
Definition: Func.h:40