Featurization.h (Halide 14.0.0)
#ifndef FEATURIZATION_H
#define FEATURIZATION_H

#include <cstdint>
#include <cstring>
#include <iostream>

#include "ASLog.h"

namespace Halide {
namespace Internal {

// The algorithm-specific features. For legacy reasons these are
// called PipelineFeatures in the code.
struct PipelineFeatures {
    static constexpr size_t num_features() {
        return sizeof(PipelineFeatures) / sizeof(int);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    // Access them by index.
    int &operator[](int idx) {
        return ((int *)(this))[idx];
    }

    int operator[](int idx) const {
        return ((const int *)(this))[idx];
    }
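
    // Note: operator[] reinterprets the struct as a flat array, which only
    // works because every data member below is an int. num_features() and
    // the indexed accessors must therefore stay in sync with the member
    // list. Illustrative sketch, not part of the original header:
    //
    //   PipelineFeatures pf;
    //   int total = 0;
    //   for (size_t i = 0; i < PipelineFeatures::num_features(); i++) {
    //       total += pf[(int)i];  // aliases the named fields below
    //   }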

    enum class OpType {
        Const,
        Cast,
        Variable,
        Param,
        Add,
        Sub,
        Mod,
        Mul,
        Div,
        Min,
        Max,
        EQ,
        NE,
        LT,
        LE,
        And,
        Or,
        Not,
        Select,
        ImageCall,   // Loads from an input buffer
        FuncCall,    // Calls to another pipeline stage
        SelfCall,    // Recursive calls from a Func to itself
        ExternCall,  // Math intrinsics, typically
        Let,
        NumOpTypes
    };

    enum class ScalarType {
        Bool,
        UInt8,   // or Int8
        UInt16,  // or Int16
        UInt32,  // or Int32
        UInt64,  // or Int64
        Float,
        Double,
        NumScalarTypes
    };

    // Not fed into the network, but helps avoid printing huge numbers of zeros while debugging things
    int types_in_use[(int)ScalarType::NumScalarTypes] = {};

    int op_histogram[(int)OpType::NumOpTypes][(int)ScalarType::NumScalarTypes] = {};

    enum class AccessType {
        LoadFunc,
        LoadSelf,
        LoadImage,
        Store,
        NumAccessTypes
    };

    // Finer granularity call/store node properties. These are a
    // function of the matrix of derivatives of each arg to a
    // call w.r.t. the loop variables of the Stage. Each row of
    // the matrix corresponds to one of the call arguments. In
    // each case we illustrate such a call, assuming that the
    // variables of this Func are x, y, z, and that the
    // dimension vectorized over is the first (x).

    // Square identity matrix. f(x - 2, y + 8, z + param)
    int pointwise_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Square permutation matrix. f(y + 1, z - 3, x)
    int transpose_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1. Each column sums to 1 or 0. f(y, x)
    int broadcast_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
    // Each row sums to 1 or 0. Each column sums to 1. f(z, y, x, 4)
    int slice_accesses[(int)AccessType::NumAccessTypes][(int)ScalarType::NumScalarTypes] = {};
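
    // For example, in a stage over variables x, y, z, a load from another
    // Func written as g(y + 1, z - 3, x) has a square permutation matrix of
    // derivatives, so a featurizer would record it as (illustrative sketch,
    // assuming the loaded data is Float; not code from this header):
    //
    //   transpose_accesses[(int)AccessType::LoadFunc][(int)ScalarType::Float]++;
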
    template<typename OS>
    void dump(OS &os) const {
        for (int i = 0; i < (int)ScalarType::NumScalarTypes; i++) {
            const char *type_names[] = {"Bool", "UInt8", "UInt16", "UInt32", "UInt64", "Float", "Double"};
            // Skip printing for types not used
            if (!types_in_use[i]) {
                continue;
            }

            os << "    Featurization for type " << type_names[i] << "\n"
               << "     Op histogram:\n"
               << "      Constant:   " << op_histogram[(int)OpType::Const][i] << "\n"
               << "      Cast:       " << op_histogram[(int)OpType::Cast][i] << "\n"
               << "      Variable:   " << op_histogram[(int)OpType::Variable][i] << "\n"
               << "      Param:      " << op_histogram[(int)OpType::Param][i] << "\n"
               << "      Add:        " << op_histogram[(int)OpType::Add][i] << "\n"
               << "      Sub:        " << op_histogram[(int)OpType::Sub][i] << "\n"
               << "      Mod:        " << op_histogram[(int)OpType::Mod][i] << "\n"
               << "      Mul:        " << op_histogram[(int)OpType::Mul][i] << "\n"
               << "      Div:        " << op_histogram[(int)OpType::Div][i] << "\n"
               << "      Min:        " << op_histogram[(int)OpType::Min][i] << "\n"
               << "      Max:        " << op_histogram[(int)OpType::Max][i] << "\n"
               << "      EQ:         " << op_histogram[(int)OpType::EQ][i] << "\n"
               << "      NE:         " << op_histogram[(int)OpType::NE][i] << "\n"
               << "      LT:         " << op_histogram[(int)OpType::LT][i] << "\n"
               << "      LE:         " << op_histogram[(int)OpType::LE][i] << "\n"
               << "      And:        " << op_histogram[(int)OpType::And][i] << "\n"
               << "      Or:         " << op_histogram[(int)OpType::Or][i] << "\n"
               << "      Not:        " << op_histogram[(int)OpType::Not][i] << "\n"
               << "      Select:     " << op_histogram[(int)OpType::Select][i] << "\n"
               << "      ImageCall:  " << op_histogram[(int)OpType::ImageCall][i] << "\n"
               << "      FuncCall:   " << op_histogram[(int)OpType::FuncCall][i] << "\n"
               << "      SelfCall:   " << op_histogram[(int)OpType::SelfCall][i] << "\n"
               << "      ExternCall: " << op_histogram[(int)OpType::ExternCall][i] << "\n"
               << "      Let:        " << op_histogram[(int)OpType::Let][i] << "\n"
               << "     Memory access patterns. Columns are calls to other Funcs, self-calls, input image access, and stores\n"
               << "      Pointwise: "
               << pointwise_accesses[0][i] << " "
               << pointwise_accesses[1][i] << " "
               << pointwise_accesses[2][i] << " "
               << pointwise_accesses[3][i] << "\n"
               << "      Transpose: "
               << transpose_accesses[0][i] << " "
               << transpose_accesses[1][i] << " "
               << transpose_accesses[2][i] << " "
               << transpose_accesses[3][i] << "\n"
               << "      Broadcast: "
               << broadcast_accesses[0][i] << " "
               << broadcast_accesses[1][i] << " "
               << broadcast_accesses[2][i] << " "
               << broadcast_accesses[3][i] << "\n"
               << "      Slice:     "
               << slice_accesses[0][i] << " "
               << slice_accesses[1][i] << " "
               << slice_accesses[2][i] << " "
               << slice_accesses[3][i] << "\n";
        }
    }
    void dump() const {
        auto os = aslog(0);
        dump(os);
    }
};

// The schedule-dependent portion of the featurization of a stage
struct ScheduleFeatures {
    static constexpr size_t num_features() {
        return sizeof(ScheduleFeatures) / sizeof(double);
    }

    static constexpr uint32_t version() {
        return 3;
    }

    double &operator[](int idx) {
        return ((double *)(this))[idx];
    }

    double operator[](int idx) const {
        return ((const double *)(this))[idx];
    }

    // The number of times storage for this stage is allocated. The
    // product of outer loops at the store_at site.
    double num_realizations = 0;

    // The number of times a tile of the stage is computed. The
    // product of outer loops at the compute_at site. Always at least as
    // large as num_realizations.
    double num_productions = 0;

    // Number of times the innermost loop happens per allocation.
    double points_computed_per_realization = 0;

    // Number of times the innermost stmt happens per tile computed.
    double points_computed_per_production = 0;

    // The total trip count of the innermost loop over the entire program.
    //  == num_realizations * points_computed_per_realization
    //  ~= num_productions * points_computed_per_production
    // Only approximately equal because of the simplifications made
    // regarding the modeling of sliding window.
    double points_computed_total = 0;

    // The minimum number of points that are actually required to be
    // computed to produce a correct output. Not actually a function
    // of the schedule, but a useful reference point to see if a
    // schedule has gone off the rails.
    double points_computed_minimum = 0;

    // Trip count of the innermost loop nest.
    double innermost_loop_extent = 0;

    // Trip count of just the pure loops in the innermost loop
    // (i.e. excludes loops representing reductions).
    double innermost_pure_loop_extent = 0;

    // If this is to be unrolled, the product of the unrolling
    // factors.
    double unrolled_loop_extent = 0;

    // The number of parallel jobs launched in the production of this
    // stage. Always 1 unless the Func is compute_root, because we
    // place all parallelism at the outermost level.
    double inner_parallelism = 0;

    // The number of times this Func could be realized in parallel. 1
    // when the Func is compute_root. Product of the containing
    // parallel loops for other stages.
    double outer_parallelism = 0;

    // Size of the region computed at the store_at site, measured in
    // bytes. Does not take storage-folding optimizations into account.
    double bytes_at_realization = 0;

    // Size of the region computed per tile (at the compute_at site),
    // measured in bytes. This includes the effect of storage-folding,
    // so it's a better number to look at to estimate memory usage.
    double bytes_at_production = 0;

    // If the stage were hypothetically scheduled at root, how much
    // memory would it consume. Doesn't vary w.r.t. the schedule, but
    // a useful reference.
    double bytes_at_root = 0;

    // Same as the above, but only measuring the extent along the
    // innermost dimension, so that we can reason about spatial
    // locality, cache lines, prefetchers, etc.
    double innermost_bytes_at_realization = 0;
    double innermost_bytes_at_production = 0;
    double innermost_bytes_at_root = 0;

    // For inlined Funcs, how many calls are made to this Func total.
    double inlined_calls = 0;

    // Number of unique bytes and unique contiguous segments of
    // memory loaded from all inputs over a single trip of the loop
    // containing the allocation site.
    double unique_bytes_read_per_realization = 0;
    double unique_lines_read_per_realization = 0;

    // The sum of the sizes of the allocations accessed at this
    // site. Gives a hint as to the likely locality of it.
    double allocation_bytes_read_per_realization = 0;

    // The sum of the sizes of the temporary allocations while
    // computing one tile of this Func. Probably a good thing if it
    // fits in cache.
    double working_set = 0;

    // The vectorization factor (number of SIMD lanes) to be used to
    // compute this stage. Wasted work if it's smaller than the stage's
    // native vector size.
    double vector_size = 0;

    // The native vector size for the narrowest type used. Does not
    // vary with the schedule, but a useful reference point.
    double native_vector_size = 0;

    // Number of SIMD vectors computed.
    double num_vectors = 0;

    // Number of scalars computed (e.g. from tails of loops).
    double num_scalars = 0;

    // The number of loads done per vector or scalar computed. Vector
    // gathers count as a batch of scalar loads. These get amortized
    // across unrolled blocks if some loads can be reused across the
    // unrolled dimension.
    double scalar_loads_per_vector = 0;
    double vector_loads_per_vector = 0;
    double scalar_loads_per_scalar = 0;

    // The memory footprint written over one parallel task: the
    // union of the regions if the stage is computed at finer
    // granularity than one parallel task of some consumer.
    double bytes_at_task = 0;
    double innermost_bytes_at_task = 0;

    // The memory footprint accessed while computing a single vector.
    double unique_bytes_read_per_vector = 0;
    double unique_lines_read_per_vector = 0;

    // The memory footprint accessed per parallel task. Only counts
    // loads from things computed outside of that parallel task (to
    // measure the amount of traffic coming from another core).
    double unique_bytes_read_per_task = 0;
    double unique_lines_read_per_task = 0;

    // The sum of the sizes of all live allocations at various sites.
    double working_set_at_task = 0;
    double working_set_at_production = 0;
    double working_set_at_realization = 0;
    double working_set_at_root = 0;

    template<typename OS>
    void dump(OS &os) const {
        os << "    num_realizations: " << num_realizations << "\n"
           << "    num_productions: " << num_productions << "\n"
           << "    points_computed_per_realization: " << points_computed_per_realization << "\n"
           << "    points_computed_per_production: " << points_computed_per_production << "\n"
           << "    points_computed_total: " << points_computed_total << "\n"
           << "    points_computed_minimum: " << points_computed_minimum << "\n"
           << "    innermost_loop_extent: " << innermost_loop_extent << "\n"
           << "    innermost_pure_loop_extent: " << innermost_pure_loop_extent << "\n"
           << "    unrolled_loop_extent: " << unrolled_loop_extent << "\n"
           << "    inner_parallelism: " << inner_parallelism << "\n"
           << "    outer_parallelism: " << outer_parallelism << "\n"
           << "    bytes_at_realization: " << bytes_at_realization << "\n"
           << "    bytes_at_production: " << bytes_at_production << "\n"
           << "    bytes_at_root: " << bytes_at_root << "\n"
           << "    innermost_bytes_at_realization: " << innermost_bytes_at_realization << "\n"
           << "    innermost_bytes_at_production: " << innermost_bytes_at_production << "\n"
           << "    innermost_bytes_at_root: " << innermost_bytes_at_root << "\n"
           << "    inlined_calls: " << inlined_calls << "\n"
           << "    unique_bytes_read_per_realization: " << unique_bytes_read_per_realization << "\n"
           << "    unique_lines_read_per_realization: " << unique_lines_read_per_realization << "\n"
           << "    allocation_bytes_read_per_realization: " << allocation_bytes_read_per_realization << "\n"
           << "    working_set: " << working_set << "\n"
           << "    vector_size: " << vector_size << "\n"
           << "    native_vector_size: " << native_vector_size << "\n"
           << "    num_vectors: " << num_vectors << "\n"
           << "    num_scalars: " << num_scalars << "\n"
           << "    scalar_loads_per_vector: " << scalar_loads_per_vector << "\n"
           << "    vector_loads_per_vector: " << vector_loads_per_vector << "\n"
           << "    scalar_loads_per_scalar: " << scalar_loads_per_scalar << "\n"
           << "    bytes_at_task: " << bytes_at_task << "\n"
           << "    innermost_bytes_at_task: " << innermost_bytes_at_task << "\n"
           << "    unique_bytes_read_per_vector: " << unique_bytes_read_per_vector << "\n"
           << "    unique_lines_read_per_vector: " << unique_lines_read_per_vector << "\n"
           << "    unique_bytes_read_per_task: " << unique_bytes_read_per_task << "\n"
           << "    unique_lines_read_per_task: " << unique_lines_read_per_task << "\n"
           << "    working_set_at_task: " << working_set_at_task << "\n"
           << "    working_set_at_production: " << working_set_at_production << "\n"
           << "    working_set_at_realization: " << working_set_at_realization << "\n"
           << "    working_set_at_root: " << working_set_at_root << "\n";
    }
    void dump() const {
        auto os = aslog(0);
        dump(os);
    }

    bool equal(const ScheduleFeatures &other) const {
        const size_t n_features = ScheduleFeatures::num_features();
        for (size_t i = 0; i < n_features; i++) {
            if ((*this)[i] != other[i]) {
                return false;
            }
        }
        return true;
    }
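
    // Note that equal() compares feature-by-feature through operator[],
    // i.e. exact floating-point equality, which is appropriate for checking
    // that two code paths produce bit-identical featurizations. Illustrative
    // sketch, not part of the original header:
    //
    //   ScheduleFeatures a, b;
    //   assert(a.equal(b));  // every feature defaults to 0, so this holds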
};

/*
Some feature values cannot be cached, and need to be recomputed.
These intermediates allow for faster recomputation of such features.
*/
struct FeatureIntermediates {
    double inlined_calls;
    double num_scalars;
    double innermost_pure_loop_extent;
    double outer_parallelism;
};

}  // namespace Internal
}  // namespace Halide

#endif
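
Below is a minimal sketch of how the structs above might be filled in and
inspected. It is illustrative only: the main() driver, the field values, and
the build setup (having this header on the include path) are assumptions, not
Halide code, though every member and method used is declared above.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    #include "Featurization.h"

    using Halide::Internal::PipelineFeatures;
    using Halide::Internal::ScheduleFeatures;

    int main() {
        // Record a hypothetical Float32 stage containing two adds and a
        // pointwise load from another Func.
        PipelineFeatures pf;
        const int f32 = (int)PipelineFeatures::ScalarType::Float;
        pf.types_in_use[f32] = 1;
        pf.op_histogram[(int)PipelineFeatures::OpType::Add][f32] = 2;
        pf.pointwise_accesses[(int)PipelineFeatures::AccessType::LoadFunc][f32] = 1;
        // The templated dump() accepts any ostream-like sink.
        pf.dump(std::cerr);

        // Flatten a ScheduleFeatures into the vector of doubles a cost
        // model would ingest, using the indexed accessor.
        ScheduleFeatures sf;
        sf.num_realizations = 1;
        sf.vector_size = 8;
        std::vector<double> flat(ScheduleFeatures::num_features());
        for (size_t i = 0; i < flat.size(); i++) {
            flat[i] = sf[(int)i];
        }
        std::cerr << "num schedule features: " << flat.size() << "\n";
        return 0;
    }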