1 | //===- TargetTransformInfo.h ------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This pass exposes codegen information to IR-level passes. Every |
10 | /// transformation that uses codegen information is broken into three parts: |
11 | /// 1. The IR-level analysis pass. |
12 | /// 2. The IR-level transformation interface which provides the needed |
13 | /// information. |
14 | /// 3. Codegen-level implementation which uses target-specific hooks. |
15 | /// |
16 | /// This file defines #2, which is the interface that IR-level transformations |
17 | /// use for querying the codegen. |
18 | /// |
19 | //===----------------------------------------------------------------------===// |
20 | |
21 | #ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H |
22 | #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H |
23 | |
24 | #include "llvm/ADT/APInt.h" |
25 | #include "llvm/ADT/ArrayRef.h" |
26 | #include "llvm/Analysis/IVDescriptors.h" |
27 | #include "llvm/IR/FMF.h" |
28 | #include "llvm/IR/InstrTypes.h" |
29 | #include "llvm/IR/PassManager.h" |
30 | #include "llvm/Pass.h" |
31 | #include "llvm/Support/AtomicOrdering.h" |
32 | #include "llvm/Support/BranchProbability.h" |
33 | #include "llvm/Support/Compiler.h" |
34 | #include "llvm/Support/InstructionCost.h" |
35 | #include <functional> |
36 | #include <optional> |
37 | #include <utility> |
38 | |
39 | namespace llvm { |
40 | |
41 | namespace Intrinsic { |
42 | typedef unsigned ID; |
43 | } |
44 | |
45 | class AllocaInst; |
46 | class AssumptionCache; |
47 | class BlockFrequencyInfo; |
48 | class DominatorTree; |
49 | class BranchInst; |
50 | class Function; |
51 | class GlobalValue; |
52 | class InstCombiner; |
53 | class OptimizationRemarkEmitter; |
54 | class InterleavedAccessInfo; |
55 | class IntrinsicInst; |
56 | class LoadInst; |
57 | class Loop; |
58 | class LoopInfo; |
59 | class LoopVectorizationLegality; |
60 | class ProfileSummaryInfo; |
61 | class RecurrenceDescriptor; |
62 | class SCEV; |
63 | class ScalarEvolution; |
64 | class SmallBitVector; |
65 | class StoreInst; |
66 | class SwitchInst; |
67 | class TargetLibraryInfo; |
68 | class Type; |
69 | class VPIntrinsic; |
70 | struct KnownBits; |
71 | |
72 | /// Information about a load/store intrinsic defined by the target. |
73 | struct MemIntrinsicInfo { |
74 | /// This is the pointer that the intrinsic is loading from or storing to. |
75 | /// If this is non-null, then analysis/optimization passes can assume that |
76 | /// this intrinsic is functionally equivalent to a load/store from this |
77 | /// pointer. |
78 | Value *PtrVal = nullptr; |
79 | |
80 | // Ordering for atomic operations. |
81 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
82 | |
83 | // Same Id is set by the target for corresponding load/store intrinsics. |
84 | unsigned short MatchingId = 0; |
85 | |
86 | bool ReadMem = false; |
87 | bool WriteMem = false; |
88 | bool IsVolatile = false; |
89 | |
90 | bool isUnordered() const { |
91 | return (Ordering == AtomicOrdering::NotAtomic || |
92 | Ordering == AtomicOrdering::Unordered) && |
93 | !IsVolatile; |
94 | } |
95 | }; |
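
// Illustrative sketch (not part of the upstream header): a target's TTI
// implementation typically fills this struct when asked to describe one of
// its own load-like intrinsics. The intrinsic and its operand layout below
// are hypothetical.
//
//   MemIntrinsicInfo Info;
//   Info.PtrVal = II->getArgOperand(0); // address the intrinsic reads from
//   Info.ReadMem = true;                // behaves like a load
//   Info.WriteMem = false;
//   Info.IsVolatile = false;
//   Info.MatchingId = 1;                // pairs with the matching store form
//   bool CSEable = Info.isUnordered();  // plain loads: unordered, non-volatile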
96 | |
97 | /// Attributes of a target dependent hardware loop. |
98 | struct HardwareLoopInfo { |
99 | HardwareLoopInfo() = delete; |
100 | LLVM_ABI HardwareLoopInfo(Loop *L); |
101 | Loop *L = nullptr; |
102 | BasicBlock *ExitBlock = nullptr; |
103 | BranchInst *ExitBranch = nullptr; |
104 | const SCEV *ExitCount = nullptr; |
105 | IntegerType *CountType = nullptr; |
106 | Value *LoopDecrement = nullptr; // Decrement the loop counter by this |
107 | // value in every iteration. |
108 | bool IsNestingLegal = false; // Can a hardware loop be a parent to |
109 | // another hardware loop? |
110 | bool CounterInReg = false; // Should loop counter be updated in |
111 | // the loop via a phi? |
112 | bool PerformEntryTest = false; // Generate the intrinsic which also performs |
113 | // icmp ne zero on the loop counter value and |
114 | // produces an i1 to guard the loop entry. |
115 | LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, |
116 | DominatorTree &DT, |
117 | bool ForceNestedLoop = false, |
118 | bool ForceHardwareLoopPHI = false); |
119 | LLVM_ABI bool canAnalyze(LoopInfo &LI); |
120 | }; |
121 | |
122 | class IntrinsicCostAttributes { |
123 | const IntrinsicInst *II = nullptr; |
124 | Type *RetTy = nullptr; |
125 | Intrinsic::ID IID; |
126 | SmallVector<Type *, 4> ParamTys; |
127 | SmallVector<const Value *, 4> Arguments; |
128 | FastMathFlags FMF; |
129 | // If ScalarizationCost is invalid (the default), the cost of scalarizing |
130 | // the arguments and the return value will be computed based on types. |
131 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
132 | TargetLibraryInfo const *LibInfo = nullptr; |
133 | |
134 | public: |
135 | LLVM_ABI IntrinsicCostAttributes( |
136 | Intrinsic::ID Id, const CallBase &CI, |
137 | InstructionCost ScalarCost = InstructionCost::getInvalid(), |
138 | bool TypeBasedOnly = false, TargetLibraryInfo const *LibInfo = nullptr); |
139 | |
140 | LLVM_ABI IntrinsicCostAttributes( |
141 | Intrinsic::ID Id, Type *RTy, ArrayRef<Type *> Tys, |
142 | FastMathFlags Flags = FastMathFlags(), const IntrinsicInst *I = nullptr, |
143 | InstructionCost ScalarCost = InstructionCost::getInvalid()); |
144 | |
145 | LLVM_ABI IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy, |
146 | ArrayRef<const Value *> Args); |
147 | |
148 | LLVM_ABI IntrinsicCostAttributes( |
149 | Intrinsic::ID Id, Type *RTy, ArrayRef<const Value *> Args, |
150 | ArrayRef<Type *> Tys, FastMathFlags Flags = FastMathFlags(), |
151 | const IntrinsicInst *I = nullptr, |
152 | InstructionCost ScalarCost = InstructionCost::getInvalid(), |
153 | TargetLibraryInfo const *LibInfo = nullptr); |
154 | |
155 | Intrinsic::ID getID() const { return IID; } |
156 | const IntrinsicInst *getInst() const { return II; } |
157 | Type *getReturnType() const { return RetTy; } |
158 | FastMathFlags getFlags() const { return FMF; } |
159 | InstructionCost getScalarizationCost() const { return ScalarizationCost; } |
160 | const SmallVectorImpl<const Value *> &getArgs() const { return Arguments; } |
161 | const SmallVectorImpl<Type *> &getArgTypes() const { return ParamTys; } |
162 | const TargetLibraryInfo *getLibInfo() const { return LibInfo; } |
163 | |
164 | bool isTypeBasedOnly() const { |
165 | return Arguments.empty(); |
166 | } |
167 | |
168 | bool skipScalarizationCost() const { return ScalarizationCost.isValid(); } |
169 | }; |
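
// Illustrative sketch (not part of the upstream header): clients usually wrap
// the intrinsic they want priced in IntrinsicCostAttributes and hand it to the
// intrinsic cost query (getIntrinsicInstrCost in the full interface). `Ctx` is
// an assumed LLVMContext.
//
//   // Price a fabs over <4 x float>, by type only (no argument values).
//   Type *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
//   IntrinsicCostAttributes Attrs(Intrinsic::fabs, VecTy, {VecTy});
//   assert(Attrs.isTypeBasedOnly() && "no argument values were supplied");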
170 | |
171 | enum class TailFoldingStyle { |
172 | /// Don't use tail folding |
173 | None, |
174 | /// Use predicate only to mask operations on data in the loop. |
175 | /// When the VL is not known to be a power-of-2, this method requires a |
176 | /// runtime overflow check for the i + VL increment in the loop, because it |
177 | /// compares the scalar induction variable against the trip count rounded up |
178 | /// to a multiple of VL, and that rounded-up value may overflow. When the VL |
179 | /// is a power-of-2, both the increment and the rounded-up trip count wrap to |
180 | /// 0, so no runtime check is needed: the loop exits when the induction |
181 | /// variable equals the rounded-up trip count, both being 0. |
182 | Data, |
183 | /// Same as Data, but avoids using the get.active.lane.mask intrinsic to |
184 | /// calculate the mask and instead implements this with a |
185 | /// splat/stepvector/cmp. |
186 | /// FIXME: Can this kind be removed now that SelectionDAGBuilder expands the |
187 | /// active.lane.mask intrinsic when it is not natively supported? |
188 | DataWithoutLaneMask, |
189 | /// Use predicate to control both data and control flow. |
190 | /// This method always requires a runtime overflow check for the i + VL |
191 | /// increment inside the loop, because it uses the result directly in the |
192 | /// active.lane.mask to calculate the mask for the next iteration. If the |
193 | /// increment overflows, the mask is no longer correct. |
194 | DataAndControlFlow, |
195 | /// Use predicate to control both data and control flow, but modify |
196 | /// the trip count so that a runtime overflow check can be avoided |
197 | /// and such that the scalar epilogue loop can always be removed. |
198 | DataAndControlFlowWithoutRuntimeCheck, |
199 | /// Use predicated EVL instructions for tail-folding. |
200 | /// Indicates that VP intrinsics should be used. |
201 | DataWithEVL, |
202 | }; |
203 | |
204 | struct TailFoldingInfo { |
205 | TargetLibraryInfo *TLI; |
206 | LoopVectorizationLegality *LVL; |
207 | InterleavedAccessInfo *IAI; |
208 | TailFoldingInfo(TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL, |
209 | InterleavedAccessInfo *IAI) |
210 | : TLI(TLI), LVL(LVL), IAI(IAI) {} |
211 | }; |
212 | |
213 | class TargetTransformInfo; |
214 | typedef TargetTransformInfo TTI; |
215 | class TargetTransformInfoImplBase; |
216 | |
217 | /// This pass provides access to the codegen interfaces that are needed |
218 | /// for IR-level transformations. |
219 | class TargetTransformInfo { |
220 | public: |
221 | enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; |
222 | |
223 | /// Get the kind of extension that an instruction represents. |
224 | LLVM_ABI static PartialReductionExtendKind |
225 | getPartialReductionExtendKind(Instruction *I); |
226 | |
227 | /// Construct a TTI object using a type implementing the \c Concept |
228 | /// API below. |
229 | /// |
230 | /// This is used by targets to construct a TTI wrapping their target-specific |
231 | /// implementation that encodes appropriate costs for their target. |
232 | LLVM_ABI explicit TargetTransformInfo( |
233 | std::unique_ptr<const TargetTransformInfoImplBase> Impl); |
234 | |
235 | /// Construct a baseline TTI object using a minimal implementation of |
236 | /// the \c Concept API below. |
237 | /// |
238 | /// The TTI implementation will reflect the information in the DataLayout |
239 | /// provided if non-null. |
240 | LLVM_ABI explicit TargetTransformInfo(const DataLayout &DL); |
241 | |
242 | // Provide move semantics. |
243 | LLVM_ABI TargetTransformInfo(TargetTransformInfo &&Arg); |
244 | LLVM_ABI TargetTransformInfo &operator=(TargetTransformInfo &&RHS); |
245 | |
246 | // We need to define the destructor out-of-line to define our sub-classes |
247 | // out-of-line. |
248 | LLVM_ABI ~TargetTransformInfo(); |
249 | |
250 | /// Handle the invalidation of this information. |
251 | /// |
252 | /// When used as a result of \c TargetIRAnalysis this method will be called |
253 | /// when the function this was computed for changes. When it returns false, |
254 | /// the information is preserved across those changes. |
255 | bool invalidate(Function &, const PreservedAnalyses &, |
256 | FunctionAnalysisManager::Invalidator &) { |
257 | // FIXME: We should probably in some way ensure that the subtarget |
258 | // information for a function hasn't changed. |
259 | return false; |
260 | } |
261 | |
262 | /// \name Generic Target Information |
263 | /// @{ |
264 | |
265 | /// The kind of cost model. |
266 | /// |
267 | /// There are several different cost models that can be customized by the |
268 | /// target. The normalization of each cost model may be target specific. |
269 | /// e.g. TCK_SizeAndLatency should be comparable to target thresholds such as |
270 | /// those derived from MCSchedModel::LoopMicroOpBufferSize etc. |
271 | enum TargetCostKind { |
272 | TCK_RecipThroughput, ///< Reciprocal throughput. |
273 | TCK_Latency, ///< The latency of instruction. |
274 | TCK_CodeSize, ///< Instruction code size. |
275 | TCK_SizeAndLatency ///< The weighted sum of size and latency. |
276 | }; |
277 | |
278 | /// Underlying constants for 'cost' values in this interface. |
279 | /// |
280 | /// Many APIs in this interface return a cost. This enum defines the |
281 | /// fundamental values that should be used to interpret (and produce) those |
282 | /// costs. The costs are returned as an int rather than a member of this |
283 | /// enumeration because it is expected that the cost of one IR instruction |
284 | /// may have a multiplicative factor to it or otherwise won't fit directly |
285 | /// into the enum. Moreover, it is common to sum or average costs which works |
286 | /// better as simple integral values. Thus this enum only provides constants. |
287 | /// Also note that the returned costs are signed integers to make it natural |
288 | /// to add, subtract, and test with zero (a common boundary condition). It is |
289 | /// not expected that 2^32 is a realistic cost to be modeling at any point. |
290 | /// |
291 | /// Note that these costs should usually reflect the intersection of code-size |
292 | /// cost and execution cost. A free instruction is typically one that folds |
293 | /// into another instruction. For example, reg-to-reg moves can often be |
294 | /// skipped by renaming the registers in the CPU, but they still are encoded |
295 | /// and thus wouldn't be considered 'free' here. |
296 | enum TargetCostConstants { |
297 | TCC_Free = 0, ///< Expected to fold away in lowering. |
298 | TCC_Basic = 1, ///< The cost of a typical 'add' instruction. |
299 | TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86. |
300 | }; |
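
// Illustrative sketch (not part of the upstream header): a size-conscious pass
// might combine a cost kind with these constants roughly as follows, using the
// getInstructionCost query declared further down. `I` and `TTI` are assumed to
// be in scope.
//
//   InstructionCost Cost =
//       TTI.getInstructionCost(I, TargetTransformInfo::TCK_CodeSize);
//   bool CheapEnoughToDuplicate =
//       Cost.isValid() && Cost <= TargetTransformInfo::TCC_Basic;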
301 | |
302 | /// Estimate the cost of a GEP operation when lowered. |
303 | /// |
304 | /// \p PointeeType is the source element type of the GEP. |
305 | /// \p Ptr is the base pointer operand. |
306 | /// \p Operands is the list of indices following the base pointer. |
307 | /// |
308 | /// \p AccessType is a hint as to what type of memory might be accessed by |
309 | /// users of the GEP. getGEPCost will use it to determine if the GEP can be |
310 | /// folded into the addressing mode of a load/store. If AccessType is null, |
311 | /// then the resulting target type based off of PointeeType will be used as an |
312 | /// approximation. |
313 | LLVM_ABI InstructionCost |
314 | getGEPCost(Type *PointeeType, const Value *Ptr, |
315 | ArrayRef<const Value *> Operands, Type *AccessType = nullptr, |
316 | TargetCostKind CostKind = TCK_SizeAndLatency) const; |
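
// Illustrative sketch (not part of the upstream header): deciding whether a
// GEP is expected to fold into its users' addressing modes. `GEP` and `TTI`
// are assumed to be in scope.
//
//   SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
//   InstructionCost Cost =
//       TTI.getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
//                      Indices);
//   bool FoldsAway = (Cost == TargetTransformInfo::TCC_Free);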
317 | |
318 | /// Describe known properties for a set of pointers. |
319 | struct PointersChainInfo { |
320 | /// All the GEPs in a set have same base address. |
321 | unsigned IsSameBaseAddress : 1; |
322 | /// These properties are only valid if IsSameBaseAddress is set. |
323 | /// True if all pointers are separated by a unit stride. |
324 | unsigned IsUnitStride : 1; |
325 | /// True if the distance between any two neighbouring pointers is known. |
326 | unsigned IsKnownStride : 1; |
327 | unsigned Reserved : 29; |
328 | |
329 | bool isSameBase() const { return IsSameBaseAddress; } |
330 | bool isUnitStride() const { return IsSameBaseAddress && IsUnitStride; } |
331 | bool isKnownStride() const { return IsSameBaseAddress && IsKnownStride; } |
332 | |
333 | static PointersChainInfo getUnitStride() { |
334 | return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/1, |
335 | /*IsKnownStride=*/1, /*Reserved=*/0}; |
336 | } |
337 | static PointersChainInfo getKnownStride() { |
338 | return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/0, |
339 | /*IsKnownStride=*/1, /*Reserved=*/0}; |
340 | } |
341 | static PointersChainInfo getUnknownStride() { |
342 | return {/*IsSameBaseAddress=*/1, /*IsUnitStride=*/0, |
343 | /*IsKnownStride=*/0, /*Reserved=*/0}; |
344 | } |
345 | }; |
346 | static_assert(sizeof(PointersChainInfo) == 4, "Was size increase justified?"); |
347 | |
348 | /// Estimate the cost, when lowered, of the pointer operations for a chain of |
349 | /// pointers (typically the pointer operands of loads or stores in one block). |
350 | /// \p AccessTy is the type of the loads/stores that will ultimately use the |
351 | /// \p Ptrs. |
352 | LLVM_ABI InstructionCost getPointersChainCost( |
353 | ArrayRef<const Value *> Ptrs, const Value *Base, |
354 | const PointersChainInfo &Info, Type *AccessTy, |
355 | TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
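
// Illustrative sketch (not part of the upstream header): costing the pointer
// operands of a run of consecutive loads that share a base pointer. `Ptrs`,
// `Base`, `EltTy` and `TTI` are assumed to be in scope.
//
//   InstructionCost PtrCost = TTI.getPointersChainCost(
//       Ptrs, Base, TargetTransformInfo::PointersChainInfo::getUnitStride(),
//       EltTy);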
356 | |
357 | /// \returns A value by which our inlining threshold should be multiplied. |
358 | /// This is primarily used to bump up the inlining threshold wholesale on |
359 | /// targets where calls are unusually expensive. |
360 | /// |
361 | /// TODO: This is a rather blunt instrument. Perhaps altering the costs of |
362 | /// individual classes of instructions would be better. |
363 | LLVM_ABI unsigned getInliningThresholdMultiplier() const; |
364 | |
365 | LLVM_ABI unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const; |
366 | LLVM_ABI unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const; |
367 | |
368 | /// \returns The bonus of inlining the last call to a static function. |
369 | LLVM_ABI int getInliningLastCallToStaticBonus() const; |
370 | |
371 | /// \returns A value to be added to the inlining threshold. |
372 | LLVM_ABI unsigned adjustInliningThreshold(const CallBase *CB) const; |
373 | |
374 | /// \returns The cost of having an Alloca in the caller if not inlined, to be |
375 | /// added to the threshold |
376 | LLVM_ABI unsigned getCallerAllocaCost(const CallBase *CB, |
377 | const AllocaInst *AI) const; |
378 | |
379 | /// \returns Vector bonus in percent. |
380 | /// |
381 | /// Vector bonuses: We want to more aggressively inline vector-dense kernels |
382 | /// and apply this bonus based on the percentage of vector instructions. A |
383 | /// bonus is applied if the vector instructions exceed 50% and half that |
384 | /// amount is applied if it exceeds 10%. Note that these bonuses are somewhat |
385 | /// arbitrary and evolved over time by accident as much as because they are |
386 | /// principled bonuses. |
387 | /// FIXME: It would be nice to base the bonus values on something more |
388 | /// scientific. A target may have no bonus on vector instructions. |
389 | LLVM_ABI int getInlinerVectorBonusPercent() const; |
390 | |
391 | /// \return the expected cost of a memcpy, which could e.g. depend on the |
392 | /// source/destination type and alignment and the number of bytes copied. |
393 | LLVM_ABI InstructionCost getMemcpyCost(const Instruction *I) const; |
394 | |
395 | /// Returns the maximum memset / memcpy size in bytes that still makes it |
396 | /// profitable to inline the call. |
397 | LLVM_ABI uint64_t getMaxMemIntrinsicInlineSizeThreshold() const; |
398 | |
399 | /// \return The estimated number of case clusters when lowering \p 'SI'. |
400 | /// \p JTSize Set a jump table size only when \p SI is suitable for a jump |
401 | /// table. |
402 | LLVM_ABI unsigned |
403 | getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize, |
404 | ProfileSummaryInfo *PSI, |
405 | BlockFrequencyInfo *BFI) const; |
406 | |
407 | /// Estimate the cost of a given IR user when lowered. |
408 | /// |
409 | /// This can estimate the cost of either a ConstantExpr or Instruction when |
410 | /// lowered. |
411 | /// |
412 | /// \p Operands is a list of operands which can be a result of transformations |
413 | /// of the current operands. The number of operands on the list must be |
414 | /// equal to the number of operands the IR user currently has, and their |
415 | /// order on the list must match the order of the user's current |
416 | /// operands. |
417 | /// |
418 | /// The returned cost is defined in terms of \c TargetCostConstants, see its |
419 | /// comments for a detailed explanation of the cost values. |
420 | LLVM_ABI InstructionCost getInstructionCost(const User *U, |
421 | ArrayRef<const Value *> Operands, |
422 | TargetCostKind CostKind) const; |
423 | |
424 | /// This is a helper function which calls the three-argument |
425 | /// getInstructionCost with \p Operands which are the current operands U has. |
426 | InstructionCost getInstructionCost(const User *U, |
427 | TargetCostKind CostKind) const { |
428 | SmallVector<const Value *, 4> Operands(U->operand_values()); |
429 | return getInstructionCost(U, Operands, CostKind); |
430 | } |
431 | |
432 | /// If a branch or a select condition is skewed in one direction by more than |
433 | /// this factor, it is very likely to be predicted correctly. |
434 | LLVM_ABI BranchProbability getPredictableBranchThreshold() const; |
435 | |
436 | /// Returns the estimated penalty, in latency, of a branch misprediction. |
437 | /// This indicates how aggressively the target wants unpredictable branches |
438 | /// to be eliminated. A zero return value means extra optimization applied |
439 | /// to them should be minimal. |
440 | LLVM_ABI InstructionCost getBranchMispredictPenalty() const; |
441 | |
442 | /// Return true if branch divergence exists. |
443 | /// |
444 | /// Branch divergence has a significantly negative impact on GPU performance |
445 | /// when threads in the same wavefront take different paths due to conditional |
446 | /// branches. |
447 | /// |
448 | /// If \p F is passed, provides a context function. If \p F is known to only |
449 | /// execute in a single threaded environment, the target may choose to skip |
450 | /// uniformity analysis and assume all values are uniform. |
451 | LLVM_ABI bool hasBranchDivergence(const Function *F = nullptr) const; |
452 | |
453 | /// Returns whether V is a source of divergence. |
454 | /// |
455 | /// This function provides the target-dependent information for |
456 | /// the target-independent UniformityAnalysis. |
457 | LLVM_ABI bool isSourceOfDivergence(const Value *V) const; |
458 | |
459 | /// Returns true for the target-specific set of operations |
460 | /// that produce a uniform result even when |
461 | /// given non-uniform arguments. |
462 | LLVM_ABI bool isAlwaysUniform(const Value *V) const; |
463 | |
464 | /// Query the target whether the specified address space cast from FromAS to |
465 | /// ToAS is valid. |
466 | LLVM_ABI bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const; |
467 | |
468 | /// Return false if a \p AS0 address cannot possibly alias a \p AS1 address. |
469 | LLVM_ABI bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const; |
470 | |
471 | /// Returns the address space ID for a target's 'flat' address space. Note |
472 | /// this is not necessarily the same as addrspace(0), which LLVM sometimes |
473 | /// refers to as the generic address space. The flat address space is a |
474 | /// generic address space that can be used access multiple segments of memory |
475 | /// with different address spaces. Access of a memory location through a |
476 | /// pointer with this address space is expected to be legal but slower |
477 | /// compared to the same memory location accessed through a pointer with a |
478 | /// different address space. |
479 | // |
480 | /// This is for targets with different pointer representations which can |
481 | /// be converted with the addrspacecast instruction. If a pointer is converted |
482 | /// to this address space, optimizations should attempt to replace the access |
483 | /// with the source address space. |
484 | /// |
485 | /// \returns ~0u if the target does not have such a flat address space to |
486 | /// optimize away. |
487 | LLVM_ABI unsigned getFlatAddressSpace() const; |
488 | |
489 | /// Return any intrinsic address operand indexes which may be rewritten if |
490 | /// they use a flat address space pointer. |
491 | /// |
492 | /// \returns true if the intrinsic was handled. |
493 | LLVM_ABI bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, |
494 | Intrinsic::ID IID) const; |
495 | |
496 | LLVM_ABI bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const; |
497 | |
498 | /// Return true if globals in this address space can have initializers other |
499 | /// than `undef`. |
500 | LLVM_ABI bool |
501 | canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const; |
502 | |
503 | LLVM_ABI unsigned getAssumedAddrSpace(const Value *V) const; |
504 | |
505 | LLVM_ABI bool isSingleThreaded() const; |
506 | |
507 | LLVM_ABI std::pair<const Value *, unsigned> |
508 | getPredicatedAddrSpace(const Value *V) const; |
509 | |
510 | /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p |
511 | /// NewV, which has a different address space. This should happen for every |
512 | /// operand index that collectFlatAddressOperands returned for the intrinsic. |
513 | /// \returns nullptr if the intrinsic was not handled. Otherwise, returns the |
514 | /// new value (which may be the original \p II with modified operands). |
515 | LLVM_ABI Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, |
516 | Value *OldV, |
517 | Value *NewV) const; |
518 | |
519 | /// Test whether calls to a function lower to actual program function |
520 | /// calls. |
521 | /// |
522 | /// The idea is to test whether the program is likely to require a 'call' |
523 | /// instruction or equivalent in order to call the given function. |
524 | /// |
525 | /// FIXME: It's not clear that this is a good or useful query API. Clients |
526 | /// should probably move to simpler cost metrics using the above. |
527 | /// Alternatively, we could split the cost interface into distinct code-size |
528 | /// and execution-speed costs. This would allow modelling the core of this |
529 | /// query more accurately as a call is a single small instruction, but |
530 | /// incurs significant execution cost. |
531 | LLVM_ABI bool isLoweredToCall(const Function *F) const; |
532 | |
533 | struct LSRCost { |
534 | /// TODO: Some of these could be merged. Also, a lexical ordering |
535 | /// isn't always optimal. |
536 | unsigned Insns; |
537 | unsigned NumRegs; |
538 | unsigned AddRecCost; |
539 | unsigned NumIVMuls; |
540 | unsigned NumBaseAdds; |
541 | unsigned ImmCost; |
542 | unsigned SetupCost; |
543 | unsigned ScaleCost; |
544 | }; |
545 | |
546 | /// Parameters that control the generic loop unrolling transformation. |
547 | struct UnrollingPreferences { |
548 | /// The cost threshold for the unrolled loop. Should be relative to the |
549 | /// getInstructionCost values returned by this API, and the expectation is |
550 | /// that the unrolled loop's instructions when run through that interface |
551 | /// should not exceed this cost. However, this is only an estimate. Also, |
552 | /// specific loops may be unrolled even with a cost above this threshold if |
553 | /// deemed profitable. Set this to UINT_MAX to disable the loop body cost |
554 | /// restriction. |
555 | unsigned Threshold; |
556 | /// If complete unrolling will reduce the cost of the loop, we will boost |
557 | /// the Threshold by a certain percent to allow more aggressive complete |
558 | /// unrolling. This value provides the maximum boost percentage that we |
559 | /// can apply to Threshold (The value should be no less than 100). |
560 | /// BoostedThreshold = Threshold * min(RolledCost / UnrolledCost, |
561 | /// MaxPercentThresholdBoost / 100) |
562 | /// E.g. if complete unrolling reduces the loop execution time by 50% |
563 | /// then we boost the threshold by the factor of 2x. If unrolling is not |
564 | /// expected to reduce the running time, then we do not increase the |
565 | /// threshold. |
566 | unsigned MaxPercentThresholdBoost; |
567 | /// The cost threshold for the unrolled loop when optimizing for size (set |
568 | /// to UINT_MAX to disable). |
569 | unsigned OptSizeThreshold; |
570 | /// The cost threshold for the unrolled loop, like Threshold, but used |
571 | /// for partial/runtime unrolling (set to UINT_MAX to disable). |
572 | unsigned PartialThreshold; |
573 | /// The cost threshold for the unrolled loop when optimizing for size, like |
574 | /// OptSizeThreshold, but used for partial/runtime unrolling (set to |
575 | /// UINT_MAX to disable). |
576 | unsigned PartialOptSizeThreshold; |
577 | /// A forced unrolling factor (the number of concatenated bodies of the |
578 | /// original loop in the unrolled loop body). When set to 0, the unrolling |
579 | /// transformation will select an unrolling factor based on the current cost |
580 | /// threshold and other factors. |
581 | unsigned Count; |
582 | /// Default unroll count for loops with run-time trip count. |
583 | unsigned DefaultUnrollRuntimeCount; |
584 | // Set the maximum unrolling factor. The unrolling factor may be selected |
585 | // using the appropriate cost threshold, but may not exceed this number |
586 | // (set to UINT_MAX to disable). This does not apply in cases where the |
587 | // loop is being fully unrolled. |
588 | unsigned MaxCount; |
589 | /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound |
590 | /// to be overridden by a target gives more flexibility in certain cases. |
591 | /// By default, MaxUpperBound uses UnrollMaxUpperBound, whose value is 8. |
592 | unsigned MaxUpperBound; |
593 | /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but |
594 | /// applies even if full unrolling is selected. This allows a target to fall |
595 | /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount. |
596 | unsigned FullUnrollMaxCount; |
597 | // Represents number of instructions optimized when "back edge" |
598 | // becomes "fall through" in unrolled loop. |
599 | // For now we count a conditional branch on a backedge and a comparison |
600 | // feeding it. |
601 | unsigned BEInsns; |
602 | /// Allow partial unrolling (unrolling of loops to expand the size of the |
603 | /// loop body, not only to eliminate small constant-trip-count loops). |
604 | bool Partial; |
605 | /// Allow runtime unrolling (unrolling of loops to expand the size of the |
606 | /// loop body even when the number of loop iterations is not known at |
607 | /// compile time). |
608 | bool Runtime; |
609 | /// Allow generation of a loop remainder (extra iterations after unroll). |
610 | bool AllowRemainder; |
611 | /// Allow emitting expensive instructions (such as divisions) when computing |
612 | /// the trip count of a loop for runtime unrolling. |
613 | bool AllowExpensiveTripCount; |
614 | /// Apply loop unroll on any kind of loop |
615 | /// (mainly to loops that fail runtime unrolling). |
616 | bool Force; |
617 | /// Allow using trip count upper bound to unroll loops. |
618 | bool UpperBound; |
619 | /// Allow unrolling of all the iterations of the runtime loop remainder. |
620 | bool UnrollRemainder; |
621 | /// Allow unroll and jam. Used to enable unroll and jam for the target. |
622 | bool UnrollAndJam; |
623 | /// Threshold for unroll and jam, for inner loop size. The 'Threshold' |
624 | /// value above is used during unroll and jam for the outer loop size. |
625 | /// This value is used in the same manner to limit the size of the inner |
626 | /// loop. |
627 | unsigned UnrollAndJamInnerLoopThreshold; |
628 | /// Don't allow loop unrolling to simulate more than this number of |
629 | /// iterations when checking full unroll profitability |
630 | unsigned MaxIterationsCountToAnalyze; |
631 | /// Don't disable runtime unroll for the loops which were vectorized. |
632 | bool UnrollVectorizedLoop = false; |
633 | /// Don't allow runtime unrolling if expanding the trip count takes more |
634 | /// than SCEVExpansionBudget. |
635 | unsigned SCEVExpansionBudget; |
636 | /// Allow runtime unrolling multi-exit loops. Should only be set if the |
637 | /// target determined that multi-exit unrolling is profitable for the loop. |
638 | /// Fall back to the generic logic to determine whether multi-exit unrolling |
639 | /// is profitable if set to false. |
640 | bool RuntimeUnrollMultiExit; |
641 | }; |
642 | |
643 | /// Get target-customized preferences for the generic loop unrolling |
644 | /// transformation. The caller will initialize UP with the current |
645 | /// target-independent defaults. |
646 | LLVM_ABI void getUnrollingPreferences(Loop *L, ScalarEvolution &, |
647 | UnrollingPreferences &UP, |
648 | OptimizationRemarkEmitter *ORE) const; |
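
// Illustrative sketch (not part of the upstream header): roughly how a target
// implementation adjusts the defaults it is handed (the class name and the
// chosen values are made up).
//
//   void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
//                                           TTI::UnrollingPreferences &UP,
//                                           OptimizationRemarkEmitter *ORE) const {
//     UP.Partial = true;         // allow partial unrolling
//     UP.Runtime = true;         // and runtime unrolling
//     UP.MaxCount = 4;           // but never by more than a factor of 4
//     UP.PartialThreshold = 150; // with a slightly larger size budget
//   }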
649 | |
650 | /// Query the target whether it would be profitable to convert the given loop |
651 | /// into a hardware loop. |
652 | LLVM_ABI bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, |
653 | AssumptionCache &AC, |
654 | TargetLibraryInfo *LibInfo, |
655 | HardwareLoopInfo &HWLoopInfo) const; |
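
// Illustrative sketch (not part of the upstream header): the usual client
// pattern pairs HardwareLoopInfo (defined above) with this query. `L`, `SE`,
// `LI`, `DT`, `AC`, `LibInfo` and `TTI` are assumed to be in scope.
//
//   HardwareLoopInfo HWLoopInfo(L);
//   if (HWLoopInfo.canAnalyze(LI) &&
//       TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo) &&
//       HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT)) {
//     // Safe to emit the target's hardware-loop intrinsics for this loop.
//   }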
656 | |
657 | /// Query the target for the minimum vectorization factor at which epilogue |
658 | /// vectorization should be considered. |
659 | LLVM_ABI unsigned getEpilogueVectorizationMinVF() const; |
660 | |
661 | /// Query the target whether it would be preferred to create a predicated |
662 | /// vector loop, which can avoid the need to emit a scalar epilogue loop. |
663 | LLVM_ABI bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const; |
664 | |
665 | /// Query the target what the preferred style of tail folding is. |
666 | /// \param IVUpdateMayOverflow Tells whether it is known if the IV update |
667 | /// may (or will never) overflow for the suggested VF/UF in the given loop. |
668 | /// Targets can use this information to select a more optimal tail folding |
669 | /// style. The value conservatively defaults to true, such that no assumptions |
670 | /// are made on overflow. |
671 | LLVM_ABI TailFoldingStyle |
672 | getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const; |
673 | |
674 | /// Parameters that control the loop peeling transformation. |
675 | struct PeelingPreferences { |
676 | /// A forced peeling factor (the number of bodies of the original loop |
677 | /// that should be peeled off before the loop body). When set to 0, a |
678 | /// peeling factor is chosen based on profile information and other factors. |
679 | unsigned PeelCount; |
680 | /// Allow peeling off loop iterations. |
681 | bool AllowPeeling; |
682 | /// Allow peeling off loop iterations for loop nests. |
683 | bool AllowLoopNestsPeeling; |
684 | /// Allow peeling based on profile. Used to enable peeling off all |
685 | /// iterations based on the provided profile. |
686 | /// If the value is true, the peeling cost model can decide to peel only |
687 | /// some iterations, in which case it will set this to false. |
688 | bool PeelProfiledIterations; |
689 | |
690 | /// Peel off the last PeelCount loop iterations. |
691 | bool PeelLast; |
692 | }; |
693 | |
694 | /// Get target-customized preferences for the generic loop peeling |
695 | /// transformation. The caller will initialize \p PP with the current |
696 | /// target-independent defaults with information from \p L and \p SE. |
697 | LLVM_ABI void getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
698 | PeelingPreferences &PP) const; |
699 | |
700 | /// Targets can implement their own combinations for target-specific |
701 | /// intrinsics. This function will be called from the InstCombine pass every |
702 | /// time a target-specific intrinsic is encountered. |
703 | /// |
704 | /// \returns std::nullopt to not do anything target specific, or a value |
705 | /// that will be returned from the InstCombiner. It is also possible to stop |
706 | /// further processing of the intrinsic by returning nullptr. |
707 | LLVM_ABI std::optional<Instruction *> |
708 | instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; |
709 | /// Can be used to implement target-specific instruction combining. |
710 | /// \see instCombineIntrinsic |
711 | LLVM_ABI std::optional<Value *> |
712 | simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, |
713 | APInt DemandedMask, KnownBits &Known, |
714 | bool &KnownBitsComputed) const; |
715 | /// Can be used to implement target-specific instruction combining. |
716 | /// \see instCombineIntrinsic |
717 | LLVM_ABI std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( |
718 | InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, |
719 | APInt &UndefElts2, APInt &UndefElts3, |
720 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
721 | SimplifyAndSetOp) const; |
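
// Illustrative sketch (not part of the upstream header): the general shape of
// a target's instCombineIntrinsic hook. The intrinsic and the fold are made
// up; only the return-value contract matches the documentation above.
//
//   std::optional<Instruction *>
//   MyTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
//     if (II.getIntrinsicID() == Intrinsic::my_target_identity)
//       // Fold the hypothetical my.target.identity(x) --> x.
//       return IC.replaceInstUsesWith(II, II.getArgOperand(0));
//     return std::nullopt; // No target-specific combine; use generic handling.
//   }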
722 | /// @} |
723 | |
724 | /// \name Scalar Target Information |
725 | /// @{ |
726 | |
727 | /// Flags indicating the kind of support for population count. |
728 | /// |
729 | /// Compared to the SW implementation, HW support is supposed to |
730 | /// significantly boost the performance when the population is dense, and it |
731 | /// may or may not degrade performance if the population is sparse. A HW |
732 | /// support is considered as "Fast" if it can outperform, or is on a par |
733 | /// with, SW implementation when the population is sparse; otherwise, it is |
734 | /// considered as "Slow". |
735 | enum PopcntSupportKind { PSK_Software, PSK_SlowHardware, PSK_FastHardware }; |
736 | |
737 | /// Return true if the specified immediate is a legal add immediate, that |
738 | /// is the target has add instructions which can add a register with the |
739 | /// immediate without having to materialize the immediate into a register. |
740 | LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const; |
741 | |
742 | /// Return true if adding the specified scalable immediate is legal, that is |
743 | /// the target has add instructions which can add a register with the |
744 | /// immediate (multiplied by vscale) without having to materialize the |
745 | /// immediate into a register. |
746 | LLVM_ABI bool isLegalAddScalableImmediate(int64_t Imm) const; |
747 | |
748 | /// Return true if the specified immediate is a legal icmp immediate, |
749 | /// that is the target has icmp instructions which can compare a register |
750 | /// against the immediate without having to materialize the immediate into a |
751 | /// register. |
752 | LLVM_ABI bool isLegalICmpImmediate(int64_t Imm) const; |
753 | |
754 | /// Return true if the addressing mode represented by AM is legal for |
755 | /// this target, for a load/store of the specified type. |
756 | /// The type may be VoidTy, in which case only return true if the addressing |
757 | /// mode is legal for a load/store of any legal type. |
758 | /// If target returns true in LSRWithInstrQueries(), I may be valid. |
759 | /// \param ScalableOffset represents a quantity of bytes multiplied by vscale, |
760 | /// an invariant value known only at runtime. Most targets should not accept |
761 | /// a scalable offset. |
762 | /// |
763 | /// TODO: Handle pre/postinc as well. |
764 | LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, |
765 | int64_t BaseOffset, bool HasBaseReg, |
766 | int64_t Scale, unsigned AddrSpace = 0, |
767 | Instruction *I = nullptr, |
768 | int64_t ScalableOffset = 0) const; |
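
// Illustrative sketch (not part of the upstream header): checking whether a
// base-register + constant-offset access of an i32 would fold into a
// load/store on this target. `Ctx` and `TTI` are assumed to be in scope.
//
//   bool Legal = TTI.isLegalAddressingMode(Type::getInt32Ty(Ctx),
//                                          /*BaseGV=*/nullptr,
//                                          /*BaseOffset=*/16,
//                                          /*HasBaseReg=*/true,
//                                          /*Scale=*/0);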
769 | |
770 | /// Return true if LSR cost of C1 is lower than C2. |
771 | LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
772 | const TargetTransformInfo::LSRCost &C2) const; |
773 | |
774 | /// Return true if LSR major cost is number of registers. Targets which |
775 | /// implement their own isLSRCostLess and unset number of registers as major |
776 | /// cost should return false, otherwise return true. |
777 | LLVM_ABI bool isNumRegsMajorCostOfLSR() const; |
778 | |
779 | /// Return true if LSR should drop a found solution if it's calculated to be |
780 | /// less profitable than the baseline. |
781 | LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const; |
782 | |
783 | /// \returns true if LSR should not optimize a chain that includes \p I. |
784 | LLVM_ABI bool isProfitableLSRChainElement(Instruction *I) const; |
785 | |
786 | /// Return true if the target can fuse a compare and branch. |
787 | /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost |
788 | /// calculation for the instructions in a loop. |
789 | LLVM_ABI bool canMacroFuseCmp() const; |
790 | |
791 | /// Return true if the target can save a compare for loop count, for example |
792 | /// hardware loop saves a compare. |
793 | LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, |
794 | LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, |
795 | TargetLibraryInfo *LibInfo) const; |
796 | |
797 | enum AddressingModeKind { |
798 | AMK_PreIndexed, |
799 | AMK_PostIndexed, |
800 | AMK_None |
801 | }; |
802 | |
803 | /// Return the preferred addressing mode LSR should make efforts to generate. |
804 | LLVM_ABI AddressingModeKind |
805 | getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const; |
806 | |
807 | /// Return true if the target supports masked store. |
808 | LLVM_ABI bool isLegalMaskedStore(Type *DataType, Align Alignment, |
809 | unsigned AddressSpace) const; |
810 | /// Return true if the target supports masked load. |
811 | LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment, |
812 | unsigned AddressSpace) const; |
813 | |
814 | /// Return true if the target supports nontemporal store. |
815 | LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const; |
816 | /// Return true if the target supports nontemporal load. |
817 | LLVM_ABI bool isLegalNTLoad(Type *DataType, Align Alignment) const; |
818 | |
819 | /// \returns true if the target supports broadcasting a load to a vector of |
820 | /// type <NumElements x ElementTy>. |
821 | LLVM_ABI bool isLegalBroadcastLoad(Type *ElementTy, |
822 | ElementCount NumElements) const; |
823 | |
824 | /// Return true if the target supports masked scatter. |
825 | LLVM_ABI bool isLegalMaskedScatter(Type *DataType, Align Alignment) const; |
826 | /// Return true if the target supports masked gather. |
827 | LLVM_ABI bool isLegalMaskedGather(Type *DataType, Align Alignment) const; |
828 | /// Return true if the target forces scalarizing of llvm.masked.gather |
829 | /// intrinsics. |
830 | LLVM_ABI bool forceScalarizeMaskedGather(VectorType *Type, |
831 | Align Alignment) const; |
832 | /// Return true if the target forces scalarizing of llvm.masked.scatter |
833 | /// intrinsics. |
834 | LLVM_ABI bool forceScalarizeMaskedScatter(VectorType *Type, |
835 | Align Alignment) const; |
836 | |
837 | /// Return true if the target supports masked compress store. |
838 | LLVM_ABI bool isLegalMaskedCompressStore(Type *DataType, |
839 | Align Alignment) const; |
840 | /// Return true if the target supports masked expand load. |
841 | LLVM_ABI bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const; |
842 | |
843 | /// Return true if the target supports strided load. |
844 | LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const; |
845 | |
846 | /// Return true if the target supports interleaved access for the given vector |
847 | /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and |
848 | /// address space \p AddrSpace. |
849 | LLVM_ABI bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, |
850 | Align Alignment, |
851 | unsigned AddrSpace) const; |
852 | |
853 | // Return true if the target supports masked vector histograms. |
854 | LLVM_ABI bool isLegalMaskedVectorHistogram(Type *AddrType, |
855 | Type *DataType) const; |
856 | |
857 | /// Return true if this is an alternating opcode pattern that can be lowered |
858 | /// to a single instruction on the target. In X86 this is for the addsub |
859 | /// instruction which corresponds to a Shuffle + FAdd + FSub pattern in IR. |
860 | /// This function expects two opcodes, \p Opcode0 and \p Opcode1, selected by |
861 | /// \p OpcodeMask. The mask contains one bit per lane and is a `0` when |
862 | /// \p Opcode0 is selected and a `1` when \p Opcode1 is selected. |
863 | /// \p VecTy is the vector type of the instruction to be generated. |
864 | LLVM_ABI bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, |
865 | unsigned Opcode1, |
866 | const SmallBitVector &OpcodeMask) const; |
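
// Illustrative sketch (not part of the upstream header): asking whether an
// alternating fsub/fadd pattern over <4 x float> lowers to one instruction
// (an addsub-style operation). `Ctx` and `TTI` are assumed to be in scope.
//
//   auto *VecTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
//   SmallBitVector OpcodeMask(4);
//   OpcodeMask.set(0);
//   OpcodeMask.set(2); // lanes 0,2 take Opcode1 (FSub); lanes 1,3 take Opcode0 (FAdd)
//   bool IsSingleInstr = TTI.isLegalAltInstr(VecTy, Instruction::FAdd,
//                                            Instruction::FSub, OpcodeMask);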
867 | |
868 | /// Return true if we should be enabling ordered reductions for the target. |
869 | LLVM_ABI bool enableOrderedReductions() const; |
870 | |
871 | /// Return true if the target has a unified operation to calculate division |
872 | /// and remainder. If so, the additional implicit multiplication and |
873 | /// subtraction required to calculate a remainder from division are free. This |
874 | /// can enable more aggressive transformations for division and remainder than |
875 | /// would typically be allowed using throughput or size cost models. |
876 | LLVM_ABI bool hasDivRemOp(Type *DataType, bool IsSigned) const; |
877 | |
878 | /// Return true if the given instruction (assumed to be a memory access |
879 | /// instruction) has a volatile variant. If that's the case then we can avoid |
880 | /// addrspacecast to generic AS for volatile loads/stores. Default |
881 | /// implementation returns false, which prevents address space inference for |
882 | /// volatile loads/stores. |
883 | LLVM_ABI bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const; |
884 | |
885 | /// Return true if target doesn't mind addresses in vectors. |
886 | LLVM_ABI bool prefersVectorizedAddressing() const; |
887 | |
888 | /// Return the cost of the scaling factor used in the addressing |
889 | /// mode represented by AM for this target, for a load/store |
890 | /// of the specified type. |
891 | /// If the AM is supported, the return value must be >= 0. |
892 | /// If the AM is not supported, it returns a negative value. |
893 | /// TODO: Handle pre/postinc as well. |
894 | LLVM_ABI InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
895 | StackOffset BaseOffset, |
896 | bool HasBaseReg, int64_t Scale, |
897 | unsigned AddrSpace = 0) const; |
898 | |
899 | /// Return true if the loop strength reduce pass should make |
900 | /// Instruction* based TTI queries to isLegalAddressingMode(). This is |
901 | /// needed on SystemZ, where e.g. a memcpy can only have a 12 bit unsigned |
902 | /// immediate offset and no index register. |
903 | LLVM_ABI bool LSRWithInstrQueries() const; |
904 | |
905 | /// Return true if it's free to truncate a value of type Ty1 to type |
906 | /// Ty2. e.g. On x86 it's free to truncate an i32 value in register EAX to i16 |
907 | /// by referencing its sub-register AX. |
908 | LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const; |
909 | |
910 | /// Return true if it is profitable to hoist instruction in the |
911 | /// then/else to before if. |
912 | LLVM_ABI bool isProfitableToHoist(Instruction *I) const; |
913 | |
914 | LLVM_ABI bool useAA() const; |
915 | |
916 | /// Return true if this type is legal. |
917 | LLVM_ABI bool isTypeLegal(Type *Ty) const; |
918 | |
919 | /// Returns the estimated number of registers required to represent \p Ty. |
920 | LLVM_ABI unsigned getRegUsageForType(Type *Ty) const; |
921 | |
922 | /// Return true if switches should be turned into lookup tables for the |
923 | /// target. |
924 | LLVM_ABI bool shouldBuildLookupTables() const; |
925 | |
926 | /// Return true if switches should be turned into lookup tables |
927 | /// containing this constant value for the target. |
928 | LLVM_ABI bool shouldBuildLookupTablesForConstant(Constant *C) const; |
929 | |
930 | /// Return true if lookup tables should be turned into relative lookup tables. |
931 | LLVM_ABI bool shouldBuildRelLookupTables() const; |
932 | |
933 | /// Return true if the input function which is cold at all call sites, |
934 | /// should use coldcc calling convention. |
935 | LLVM_ABI bool useColdCCForColdCall(Function &F) const; |
936 | |
937 | LLVM_ABI bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const; |
938 | |
939 | /// Identifies if the vector form of the intrinsic has a scalar operand. |
940 | LLVM_ABI bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, |
941 | unsigned ScalarOpdIdx) const; |
942 | |
943 | /// Identifies if the vector form of the intrinsic is overloaded on the type |
944 | /// of the operand at index \p OpdIdx, or on the return type if \p OpdIdx is |
945 | /// -1. |
946 | LLVM_ABI bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, |
947 | int OpdIdx) const; |
948 | |
949 | /// Identifies if the vector form of the intrinsic that returns a struct is |
950 | /// overloaded at the struct element index \p RetIdx. |
951 | LLVM_ABI bool |
952 | isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, |
953 | int RetIdx) const; |
954 | |
955 | /// Estimate the overhead of scalarizing an instruction. Insert and Extract |
956 | /// are set if the demanded result elements need to be inserted and/or |
957 | /// extracted from vectors. The involved values may be passed in VL if |
958 | /// Insert is true. |
959 | LLVM_ABI InstructionCost getScalarizationOverhead( |
960 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, |
961 | TTI::TargetCostKind CostKind, bool ForPoisonSrc = true, |
962 | ArrayRef<Value *> VL = {}) const; |
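
// Illustrative sketch (not part of the upstream header): estimating the cost
// of moving every element of a <4 x i32> between vector and scalar registers,
// as a vectorizer might when it has to scalarize an operation. `Ctx` and `TTI`
// are assumed to be in scope.
//
//   auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
//   APInt DemandedElts = APInt::getAllOnes(4);
//   InstructionCost C = TTI.getScalarizationOverhead(
//       VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/true,
//       TTI::TCK_RecipThroughput);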
963 | |
964 | /// Estimate the overhead of scalarizing an instructions unique |
965 | /// non-constant operands. The (potentially vector) types to use for each of |
966 | /// argument are passes via Tys. |
967 | LLVM_ABI InstructionCost getOperandsScalarizationOverhead( |
968 | ArrayRef<const Value *> Args, ArrayRef<Type *> Tys, |
969 | TTI::TargetCostKind CostKind) const; |
970 | |
971 | /// If target has efficient vector element load/store instructions, it can |
972 | /// return true here so that insertion/extraction costs are not added to |
973 | /// the scalarization cost of a load/store. |
974 | LLVM_ABI bool supportsEfficientVectorElementLoadStore() const; |
975 | |
976 | /// If the target supports tail calls. |
977 | LLVM_ABI bool supportsTailCalls() const; |
978 | |
979 | /// If target supports tail call on \p CB |
980 | LLVM_ABI bool supportsTailCallFor(const CallBase *CB) const; |
981 | |
982 | /// Don't restrict interleaved unrolling to small loops. |
983 | LLVM_ABI bool enableAggressiveInterleaving(bool LoopHasReductions) const; |
984 | |
985 | /// Returns options for expansion of memcmp. IsZeroCmp is |
986 | /// true if this is the expansion of memcmp(p1, p2, s) == 0. |
987 | struct MemCmpExpansionOptions { |
988 | // Return true if memcmp expansion is enabled. |
989 | operator bool() const { return MaxNumLoads > 0; } |
990 | |
991 | // Maximum number of load operations. |
992 | unsigned MaxNumLoads = 0; |
993 | |
994 | // The list of available load sizes (in bytes), sorted in decreasing order. |
995 | SmallVector<unsigned, 8> LoadSizes; |
996 | |
997 | // For memcmp expansion when the memcmp result is only compared equal or |
998 | // not-equal to 0, allow up to this number of load pairs per block. As an |
999 | // example, this may allow 'memcmp(a, b, 3) == 0' in a single block: |
1000 | // a0 = load2bytes &a[0] |
1001 | // b0 = load2bytes &b[0] |
1002 | // a2 = load1byte &a[2] |
1003 | // b2 = load1byte &b[2] |
1004 | // r = cmp eq (a0 ^ b0 | a2 ^ b2), 0 |
1005 | unsigned NumLoadsPerBlock = 1; |
1006 | |
1007 | // Set to true to allow overlapping loads. For example, 7-byte compares can |
1008 | // be done with two 4-byte compares instead of 4+2+1-byte compares. This |
1009 | // requires all loads in LoadSizes to be doable in an unaligned way. |
1010 | bool AllowOverlappingLoads = false; |
1011 | |
1012 | // Sometimes, the amount of data that needs to be compared is smaller than |
1013 | // the standard register size, but it cannot be loaded with just one load |
1014 | // instruction. For example, if the size of the memory comparison is 6 |
1015 | // bytes, we can handle it more efficiently by loading all 6 bytes in a |
1016 | // single block and generating an 8-byte number, instead of generating two |
1017 | // separate blocks with conditional jumps for 4 and 2 byte loads. This |
1018 | // approach simplifies the process and produces the comparison result as |
1019 | // normal. This array lists the allowed sizes of memcmp tails that can be |
1020 | // merged into one block |
1021 | SmallVector<unsigned, 4> AllowedTailExpansions; |
1022 | }; |
1023 | LLVM_ABI MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, |
1024 | bool IsZeroCmp) const; |
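
// Illustrative sketch (not part of the upstream header): roughly how a target
// with fast unaligned 8/4/2/1-byte loads might answer this query (the class
// name and the chosen values are made up).
//
//   TTI::MemCmpExpansionOptions
//   MyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
//     TTI::MemCmpExpansionOptions Options;
//     Options.MaxNumLoads = OptSize ? 2 : 8; // non-zero enables expansion
//     Options.LoadSizes = {8, 4, 2, 1};      // in bytes, decreasing order
//     Options.AllowOverlappingLoads = true;  // e.g. 7 bytes as two 4-byte loads
//     if (IsZeroCmp)
//       Options.NumLoadsPerBlock = 2;        // equality-only: pair loads per block
//     return Options;
//   }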
1025 | |
1026 | /// Should the Select Optimization pass be enabled and run. |
1027 | LLVM_ABI bool enableSelectOptimize() const; |
1028 | |
1029 | /// Should the Select Optimization pass treat the given instruction like a |
1030 | /// select, potentially converting it to a conditional branch. This can |
1031 | /// include select-like instructions like or(zext(c), x) that can be converted |
1032 | /// to selects. |
1033 | LLVM_ABI bool shouldTreatInstructionLikeSelect(const Instruction *I) const; |
1034 | |
1035 | /// Enable matching of interleaved access groups. |
1036 | LLVM_ABI bool enableInterleavedAccessVectorization() const; |
1037 | |
1038 | /// Enable matching of interleaved access groups that contain predicated |
1039 | /// accesses or gaps and therefore vectorized using masked |
1040 | /// vector loads/stores. |
1041 | LLVM_ABI bool enableMaskedInterleavedAccessVectorization() const; |
1042 | |
1043 | /// Indicate that it is potentially unsafe to automatically vectorize |
1044 | /// floating-point operations because vector and scalar floating-point |
1045 | /// semantics may differ. For example, ARM NEON v7 SIMD math |
1046 | /// does not support IEEE-754 denormal numbers, while depending on the |
1047 | /// platform, scalar floating-point math does. |
1048 | /// This applies to floating-point math operations and calls, not memory |
1049 | /// operations, shuffles, or casts. |
1050 | LLVM_ABI bool isFPVectorizationPotentiallyUnsafe() const; |
1051 | |
1052 | /// Determine if the target supports unaligned memory accesses. |
1053 | LLVM_ABI bool allowsMisalignedMemoryAccesses(LLVMContext &Context, |
1054 | unsigned BitWidth, |
1055 | unsigned AddressSpace = 0, |
1056 | Align Alignment = Align(1), |
1057 | unsigned *Fast = nullptr) const; |
1058 | |
1059 | /// Return hardware support for population count. |
1060 | LLVM_ABI PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; |
1061 | |
1062 | /// Return true if the hardware has a fast square-root instruction. |
1063 | LLVM_ABI bool haveFastSqrt(Type *Ty) const; |
1064 | |
1065 | /// Return true if the cost of the instruction is too high to speculatively |
1066 | /// execute and should be kept behind a branch. |
1067 | /// This normally just wraps around a getInstructionCost() call, but some |
1068 | /// targets might report a low TCK_SizeAndLatency value that is incompatible |
1069 | /// with the fixed TCC_Expensive value. |
1070 | /// NOTE: This assumes the instruction passes isSafeToSpeculativelyExecute(). |
1071 | LLVM_ABI bool isExpensiveToSpeculativelyExecute(const Instruction *I) const; |
1072 | |
1073 | /// Return true if it is faster to check if a floating-point value is NaN |
1074 | /// (or not-NaN) versus a comparison against a constant FP zero value. |
1075 | /// Targets should override this if materializing a 0.0 for comparison is |
1076 | /// generally as cheap as checking for ordered/unordered. |
1077 | LLVM_ABI bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const; |
1078 | |
1079 | /// Return the expected cost of supporting the floating point operation |
1080 | /// of the specified type. |
1081 | LLVM_ABI InstructionCost getFPOpCost(Type *Ty) const; |
1082 | |
1083 | /// Return the expected cost of materializing for the given integer |
1084 | /// immediate of the specified type. |
1085 | LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, |
1086 | TargetCostKind CostKind) const; |
1087 | |
1088 | /// Return the expected cost of materialization for the given integer |
1089 | /// immediate of the specified type for a given instruction. The cost can be |
1090 | /// zero if the immediate can be folded into the specified instruction. |
1091 | LLVM_ABI InstructionCost getIntImmCostInst(unsigned Opc, unsigned Idx, |
1092 | const APInt &Imm, Type *Ty, |
1093 | TargetCostKind CostKind, |
1094 | Instruction *Inst = nullptr) const; |
1095 | LLVM_ABI InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
1096 | const APInt &Imm, Type *Ty, |
1097 | TargetCostKind CostKind) const; |
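
// Illustrative sketch (not part of the upstream header): a constant-hoisting
// style query -- is this immediate free when used directly as the second
// operand of an 'and', or should it be materialized once and reused? `Imm`,
// `Ctx` and `TTI` are assumed to be in scope.
//
//   InstructionCost ImmCost =
//       TTI.getIntImmCostInst(Instruction::And, /*Idx=*/1, Imm,
//                             Type::getInt64Ty(Ctx),
//                             TargetTransformInfo::TCK_SizeAndLatency);
//   bool FoldsIntoAnd = (ImmCost == TargetTransformInfo::TCC_Free);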
1098 | |
1099 | /// Return the expected cost for the given integer when optimising |
1100 | /// for size. This is different than the other integer immediate cost |
1101 | /// functions in that it is subtarget agnostic. This is useful when you e.g. |
1102 | /// target one ISA such as AArch32 but smaller encodings could be possible |
1103 | /// with another such as Thumb. This return value is used as a penalty when |
1104 | /// the total costs for a constant is calculated (the bigger the cost, the |
1105 | /// more beneficial constant hoisting is). |
1106 | LLVM_ABI InstructionCost getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, |
1107 | const APInt &Imm, |
1108 | Type *Ty) const; |
1109 | |
1110 | /// It can be advantageous to detach complex constants from their uses to make |
1111 | /// their generation cheaper. This hook allows targets to report when such |
1112 | /// transformations might negatively affect the code generation of the |
1113 | /// underlying operation. The motivating example is division, where hoisting |
1114 | /// constants prevents the code generator from transforming them into |
1115 | /// combinations of simpler operations. |
1116 | LLVM_ABI bool preferToKeepConstantsAttached(const Instruction &Inst, |
1117 | const Function &Fn) const; |
1118 | |
1119 | /// @} |
1120 | |
1121 | /// \name Vector Target Information |
1122 | /// @{ |
1123 | |
1124 | /// The various kinds of shuffle patterns for vector queries. |
1125 | enum ShuffleKind { |
1126 | SK_Broadcast, ///< Broadcast element 0 to all other elements. |
1127 | SK_Reverse, ///< Reverse the order of the vector. |
1128 | SK_Select, ///< Selects elements from the corresponding lane of |
1129 | ///< either source operand. This is equivalent to a |
1130 | ///< vector select with a constant condition operand. |
1131 | SK_Transpose, ///< Transpose two vectors. |
1132 | SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. |
1133 | SK_ExtractSubvector, ///< ExtractSubvector. Index indicates start offset. |
1134 | SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one |
1135 | ///< with any shuffle mask. |
1136 | SK_PermuteSingleSrc, ///< Shuffle elements of single source vector with any |
1137 | ///< shuffle mask. |
1138 | SK_Splice ///< Concatenates elements from the first input vector |
1139 | ///< with elements of the second input vector, returning |
1140 | ///< a vector of the same type as the input vectors. |
1141 | ///< Index indicates the start offset in the first input vector. |
1142 | }; |
1143 | |
1144 | /// Additional information about an operand's possible values. |
1145 | enum OperandValueKind { |
1146 | OK_AnyValue, // Operand can have any value. |
1147 | OK_UniformValue, // Operand is uniform (splat of a value). |
1148 | OK_UniformConstantValue, // Operand is uniform constant. |
1149 | OK_NonUniformConstantValue // Operand is a non-uniform constant value. |
1150 | }; |
1151 | |
1152 | /// Additional properties of an operand's values. |
1153 | enum OperandValueProperties { |
1154 | OP_None = 0, |
1155 | OP_PowerOf2 = 1, |
1156 | OP_NegatedPowerOf2 = 2, |
1157 | }; |
1158 | |
1159 | // Describe the values an operand can take. We're in the process |
1160 | // of migrating uses of OperandValueKind and OperandValueProperties |
1161 | // to use this class, and then will change the internal representation. |
1162 | struct OperandValueInfo { |
1163 | OperandValueKind Kind = OK_AnyValue; |
1164 | OperandValueProperties Properties = OP_None; |
1165 | |
1166 | bool isConstant() const { |
1167 | return Kind == OK_UniformConstantValue || Kind == OK_NonUniformConstantValue; |
1168 | } |
1169 | bool isUniform() const { |
1170 | return Kind == OK_UniformConstantValue || Kind == OK_UniformValue; |
1171 | } |
1172 | bool isPowerOf2() const { |
1173 | return Properties == OP_PowerOf2; |
1174 | } |
1175 | bool isNegatedPowerOf2() const { |
1176 | return Properties == OP_NegatedPowerOf2; |
1177 | } |
1178 | |
1179 | OperandValueInfo getNoProps() const { |
1180 | return {Kind, OP_None}; |
1181 | } |
1182 | }; |
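| // Illustrative sketch: querying operand information for a cost model, |
| // assuming `Op` is a Value* operand (e.g. a splat of the constant 8): |
| // |
| //   TTI::OperandValueInfo Info = TTI::getOperandInfo(Op); |
| //   if (Info.isConstant() && Info.isPowerOf2()) { |
| //     // Kind == OK_UniformConstantValue, Properties == OP_PowerOf2 here. |
| //   } |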
1183 | |
1184 | /// \return the number of registers in the target-provided register class. |
1185 | LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const; |
1186 | |
1187 | /// \return true if the target supports load/store that enables fault |
1188 | /// suppression of memory operands when the source condition is false. |
1189 | LLVM_ABI bool hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const; |
1190 | |
1191 | /// \return the target-provided register class ID for the provided type, |
1192 | /// accounting for type promotion and other type-legalization techniques that |
1193 | /// the target might apply. However, it specifically does not account for the |
1194 | /// scalarization or splitting of vector types. Should a vector type require |
1195 | /// scalarization or splitting into multiple underlying vector registers, that |
1196 | /// type should be mapped to a register class containing no registers. |
1197 | /// Specifically, this is designed to provide a simple, high-level view of the |
1198 | /// register allocation later performed by the backend. These register classes |
1199 | /// don't necessarily map onto the register classes used by the backend. |
1200 | /// FIXME: It's not currently possible to determine how many registers |
1201 | /// are used by the provided type. |
1202 | LLVM_ABI unsigned getRegisterClassForType(bool Vector, |
1203 | Type *Ty = nullptr) const; |
1204 | |
1205 | /// \return the target-provided register class name |
1206 | LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const; |
1207 | |
1208 | enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector }; |
1209 | |
1210 | /// \return The width of the largest scalar or vector register type. |
1211 | LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const; |
1212 | |
1213 | /// \return The width of the smallest vector register type. |
1214 | LLVM_ABI unsigned getMinVectorRegisterBitWidth() const; |
1215 | |
1216 | /// \return The maximum value of vscale if the target specifies an |
1217 | /// architectural maximum vector length, and std::nullopt otherwise. |
1218 | LLVM_ABI std::optional<unsigned> getMaxVScale() const; |
1219 | |
1220 | /// \return the value of vscale to tune the cost model for. |
1221 | LLVM_ABI std::optional<unsigned> getVScaleForTuning() const; |
1222 | |
1223 | /// \return true if vscale is known to be a power of 2 |
1224 | LLVM_ABI bool isVScaleKnownToBeAPowerOfTwo() const; |
1225 | |
1226 | /// \return True if the vectorization factor should be chosen to |
1227 | /// make the vector of the smallest element type match the size of a |
1228 | /// vector register. For wider element types, this could result in |
1229 | /// creating vectors that span multiple vector registers. |
1230 | /// If false, the vectorization factor will be chosen based on the |
1231 | /// size of the widest element type. |
1232 | /// \p K Register Kind for vectorization. |
1233 | LLVM_ABI bool |
1234 | shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; |
1235 | |
1236 | /// \return The minimum vectorization factor for types of given element |
1237 | /// bit width, or 0 if there is no minimum VF. The returned value only |
1238 | /// applies when shouldMaximizeVectorBandwidth returns true. |
1239 | /// If IsScalable is true, the returned ElementCount must be a scalable VF. |
1240 | LLVM_ABI ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; |
1241 | |
1242 | /// \return The maximum vectorization factor for types of given element |
1243 | /// bit width and opcode, or 0 if there is no maximum VF. |
1244 | /// Currently only used by the SLP vectorizer. |
1245 | LLVM_ABI unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const; |
1246 | |
1247 | /// \return The minimum vectorization factor for the store instruction. Given |
1248 | /// the initial estimation of the minimum vector factor and store value type, |
1249 | /// it tries to find the lowest possible VF which still might be profitable |
1250 | /// for the vectorization. |
1251 | /// \param VF Initial estimation of the minimum vector factor. |
1252 | /// \param ScalarMemTy Scalar memory type of the store operation. |
1253 | /// \param ScalarValTy Scalar type of the stored value. |
1254 | /// Currently only used by the SLP vectorizer. |
1255 | LLVM_ABI unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, |
1256 | Type *ScalarValTy) const; |
1257 | |
1258 | /// \return True if \p I should be considered for address type promotion. |
1259 | /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is |
1260 | /// profitable without finding other extensions fed by the same input. |
1261 | LLVM_ABI bool shouldConsiderAddressTypePromotion( |
1262 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const; |
1263 | |
1264 | /// \return The size of a cache line in bytes. |
1265 | LLVM_ABI unsigned getCacheLineSize() const; |
1266 | |
1267 | /// The possible cache levels |
1268 | enum class CacheLevel { |
1269 | L1D, // The L1 data cache |
1270 | L2D, // The L2 data cache |
1271 | |
1272 | // We currently do not model L3 caches, as their sizes differ widely between |
1273 | // microarchitectures. Also, we currently do not have a use for L3 cache |
1274 | // size modeling yet. |
1275 | }; |
1276 | |
1277 | /// \return The size of the cache level in bytes, if available. |
1278 | LLVM_ABI std::optional<unsigned> getCacheSize(CacheLevel Level) const; |
1279 | |
1280 | /// \return The associativity of the cache level, if available. |
1281 | LLVM_ABI std::optional<unsigned> |
1282 | getCacheAssociativity(CacheLevel Level) const; |
1283 | |
1284 | /// \return The minimum architectural page size for the target. |
1285 | LLVM_ABI std::optional<unsigned> getMinPageSize() const; |
1286 | |
1287 | /// \return How much before a load we should place the prefetch |
1288 | /// instruction. This is currently measured in number of |
1289 | /// instructions. |
1290 | LLVM_ABI unsigned getPrefetchDistance() const; |
1291 | |
1292 | /// Some HW prefetchers can handle accesses up to a certain constant stride. |
1293 | /// Sometimes prefetching is beneficial even below the HW prefetcher limit, |
1294 | /// and the arguments provided are meant to serve as a basis for deciding this |
1295 | /// for a particular loop. |
1296 | /// |
1297 | /// \param NumMemAccesses Number of memory accesses in the loop. |
1298 | /// \param NumStridedMemAccesses Number of the memory accesses that |
1299 | /// ScalarEvolution could find a known stride |
1300 | /// for. |
1301 | /// \param NumPrefetches Number of software prefetches that will be |
1302 | /// emitted as determined by the addresses |
1303 | /// involved and the cache line size. |
1304 | /// \param HasCall True if the loop contains a call. |
1305 | /// |
1306 | /// \return This is the minimum stride in bytes where it makes sense to start |
1307 | /// adding SW prefetches. The default is 1, i.e. prefetch with any |
1308 | /// stride. |
1309 | LLVM_ABI unsigned getMinPrefetchStride(unsigned NumMemAccesses, |
1310 | unsigned NumStridedMemAccesses, |
1311 | unsigned NumPrefetches, |
1312 | bool HasCall) const; |
1313 | |
1314 | /// \return The maximum number of iterations to prefetch ahead. If |
1315 | /// the required number of iterations is more than this number, no |
1316 | /// prefetching is performed. |
1317 | LLVM_ABI unsigned getMaxPrefetchIterationsAhead() const; |
1318 | |
1319 | /// \return True if prefetching should also be done for writes. |
1320 | LLVM_ABI bool enableWritePrefetching() const; |
1321 | |
1322 | /// \return True if the target wants to issue a prefetch in address space \p AS. |
1323 | LLVM_ABI bool shouldPrefetchAddressSpace(unsigned AS) const; |
1324 | |
1325 | /// \return The cost of a partial reduction, which is a reduction from a |
1326 | /// vector to another vector with fewer elements of larger size. They are |
1327 | /// represented by the llvm.experimental.partial.reduce.add intrinsic, which |
1328 | /// takes an accumulator and a binary operation operand that itself is fed by |
1329 | /// two extends. An example of an operation that uses a partial reduction is a |
1330 | /// dot product, which reduces two vectors to another vector with 4 times |
1331 | /// fewer and 4 times larger elements. |
1332 | LLVM_ABI InstructionCost getPartialReductionCost( |
1333 | unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, |
1334 | ElementCount VF, PartialReductionExtendKind OpAExtend, |
1335 | PartialReductionExtendKind OpBExtend, |
1336 | std::optional<unsigned> BinOp = std::nullopt) const; |
1337 | |
1338 | /// \return The maximum interleave factor that any transform should try to |
1339 | /// perform for this target. This number depends on the level of parallelism |
1340 | /// and the number of execution units in the CPU. |
1341 | LLVM_ABI unsigned getMaxInterleaveFactor(ElementCount VF) const; |
1342 | |
1343 | /// Collect properties of V used in cost analysis, e.g. OP_PowerOf2. |
1344 | LLVM_ABI static OperandValueInfo getOperandInfo(const Value *V); |
1345 | |
1346 | /// This is an approximation of reciprocal throughput of a math/logic op. |
1347 | /// A higher cost indicates less expected throughput. |
1348 | /// From Agner Fog's guides, reciprocal throughput is "the average number of |
1349 | /// clock cycles per instruction when the instructions are not part of a |
1350 | /// limiting dependency chain." |
1351 | /// Therefore, costs should be scaled to account for multiple execution units |
1352 | /// on the target that can process this type of instruction. For example, if |
1353 | /// there are 5 scalar integer units and 2 vector integer units that can |
1354 | /// calculate an 'add' in a single cycle, this model should indicate that the |
1355 | /// cost of the vector add instruction is 2.5 times the cost of the scalar |
1356 | /// add instruction. |
1357 | /// \p Args is an optional argument which holds the instruction operands |
1358 | /// values so the TTI can analyze those values searching for special |
1359 | /// cases or optimizations based on those values. |
1360 | /// \p CxtI is the optional original context instruction, if one exists, to |
1361 | /// provide even more information. |
1362 | /// \p TLibInfo is used to search for platform specific vector library |
1363 | /// functions for instructions that might be converted to calls (e.g. frem). |
1364 | LLVM_ABI InstructionCost getArithmeticInstrCost( |
1365 | unsigned Opcode, Type *Ty, |
1366 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1367 | TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, |
1368 | TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None}, |
1369 | ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr, |
1370 | const TargetLibraryInfo *TLibInfo = nullptr) const; |
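| // A minimal usage sketch (illustrative only; assumes `TTI` is a |
| // TargetTransformInfo reference and `Ctx` an LLVMContext): |
| // |
| //   auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4); |
| //   InstructionCost VecAdd = |
| //       TTI.getArithmeticInstrCost(Instruction::Add, V4I32); |
| //   InstructionCost ScalAdd = |
| //       TTI.getArithmeticInstrCost(Instruction::Add, Type::getInt32Ty(Ctx)); |
| //   // Under TCK_RecipThroughput and the 5-scalar/2-vector-unit example |
| //   // above, VecAdd would be roughly 2.5x ScalAdd. |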
1371 | |
1372 | /// Returns the cost estimation for an alternating opcode pattern that can be |
1373 | /// lowered to a single instruction on the target. On X86 this is for the |
1374 | /// addsub instruction, which corresponds to a Shuffle + FAdd + FSub pattern in |
1375 | /// IR. This function expects two opcodes, \p Opcode0 and \p Opcode1, selected |
1376 | /// by \p OpcodeMask. The mask contains one bit per lane and is `0` when |
1377 | /// \p Opcode0 is selected and `1` when \p Opcode1 is selected. |
1378 | /// \p VecTy is the vector type of the instruction to be generated. |
1379 | LLVM_ABI InstructionCost getAltInstrCost( |
1380 | VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, |
1381 | const SmallBitVector &OpcodeMask, |
1382 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1383 | |
1384 | /// \return The cost of a shuffle instruction of kind Kind and of type Tp. |
1385 | /// The exact mask may be passed as Mask, or else the array will be empty. |
1386 | /// The index and subtype parameters are used by the subvector insertion and |
1387 | /// extraction shuffle kinds to show the insert/extract point and the type of |
1388 | /// the subvector being inserted/extracted. The operands of the shuffle can be |
1389 | /// passed through \p Args, which helps improve the cost estimation in some |
1390 | /// cases, like in broadcast loads. |
1391 | /// NOTE: For subvector extractions Tp represents the source type. |
1392 | LLVM_ABI InstructionCost getShuffleCost( |
1393 | ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask = {}, |
1394 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0, |
1395 | VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {}, |
1396 | const Instruction *CxtI = nullptr) const; |
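| // Sketch: cost of broadcasting lane 0 of a <4 x float> (illustrative only; |
| // assumes `TTI` and `Ctx` as above): |
| // |
| //   auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4); |
| //   InstructionCost C = |
| //       TTI.getShuffleCost(TTI::SK_Broadcast, V4F32, {0, 0, 0, 0}); |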
1397 | |
1398 | /// Represents a hint about the context in which a cast is used. |
1399 | /// |
1400 | /// For zext/sext, the context of the cast is the operand, which must be a |
1401 | /// load of some kind. For trunc, the context of the cast is the single |
1402 | /// user of the instruction, which must be a store of some kind. |
1403 | /// |
1404 | /// This enum allows the vectorizer to give getCastInstrCost an idea of the |
1405 | /// type of cast it's dealing with, as not every cast is equal. For instance, |
1406 | /// the zext of a load may be free, but the zext of an interleaving load can |
/// be (very) expensive! |
1408 | /// |
1409 | /// See \c getCastContextHint to compute a CastContextHint from a cast |
1410 | /// Instruction*. Callers can use it if they don't need to override the |
1411 | /// context and just want it to be calculated from the instruction. |
1412 | /// |
1413 | /// FIXME: This handles the types of load/store that the vectorizer can |
1414 | /// produce, which are the cases where the context instruction is most |
1415 | /// likely to be incorrect. There are other situations where that can happen |
1416 | /// too, which might be handled here but in the long run a more general |
1417 | /// solution of costing multiple instructions at the same time may be better. |
1418 | enum class CastContextHint : uint8_t { |
1419 | None, ///< The cast is not used with a load/store of any kind. |
1420 | Normal, ///< The cast is used with a normal load/store. |
1421 | Masked, ///< The cast is used with a masked load/store. |
1422 | GatherScatter, ///< The cast is used with a gather/scatter. |
1423 | Interleave, ///< The cast is used with an interleaved load/store. |
1424 | Reversed, ///< The cast is used with a reversed load/store. |
1425 | }; |
1426 | |
1427 | /// Calculates a CastContextHint from \p I. |
1428 | /// This should be used by callers of getCastInstrCost if they wish to |
1429 | /// determine the context from some instruction. |
1430 | /// \returns the CastContextHint for ZExt/SExt/Trunc, None if \p I is nullptr, |
1431 | /// or if it's another type of cast. |
1432 | LLVM_ABI static CastContextHint getCastContextHint(const Instruction *I); |
1433 | |
1434 | /// \return The expected cost of cast instructions, such as bitcast, trunc, |
1435 | /// zext, etc. If there is an existing instruction that holds Opcode, it |
1436 | /// may be passed in the 'I' parameter. |
1437 | LLVM_ABI InstructionCost getCastInstrCost( |
1438 | unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, |
1439 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, |
1440 | const Instruction *I = nullptr) const; |
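| // Sketch combining getCastContextHint with getCastInstrCost, assuming `Ext` |
| // is an existing zext instruction whose operand is a load (illustrative): |
| // |
| //   TTI::CastContextHint CCH = TTI::getCastContextHint(Ext); |
| //   InstructionCost C = TTI.getCastInstrCost( |
| //       Instruction::ZExt, Ext->getType(), Ext->getOperand(0)->getType(), |
| //       CCH, TTI::TCK_RecipThroughput, Ext); |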
1441 | |
1442 | /// \return The expected cost of a sign- or zero-extended vector extract. Use |
1443 | /// Index = -1 to indicate that there is no information about the index value. |
1444 | LLVM_ABI InstructionCost |
1445 | getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, |
1446 | unsigned Index, TTI::TargetCostKind CostKind) const; |
1447 | |
1448 | /// \return The expected cost of control-flow related instructions such as |
1449 | /// Phi, Ret, Br, Switch. |
1450 | LLVM_ABI InstructionCost getCFInstrCost( |
1451 | unsigned Opcode, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, |
1452 | const Instruction *I = nullptr) const; |
1453 | |
1454 | /// \returns The expected cost of compare and select instructions. If there |
1455 | /// is an existing instruction that holds Opcode, it may be passed in the |
1456 | /// 'I' parameter. The \p VecPred parameter can be used to indicate the select |
1457 | /// is using a compare with the specified predicate as condition. When vector |
1458 | /// types are passed, \p VecPred must be used for all lanes. For a |
1459 | /// comparison, the two operands are the natural values. For a select, the |
1460 | /// two operands are the *value* operands, not the condition operand. |
1461 | LLVM_ABI InstructionCost getCmpSelInstrCost( |
1462 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
1463 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1464 | OperandValueInfo Op1Info = {OK_AnyValue, OP_None}, |
1465 | OperandValueInfo Op2Info = {OK_AnyValue, OP_None}, |
1466 | const Instruction *I = nullptr) const; |
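| // Sketch: costing a vector compare and the select it feeds (illustrative; |
| // a select that is not based on a compare is shown with BAD_ICMP_PREDICATE): |
| // |
| //   auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4); |
| //   auto *V4I1 = FixedVectorType::get(Type::getInt1Ty(Ctx), 4); |
| //   InstructionCost Cmp = TTI.getCmpSelInstrCost( |
| //       Instruction::ICmp, V4I32, V4I1, CmpInst::ICMP_EQ); |
| //   InstructionCost Sel = TTI.getCmpSelInstrCost( |
| //       Instruction::Select, V4I32, V4I1, CmpInst::BAD_ICMP_PREDICATE); |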
1467 | |
1468 | /// \return The expected cost of vector Insert and Extract. |
1469 | /// Use -1 to indicate that there is no information on the index value. |
1470 | /// This is used when the instruction is not available; a typical use |
1471 | /// case is to provision the cost of vectorization/scalarization in |
1472 | /// vectorizer passes. |
1473 | LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, |
1474 | TTI::TargetCostKind CostKind, |
1475 | unsigned Index = -1, |
1476 | const Value *Op0 = nullptr, |
1477 | const Value *Op1 = nullptr) const; |
1478 | |
1479 | /// \return The expected cost of vector Insert and Extract. |
1480 | /// Use -1 to indicate that there is no information on the index value. |
1481 | /// This is used when the instruction is not available; a typical use |
1482 | /// case is to provision the cost of vectorization/scalarization in |
1483 | /// vectorizer passes. |
1484 | /// \param ScalarUserAndIdx encodes the information about extracts from a |
1485 | /// vector, with 'Scalar' being the value being extracted, 'User' being the |
1486 | /// user of the extract (nullptr if the user is not known before |
1487 | /// vectorization), and 'Idx' being the extract lane. |
1488 | LLVM_ABI InstructionCost getVectorInstrCost( |
1489 | unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
1490 | Value *Scalar, |
1491 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const; |
1492 | |
1493 | /// \return The expected cost of vector Insert and Extract. |
1494 | /// This is used when the instruction is available; the implementation |
1495 | /// asserts that 'I' is not nullptr. |
1496 | /// |
1497 | /// A typical suitable use case is cost estimation when vector instruction |
1498 | /// exists (e.g., from basic blocks during transformation). |
1499 | LLVM_ABI InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, |
1500 | TTI::TargetCostKind CostKind, |
1501 | unsigned Index = -1) const; |
1502 | |
1503 | /// \return The expected cost of aggregate inserts and extracts. This is |
1504 | /// used when the instruction is not available; a typical use case is to |
1505 | /// provision the cost of vectorization/scalarization in vectorizer passes. |
1506 | LLVM_ABI InstructionCost getInsertExtractValueCost( |
1507 | unsigned Opcode, TTI::TargetCostKind CostKind) const; |
1508 | |
1509 | /// \return The cost of replication shuffle of \p VF elements typed \p EltTy |
1510 | /// \p ReplicationFactor times. |
1511 | /// |
1512 | /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: |
1513 | /// <0,0,0,1,1,1,2,2,2,3,3,3> |
1514 | LLVM_ABI InstructionCost getReplicationShuffleCost( |
1515 | Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, |
1516 | TTI::TargetCostKind CostKind) const; |
1517 | |
1518 | /// \return The cost of Load and Store instructions. |
1519 | LLVM_ABI InstructionCost getMemoryOpCost( |
1520 | unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, |
1521 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1522 | OperandValueInfo OpdInfo = {OK_AnyValue, OP_None}, |
1523 | const Instruction *I = nullptr) const; |
1524 | |
1525 | /// \return The cost of VP Load and Store instructions. |
1526 | LLVM_ABI InstructionCost getVPMemoryOpCost( |
1527 | unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, |
1528 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1529 | const Instruction *I = nullptr) const; |
1530 | |
1531 | /// \return The cost of masked Load and Store instructions. |
1532 | LLVM_ABI InstructionCost getMaskedMemoryOpCost( |
1533 | unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, |
1534 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1535 | |
1536 | /// \return The cost of Gather or Scatter operation |
1537 | /// \p Opcode - is a type of memory access Load or Store |
1538 | /// \p DataTy - a vector type of the data to be loaded or stored |
1539 | /// \p Ptr - pointer [or vector of pointers] - address[es] in memory |
1540 | /// \p VariableMask - true when the memory access is predicated with a mask |
1541 | /// that is not a compile-time constant |
1542 | /// \p Alignment - alignment of single element |
1543 | /// \p I - the optional original context instruction, if one exists, e.g. the |
1544 | /// load/store to transform or the call to the gather/scatter intrinsic |
1545 | LLVM_ABI InstructionCost getGatherScatterOpCost( |
1546 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
1547 | Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1548 | const Instruction *I = nullptr) const; |
1549 | |
1550 | /// \return The cost of Expand Load or Compress Store operation |
1551 | /// \p Opcode - is a type of memory access Load or Store |
1552 | /// \p DataTy - a vector type of the data to be loaded or stored |
1553 | /// \p VariableMask - true when the memory access is predicated with a mask |
1554 | /// that is not a compile-time constant |
1555 | /// \p Alignment - alignment of single element |
1556 | /// \p I - the optional original context instruction, if one exists, e.g. the |
1557 | /// load/store to transform or the call to the gather/scatter intrinsic |
1558 | LLVM_ABI InstructionCost getExpandCompressMemoryOpCost( |
1559 | unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment, |
1560 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1561 | const Instruction *I = nullptr) const; |
1562 | |
1563 | /// \return The cost of strided memory operations. |
1564 | /// \p Opcode - is a type of memory access Load or Store |
1565 | /// \p DataTy - a vector type of the data to be loaded or stored |
1566 | /// \p Ptr - pointer [or vector of pointers] - address[es] in memory |
1567 | /// \p VariableMask - true when the memory access is predicated with a mask |
1568 | /// that is not a compile-time constant |
1569 | /// \p Alignment - alignment of single element |
1570 | /// \p I - the optional original context instruction, if one exists, e.g. the |
1571 | /// load/store to transform or the call to the gather/scatter intrinsic |
1572 | LLVM_ABI InstructionCost getStridedMemoryOpCost( |
1573 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
1574 | Align Alignment, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1575 | const Instruction *I = nullptr) const; |
1576 | |
1577 | /// \return The cost of the interleaved memory operation. |
1578 | /// \p Opcode is the memory operation code |
1579 | /// \p VecTy is the vector type of the interleaved access. |
1580 | /// \p Factor is the interleave factor |
1581 | /// \p Indices is the indices for interleaved load members (as interleaved |
1582 | /// load allows gaps) |
1583 | /// \p Alignment is the alignment of the memory operation |
1584 | /// \p AddressSpace is address space of the pointer. |
1585 | /// \p UseMaskForCond indicates if the memory access is predicated. |
1586 | /// \p UseMaskForGaps indicates if gaps should be masked. |
1587 | LLVM_ABI InstructionCost getInterleavedMemoryOpCost( |
1588 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
1589 | Align Alignment, unsigned AddressSpace, |
1590 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
1591 | bool UseMaskForCond = false, bool UseMaskForGaps = false) const; |
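| // Sketch: an interleave group with Factor = 2 loaded as one wide <8 x i32>, |
| // where both members (indices 0 and 1) are used; names are illustrative: |
| // |
| //   auto *WideTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 8); |
| //   InstructionCost C = TTI.getInterleavedMemoryOpCost( |
| //       Instruction::Load, WideTy, /*Factor=*/2, /*Indices=*/{0, 1}, |
| //       Align(16), /*AddressSpace=*/0); |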
1592 | |
1593 | /// A helper function to determine the type of reduction algorithm used |
1594 | /// for a given \p Opcode and set of FastMathFlags \p FMF. |
1595 | static bool requiresOrderedReduction(std::optional<FastMathFlags> FMF) { |
1596 | return FMF && !(*FMF).allowReassoc(); |
1597 | } |
1598 | |
1599 | /// Calculate the cost of vector reduction intrinsics. |
1600 | /// |
1601 | /// This is the cost of reducing the vector value of type \p Ty to a scalar |
1602 | /// value using the operation denoted by \p Opcode. The FastMathFlags |
1603 | /// parameter \p FMF indicates what type of reduction we are performing: |
1604 | /// 1. Tree-wise. This is the typical 'fast' reduction performed that |
1605 | /// involves successively splitting a vector into half and doing the |
1606 | /// operation on the pair of halves until you have a scalar value. For |
1607 | /// example: |
1608 | /// (v0, v1, v2, v3) |
1609 | /// ((v0+v2), (v1+v3), undef, undef) |
1610 | /// ((v0+v2+v1+v3), undef, undef, undef) |
1611 | /// This is the default behaviour for integer operations, whereas for |
1612 | /// floating point we only do this if \p FMF indicates that |
1613 | /// reassociation is allowed. |
1614 | /// 2. Ordered. For a vector with N elements this involves performing N |
1615 | /// operations in lane order, starting with an initial scalar value, i.e. |
1616 | /// result = InitVal + v0 |
1617 | /// result = result + v1 |
1618 | /// result = result + v2 |
1619 | /// result = result + v3 |
1620 | /// This is only the case for FP operations and when reassociation is not |
1621 | /// allowed. |
1622 | /// |
1623 | LLVM_ABI InstructionCost getArithmeticReductionCost( |
1624 | unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, |
1625 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
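| // Sketch: the same fadd reduction costed tree-wise (reassociation allowed) |
| // and ordered (strict FP); names are illustrative: |
| // |
| //   auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4); |
| //   FastMathFlags Reassoc; |
| //   Reassoc.setAllowReassoc(); |
| //   InstructionCost Tree = |
| //       TTI.getArithmeticReductionCost(Instruction::FAdd, V4F32, Reassoc); |
| //   InstructionCost Ordered = TTI.getArithmeticReductionCost( |
| //       Instruction::FAdd, V4F32, FastMathFlags()); |
| //   // requiresOrderedReduction(FastMathFlags()) is true, so `Ordered` |
| //   // models the lane-by-lane sequence described above. |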
1626 | |
1627 | LLVM_ABI InstructionCost getMinMaxReductionCost( |
1628 | Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF = FastMathFlags(), |
1629 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1630 | |
1631 | /// Calculate the cost of an extended reduction pattern, similar to |
1632 | /// getArithmeticReductionCost of an Add reduction with multiply and optional |
1633 | /// extensions. This is the cost of an expression such as: |
1634 | /// ResTy vecreduce.add(mul(A, B)). |
1635 | /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B))). |
1636 | LLVM_ABI InstructionCost getMulAccReductionCost( |
1637 | bool IsUnsigned, Type *ResTy, VectorType *Ty, |
1638 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1639 | |
1640 | /// Calculate the cost of an extended reduction pattern, similar to |
1641 | /// getArithmeticReductionCost of a reduction with an extension. |
1642 | /// This is the cost of an expression such as: |
1643 | /// ResTy vecreduce.opcode(ext(Ty A)). |
1644 | LLVM_ABI InstructionCost getExtendedReductionCost( |
1645 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, |
1646 | std::optional<FastMathFlags> FMF, |
1647 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; |
1648 | |
1649 | /// \returns The cost of Intrinsic instructions. Analyses the real arguments. |
1650 | /// Three cases are handled: 1. scalar instruction 2. vector instruction |
1651 | /// 3. scalar instruction which is to be vectorized. |
1652 | LLVM_ABI InstructionCost getIntrinsicInstrCost( |
1653 | const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const; |
1654 | |
1655 | /// \returns The cost of Call instructions. |
1656 | LLVM_ABI InstructionCost getCallInstrCost( |
1657 | Function *F, Type *RetTy, ArrayRef<Type *> Tys, |
1658 | TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const; |
1659 | |
1660 | /// \returns The number of pieces into which the provided type must be |
1661 | /// split during legalization. Zero is returned when the answer is unknown. |
1662 | LLVM_ABI unsigned getNumberOfParts(Type *Tp) const; |
1663 | |
1664 | /// \returns The cost of the address computation. For most targets this can be |
1665 | /// merged into the instruction indexing mode. Some targets might want to |
1666 | /// distinguish between address computation for memory operations on vector |
1667 | /// types and scalar types. Such targets should override this function. |
1668 | /// The 'SE' parameter holds a pointer to the scalar evolution object, which |
1669 | /// is used to get the step value of 'Ptr' in the case of a constant stride. |
1670 | /// The 'Ptr' parameter holds the SCEV of the access pointer. |
1671 | LLVM_ABI InstructionCost getAddressComputationCost( |
1672 | Type *Ty, ScalarEvolution *SE = nullptr, const SCEV *Ptr = nullptr) const; |
1673 | |
1674 | /// \returns The cost, if any, of keeping values of the given types alive |
1675 | /// over a callsite. |
1676 | /// |
1677 | /// Some types may require the use of register classes that do not have |
1678 | /// any callee-saved registers, so would require a spill and fill. |
1679 | LLVM_ABI InstructionCost |
1680 | getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const; |
1681 | |
1682 | /// \returns True if the intrinsic is a supported memory intrinsic. Info |
1683 | /// will contain additional information - whether the intrinsic may read |
1684 | /// or write memory, its volatility, and the pointer. Info is undefined |
1685 | /// if false is returned. |
1686 | LLVM_ABI bool getTgtMemIntrinsic(IntrinsicInst *Inst, |
1687 | MemIntrinsicInfo &Info) const; |
1688 | |
1689 | /// \returns The maximum element size, in bytes, for an element |
1690 | /// unordered-atomic memory intrinsic. |
1691 | LLVM_ABI unsigned getAtomicMemIntrinsicMaxElementSize() const; |
1692 | |
1693 | /// \returns A value which is the result of the given memory intrinsic. New |
1694 | /// instructions may be created to extract the result from the given intrinsic |
1695 | /// memory operation. Returns nullptr if the target cannot create a result |
1696 | /// from the given intrinsic. |
1697 | LLVM_ABI Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
1698 | Type *ExpectedType) const; |
1699 | |
1700 | /// \returns The type to use in a loop expansion of a memcpy call. |
1701 | LLVM_ABI Type *getMemcpyLoopLoweringType( |
1702 | LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, |
1703 | unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, |
1704 | std::optional<uint32_t> AtomicElementSize = std::nullopt) const; |
1705 | |
1706 | /// \param[out] OpsOut The operand types to copy RemainingBytes of memory. |
1707 | /// \param RemainingBytes The number of bytes to copy. |
1708 | /// |
1709 | /// Calculates the operand types to use when copying \p RemainingBytes of |
1710 | /// memory, where source and destination alignments are \p SrcAlign and |
1711 | /// \p DestAlign respectively. |
1712 | LLVM_ABI void getMemcpyLoopResidualLoweringType( |
1713 | SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context, |
1714 | unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, |
1715 | Align SrcAlign, Align DestAlign, |
1716 | std::optional<uint32_t> AtomicCpySize = std::nullopt) const; |
1717 | |
1718 | /// \returns True if the two functions have compatible attributes for inlining |
1719 | /// purposes. |
1720 | LLVM_ABI bool areInlineCompatible(const Function *Caller, |
1721 | const Function *Callee) const; |
1722 | |
1723 | /// Returns a penalty for invoking call \p Call in \p F. |
1724 | /// For example, if a function F calls a function G, which in turn calls |
1725 | /// function H, then getInlineCallPenalty(F, H()) would return the |
1726 | /// penalty of calling H from F, e.g. after inlining G into F. |
1727 | /// \p DefaultCallPenalty is passed to give a default penalty that |
1728 | /// the target can amend or override. |
1729 | LLVM_ABI unsigned getInlineCallPenalty(const Function *F, |
1730 | const CallBase &Call, |
1731 | unsigned DefaultCallPenalty) const; |
1732 | |
1733 | /// \returns True if the caller and callee agree on how \p Types will be |
1734 | /// passed to or returned from the callee. |
1736 | /// \param Types List of types to check. |
1737 | LLVM_ABI bool areTypesABICompatible(const Function *Caller, |
1738 | const Function *Callee, |
1739 | const ArrayRef<Type *> &Types) const; |
1740 | |
1741 | /// The type of load/store indexing. |
1742 | enum MemIndexedMode { |
1743 | MIM_Unindexed, ///< No indexing. |
1744 | MIM_PreInc, ///< Pre-incrementing. |
1745 | MIM_PreDec, ///< Pre-decrementing. |
1746 | MIM_PostInc, ///< Post-incrementing. |
1747 | MIM_PostDec ///< Post-decrementing. |
1748 | }; |
1749 | |
1750 | /// \returns True if the specified indexed load for the given type is legal. |
1751 | LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const; |
1752 | |
1753 | /// \returns True if the specified indexed store for the given type is legal. |
1754 | LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const; |
1755 | |
1756 | /// \returns The bitwidth of the largest vector type that should be used to |
1757 | /// load/store in the given address space. |
1758 | LLVM_ABI unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; |
1759 | |
1760 | /// \returns True if the load instruction is legal to vectorize. |
1761 | LLVM_ABI bool isLegalToVectorizeLoad(LoadInst *LI) const; |
1762 | |
1763 | /// \returns True if the store instruction is legal to vectorize. |
1764 | LLVM_ABI bool isLegalToVectorizeStore(StoreInst *SI) const; |
1765 | |
1766 | /// \returns True if it is legal to vectorize the given load chain. |
1767 | LLVM_ABI bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
1768 | Align Alignment, |
1769 | unsigned AddrSpace) const; |
1770 | |
1771 | /// \returns True if it is legal to vectorize the given store chain. |
1772 | LLVM_ABI bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
1773 | Align Alignment, |
1774 | unsigned AddrSpace) const; |
1775 | |
1776 | /// \returns True if it is legal to vectorize the given reduction kind. |
1777 | LLVM_ABI bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, |
1778 | ElementCount VF) const; |
1779 | |
1780 | /// \returns True if the given type is supported for scalable vectors |
1781 | LLVM_ABI bool isElementTypeLegalForScalableVector(Type *Ty) const; |
1782 | |
1783 | /// \returns The new vector factor value if the target doesn't support \p |
1784 | /// ChainSizeInBytes loads or has a better vector factor. |
1785 | LLVM_ABI unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, |
1786 | unsigned ChainSizeInBytes, |
1787 | VectorType *VecTy) const; |
1788 | |
1789 | /// \returns The new vector factor value if the target doesn't support \p |
1790 | /// ChainSizeInBytes stores or has a better vector factor. |
1791 | LLVM_ABI unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, |
1792 | unsigned ChainSizeInBytes, |
1793 | VectorType *VecTy) const; |
1794 | |
1795 | /// \returns True if the target prefers fixed-width vectorization if the |
1796 | /// loop vectorizer's cost-model assigns an equal cost to the fixed and |
1797 | /// scalable version of the vectorized loop. |
1798 | LLVM_ABI bool preferFixedOverScalableIfEqualCost() const; |
1799 | |
1800 | /// \returns True if the target prefers the SLP vectorizer to use alternate |
1801 | /// opcode vectorization, false otherwise. |
1802 | LLVM_ABI bool preferAlternateOpcodeVectorization() const; |
1803 | |
1804 | /// \returns True if the target prefers reductions of \p Kind to be performed |
1805 | /// in the loop. |
1806 | LLVM_ABI bool preferInLoopReduction(RecurKind Kind, Type *Ty) const; |
1807 | |
1808 | /// \returns True if the target prefers the select of a reduction to be kept |
1809 | /// in the loop when tail folding, i.e. |
1810 | /// loop: |
1811 | /// p = phi (0, s) |
1812 | /// a = add (p, x) |
1813 | /// s = select (mask, a, p) |
1814 | /// vecreduce.add(s) |
1815 | /// |
1816 | /// As opposed to the normal scheme of p = phi (0, a) which allows the select |
1817 | /// to be pulled out of the loop. If the select(.., add, ..) can be predicated |
1818 | /// by the target, this can lead to cleaner code generation. |
1819 | LLVM_ABI bool preferPredicatedReductionSelect() const; |
1820 | |
1821 | /// Return true if the loop vectorizer should consider vectorizing an |
1822 | /// otherwise scalar epilogue loop. |
1823 | LLVM_ABI bool preferEpilogueVectorization() const; |
1824 | |
1825 | /// \returns True if the target wants to expand the given reduction intrinsic |
1826 | /// into a shuffle sequence. |
1827 | LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const; |
1828 | |
1829 | enum struct ReductionShuffle { SplitHalf, Pairwise }; |
1830 | |
1831 | /// \returns The shuffle sequence pattern used to expand the given reduction |
1832 | /// intrinsic. |
1833 | LLVM_ABI ReductionShuffle |
1834 | getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const; |
1835 | |
1836 | /// \returns the size cost of rematerializing a GlobalValue address relative |
1837 | /// to a stack reload. |
1838 | LLVM_ABI unsigned getGISelRematGlobalCost() const; |
1839 | |
1840 | /// \returns the lower bound of a trip count to decide on vectorization |
1841 | /// while tail-folding. |
1842 | LLVM_ABI unsigned getMinTripCountTailFoldingThreshold() const; |
1843 | |
1844 | /// \returns True if the target supports scalable vectors. |
1845 | LLVM_ABI bool supportsScalableVectors() const; |
1846 | |
1847 | /// \return true when scalable vectorization is preferred. |
1848 | LLVM_ABI bool enableScalableVectorization() const; |
1849 | |
1850 | /// \name Vector Predication Information |
1851 | /// @{ |
1852 | /// Whether the target supports the %evl parameter of VP intrinsics efficiently |
1853 | /// in hardware, for the given opcode and type/alignment. (see LLVM Language |
1854 | /// Reference - "Vector Predication Intrinsics"). |
1855 | /// Use of %evl is discouraged when that is not the case. |
1856 | LLVM_ABI bool hasActiveVectorLength(unsigned Opcode, Type *DataType, |
1857 | Align Alignment) const; |
1858 | |
1859 | /// Return true if sinking I's operands to the same basic block as I is |
1860 | /// profitable, e.g. because the operands can be folded into a target |
1861 | /// instruction during instruction selection. After calling the function |
1862 | /// \p Ops contains the Uses to sink ordered by dominance (dominating users |
1863 | /// come first). |
1864 | LLVM_ABI bool isProfitableToSinkOperands(Instruction *I, |
1865 | SmallVectorImpl<Use *> &Ops) const; |
1866 | |
1867 | /// Return true if it's significantly cheaper to shift a vector by a uniform |
1868 | /// scalar than by an amount which will vary across each lane. On x86 before |
1869 | /// AVX2 for example, there is a "psllw" instruction for the former case, but |
1870 | /// no simple instruction for a general "a << b" operation on vectors. |
1871 | /// This should also apply to lowering for vector funnel shifts (rotates). |
1872 | LLVM_ABI bool isVectorShiftByScalarCheap(Type *Ty) const; |
1873 | |
1874 | struct VPLegalization { |
1875 | enum VPTransform { |
1876 | // keep the predicating parameter |
1877 | Legal = 0, |
1878 | // where legal, discard the predicate parameter |
1879 | Discard = 1, |
1880 | // transform into something else that is also predicating |
1881 | Convert = 2 |
1882 | }; |
1883 | |
1884 | // How to transform the EVL parameter. |
1885 | // Legal: keep the EVL parameter as it is. |
1886 | // Discard: Ignore the EVL parameter where it is safe to do so. |
1887 | // Convert: Fold the EVL into the mask parameter. |
1888 | VPTransform EVLParamStrategy; |
1889 | |
1890 | // How to transform the operator. |
1891 | // Legal: The target supports this operator. |
1892 | // Convert: Convert this to a non-VP operation. |
1893 | // The 'Discard' strategy is invalid. |
1894 | VPTransform OpStrategy; |
1895 | |
1896 | bool shouldDoNothing() const { |
1897 | return (EVLParamStrategy == Legal) && (OpStrategy == Legal); |
1898 | } |
1899 | VPLegalization(VPTransform EVLParamStrategy, VPTransform OpStrategy) |
1900 | : EVLParamStrategy(EVLParamStrategy), OpStrategy(OpStrategy) {} |
1901 | }; |
1902 | |
1903 | /// \returns How the target needs this vector-predicated operation to be |
1904 | /// transformed. |
1905 | LLVM_ABI VPLegalization |
1906 | getVPLegalizationStrategy(const VPIntrinsic &PI) const; |
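| // Sketch, assuming `VPI` is a VPIntrinsic encountered by an expansion pass: |
| // |
| //   TTI::VPLegalization VPL = TTI.getVPLegalizationStrategy(VPI); |
| //   if (VPL.shouldDoNothing()) |
| //     return; // The target handles this VP op and its EVL natively. |
| //   if (VPL.EVLParamStrategy == TTI::VPLegalization::Convert) { |
| //     // Fold the EVL operand into the mask before lowering the operation. |
| //   } |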
1907 | /// @} |
1908 | |
1909 | /// \returns Whether a 32-bit branch instruction is available in Arm or Thumb |
1910 | /// state. |
1911 | /// |
1912 | /// Used by the LowerTypeTests pass, which constructs an IR inline assembler |
1913 | /// node containing a jump table in a format suitable for the target, so it |
1914 | /// needs to know what format of jump table it can legally use. |
1915 | /// |
1916 | /// For non-Arm targets, this function isn't used. It defaults to returning |
1917 | /// false, but it shouldn't matter what it returns anyway. |
1918 | LLVM_ABI bool hasArmWideBranch(bool Thumb) const; |
1919 | |
1920 | /// Returns a bitmask constructed from the target-features or fmv-features |
1921 | /// metadata of a function. |
1922 | LLVM_ABI uint64_t getFeatureMask(const Function &F) const; |
1923 | |
1924 | /// Returns true if this is an instance of a function with multiple versions. |
1925 | LLVM_ABI bool isMultiversionedFunction(const Function &F) const; |
1926 | |
1927 | /// \return The maximum number of function arguments the target supports. |
1928 | LLVM_ABI unsigned getMaxNumArgs() const; |
1929 | |
1930 | /// \return For an array of the given Size, return the alignment boundary to |
1931 | /// pad to. The default is no padding. |
1932 | LLVM_ABI unsigned getNumBytesToPadGlobalArray(unsigned Size, |
1933 | Type *ArrayType) const; |
1934 | |
1935 | /// @} |
1936 | |
1937 | /// Collect kernel launch bounds for \p F into \p LB. |
1938 | LLVM_ABI void collectKernelLaunchBounds( |
1939 | const Function &F, |
1940 | SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const; |
1941 | |
1942 | private: |
1943 | std::unique_ptr<const TargetTransformInfoImplBase> TTIImpl; |
1944 | }; |
1945 | |
1946 | /// Analysis pass providing the \c TargetTransformInfo. |
1947 | /// |
1948 | /// The core idea of the TargetIRAnalysis is to expose an interface through |
1949 | /// which LLVM targets can analyze and provide information about the middle |
1950 | /// end's target-independent IR. This supports use cases such as target-aware |
1951 | /// cost modeling of IR constructs. |
1952 | /// |
1953 | /// This is a function analysis because much of the cost modeling for targets |
1954 | /// is done in a subtarget specific way and LLVM supports compiling different |
1955 | /// functions targeting different subtargets in order to support runtime |
1956 | /// dispatch according to the observed subtarget. |
1957 | class TargetIRAnalysis : public AnalysisInfoMixin<TargetIRAnalysis> { |
1958 | public: |
1959 | typedef TargetTransformInfo Result; |
1960 | |
1961 | /// Default construct a target IR analysis. |
1962 | /// |
1963 | /// This will use the module's datalayout to construct a baseline |
1964 | /// conservative TTI result. |
1965 | LLVM_ABI TargetIRAnalysis(); |
1966 | |
1967 | /// Construct an IR analysis pass around a target-provided callback. |
1968 | /// |
1969 | /// The callback will be called with a particular function for which the TTI |
1970 | /// is needed and must return a TTI object for that function. |
1971 | LLVM_ABI |
1972 | TargetIRAnalysis(std::function<Result(const Function &)> TTICallback); |
1973 | |
1974 | // Value semantics. We spell out the constructors for MSVC. |
1975 | TargetIRAnalysis(const TargetIRAnalysis &Arg) |
1976 | : TTICallback(Arg.TTICallback) {} |
1977 | TargetIRAnalysis(TargetIRAnalysis &&Arg) |
1978 | : TTICallback(std::move(Arg.TTICallback)) {} |
1979 | TargetIRAnalysis &operator=(const TargetIRAnalysis &RHS) { |
1980 | TTICallback = RHS.TTICallback; |
1981 | return *this; |
1982 | } |
1983 | TargetIRAnalysis &operator=(TargetIRAnalysis &&RHS) { |
1984 | TTICallback = std::move(RHS.TTICallback); |
1985 | return *this; |
1986 | } |
1987 | |
1988 | LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &); |
1989 | |
1990 | private: |
1991 | friend AnalysisInfoMixin<TargetIRAnalysis>; |
1992 | LLVM_ABI static AnalysisKey Key; |
1993 | |
1994 | /// The callback used to produce a result. |
1995 | /// |
1996 | /// We use a completely opaque callback so that targets can provide whatever |
1997 | /// mechanism they desire for constructing the TTI for a given function. |
1998 | /// |
1999 | /// FIXME: Should we really use std::function? It's relatively inefficient. |
2000 | /// It might be possible to arrange for even stateful callbacks to outlive |
2001 | /// the analysis and thus use a function_ref which would be lighter weight. |
2002 | /// This may also be less error prone as the callback is likely to reference |
2003 | /// the external TargetMachine, and that reference needs to never dangle. |
2004 | std::function<Result(const Function &)> TTICallback; |
2005 | |
2006 | /// Helper function used as the callback in the default constructor. |
2007 | static Result getDefaultTTI(const Function &F); |
2008 | }; |
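| // Sketch: obtaining TargetTransformInfo inside a new-pass-manager function |
| // pass; `MyPass` is a hypothetical pass used only for illustration: |
| // |
| //   PreservedAnalyses MyPass::run(Function &F, FunctionAnalysisManager &AM) { |
| //     const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); |
| //     if (TTI.haveFastSqrt(Type::getDoubleTy(F.getContext()))) { |
| //       // ... take a sqrt-based fast path ... |
| //     } |
| //     return PreservedAnalyses::all(); |
| //   } |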
2009 | |
2010 | /// Wrapper pass for TargetTransformInfo. |
2011 | /// |
2012 | /// This pass can be constructed from a TTI object which it stores internally |
2013 | /// and is queried by passes. |
2014 | class LLVM_ABI TargetTransformInfoWrapperPass : public ImmutablePass { |
2015 | TargetIRAnalysis TIRA; |
2016 | std::optional<TargetTransformInfo> TTI; |
2017 | |
2018 | virtual void anchor(); |
2019 | |
2020 | public: |
2021 | static char ID; |
2022 | |
2023 | /// We must provide a default constructor for the pass but it should |
2024 | /// never be used. |
2025 | /// |
2026 | /// Use the constructor below or call one of the creation routines. |
2027 | TargetTransformInfoWrapperPass(); |
2028 | |
2029 | explicit TargetTransformInfoWrapperPass(TargetIRAnalysis TIRA); |
2030 | |
2031 | TargetTransformInfo &getTTI(const Function &F); |
2032 | }; |
2033 | |
2034 | /// Create an analysis pass wrapper around a TTI object. |
2035 | /// |
2036 | /// This analysis pass just holds the TTI instance and makes it available to |
2037 | /// clients. |
2038 | LLVM_ABI ImmutablePass * |
2039 | createTargetTransformInfoWrapperPass(TargetIRAnalysis TIRA); |
2040 | |
2041 | } // namespace llvm |
2042 | |
2043 | #endif |
2044 |