//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

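// Try to replace each VPInstruction wrapping an underlying IR instruction in
// the vector loop region with a corresponding widened recipe (widened
// induction, load/store, GEP, intrinsic call, select, cast or generic widen
// recipe). Returns false if a call cannot be mapped to a vector intrinsic.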
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside the region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(
            Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              VPIRMetadata(*Store), Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
              CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}

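/// Sink recipes that only produce scalar values into the predicated ("then")
/// blocks of replicate regions, so they are only computed when the predicate
/// is true. Candidates are seeded from the operands of recipes already inside
/// replicate blocks and are sunk transitively; a candidate with users outside
/// the sink destination is duplicated if all such users only need its first
/// lane. Returns true if any recipe was moved.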
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isSingleScalar())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo,
    // or all users outside of SinkTo must only use the first lane (i.e., be
    // uniform-after-vectorization). In the latter case, we need to duplicate
    // SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = cast<VPRecipeBase>(U);
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes and
      // VPScalarIVStepsRecipes for now.
      return NeedsDuplicating &&
             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with a matching mask, to process up front. This is to
  // avoid iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

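/// Build a triangular if-then replicate region for the predicated replicate
/// recipe \p PredRecipe: an entry block with a VPBranchOnMaskRecipe on the
/// recipe's mask, an "if" block containing an unmasked clone of the recipe,
/// and a "continue" block with a VPPredInstPHIRecipe merging the result if the
/// original recipe had users. The original recipe is erased and the new region
/// is returned.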
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() &&
         "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = Plan.createVPRegionBlock(
      Entry, Exiting, RegionName, /*IsReplicator=*/true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

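/// For each predicated VPReplicateRecipe in \p Plan, split its block at the
/// recipe, build a replicate region for it via createReplicateRegion, and
/// insert the region on the resulting edge, updating the parent region's
/// exiting block if needed.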
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

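// Wrap predicated VPReplicateRecipes in if-then replicate regions and then
// iterate scalar-operand sinking and region/block merging until a fixed point
// is reached.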
void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

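// Erase dead recipes (per isDeadRecipe) across the whole plan, visiting blocks
// and recipes in reverse so chains of dead recipes are removed in one sweep.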
void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

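/// Create a VPScalarIVStepsRecipe computing the scalar steps of an induction
/// with kind \p Kind and opcode \p InductionOpcode (or \p FPBinOp for FP
/// inductions), start value \p StartV and step \p Step, derived from the
/// canonical IV. The derived IV and step are truncated as needed, using
/// \p TruncI's type if provided.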
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}

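/// Collect the users of \p V transitively, following the values each user
/// defines, but without recursing through header phi recipes.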
static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(llvm::from_range, V->users());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert_range(V->users());
  }
  return Users.takeVector();
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// from how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes for which lanes other than the first may be used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after
/// the increment. If so return the header IV (before the increment), otherwise
/// return null.
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for
    // exit users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    using namespace VPlanPatternMatch;
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
                                                     m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if
      // Step == -1 * IVStep.
      VPValue *Step;
      if (!match(VPV,
                 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
          !Step->isLiveIn() || !IVStep->isLiveIn())
        return false;
      auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
      auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
      return StepCI && IVStepCI &&
             StepCI->getValue() == (-1 * IVStepCI->getValue());
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming, *Mask;
  if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
                     m_VPValue(Incoming),
                     m_VPInstruction<VPInstruction::FirstActiveLane>(
                         m_VPValue(Mask)))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPValue *EndValue = Plan.getCanonicalIV();
  auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
  if (CanonicalIVType != FirstActiveLaneType) {
    Instruction::CastOps CastOp =
        CanonicalIVType->getScalarSizeInBits() <
                FirstActiveLaneType->getScalarSizeInBits()
            ? Instruction::Trunc
            : Instruction::ZExt;
    FirstActiveLane =
        B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
  }
  EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // add the step.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}

/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
static VPValue *
optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
                               VPBlockBase *PredVPBB, VPValue *Op,
                               DenseMap<VPValue *, VPValue *> &EndValues) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming;
  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>(
                     m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "end value must have been pre-computed");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue.
  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
  if (ScalarTy->isPointerTy()) {
    auto *Zero = Plan.getOrAddLiveIn(
        ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
    return B.createPtrAdd(EndValue,
                          B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
                          "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}

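// For each exit-block phi, try to replace the extracted last element (latch
// exits) or first-active-lane element (early exits) of a wide induction with a
// directly computed end value, using the two helpers above depending on the
// predecessor block.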
void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(
              Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
        else
          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx));
        if (Escape)
          ExitIRI->setOperand(Idx, Escape);
      }
    }
  }
}

/// Remove redundant VPExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

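/// Erase the recipe defining \p V if it is dead, and transitively erase
/// operand-defining recipes that become dead as a result.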
static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
/// foldable live-ins.
static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
                               ArrayRef<VPValue *> Operands,
                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!Op->isLiveIn() || !Op->getLiveInIRValue())
      return nullptr;
    Ops.push_back(Op->getLiveInIRValue());
  }

  InstSimplifyFolder Folder(DL);
  if (Instruction::isBinaryOp(Opcode))
    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
                            Ops[1]);
  if (Instruction::isCast(Opcode))
    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
                           TypeInfo.inferScalarType(R.getVPSingleValue()));
  switch (Opcode) {
  case VPInstruction::LogicalAnd:
    return Folder.FoldSelect(Ops[0], Ops[1],
                             ConstantInt::getNullValue(Ops[1]->getType()));
  case VPInstruction::Not:
    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                            Constant::getAllOnesValue(Ops[0]->getType()));
  case Instruction::Select:
    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
  case Instruction::ICmp:
  case Instruction::FCmp:
    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                          Ops[1]);
  case Instruction::GetElementPtr: {
    auto &RFlags = cast<VPRecipeWithIRFlags>(R);
    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
                          RFlags.getGEPNoWrapFlags());
  }
  case VPInstruction::PtrAdd:
    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
                          Ops[1],
                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
  case Instruction::InsertElement:
    return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]);
  case Instruction::ExtractElement:
    return Folder.FoldExtractElement(Ops[0], Ops[1]);
  }
  return nullptr;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;
  VPlan *Plan = R.getParent()->getPlan();

  auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
  if (!Def)
    return;

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  if (TypeSwitch<VPRecipeBase *, bool>(&R)
          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
                VPReplicateRecipe>([&](auto *I) {
            const DataLayout &DL =
                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
                                        TypeInfo);
            if (V)
              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
            return V;
          })
          .Default([](auto *) { return false; }))
    return;

  // Fold PredPHI LiveIn -> LiveIn.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (Op->isLiveIn())
      PredPHI->replaceAllUsesWith(Op);
  }

  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = TypeInfo.inferScalarType(Def);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          VPC->setUnderlyingValue(UnderlyingExt);
        }
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    Def->eraseFromParent();
    return;
  }

  // OR x, all-ones -> all-ones.
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
    Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
                                                    : Def->getOperand(0));
    Def->eraseFromParent();
    return;
  }

  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
    return Def->replaceAllUsesWith(X);

  if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_Not(m_VPValue(A)))) {
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
      auto *WideCmp = cast<VPWidenRecipe>(A);
      if (WideCmp->getOpcode() == Instruction::ICmp ||
          WideCmp->getOpcode() == Instruction::FCmp) {
        WideCmp->setPredicate(
            CmpInst::getInversePredicate(WideCmp->getPredicate()));
        Def->replaceAllUsesWith(WideCmp);
        // If WideCmp doesn't have a debug location, use the one from the
        // negation, to preserve the location.
        if (!WideCmp->getDebugLoc() && R.getDebugLoc())
          WideCmp->setDebugLoc(R.getDebugLoc());
      }
    }
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(Def,
             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
       match(Def,
             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
      TypeInfo.inferScalarType(Def->getOperand(1)) ==
          TypeInfo.inferScalarType(Def))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
                     m_VPValue(X), m_SpecificInt(1)))) {
    Type *WideStepTy = TypeInfo.inferScalarType(Def);
    if (TypeInfo.inferScalarType(X) != WideStepTy)
      X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // VPScalarIVSteps for part 0 can be replaced by their start value, if only
  // the first lane is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }
}

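// Run simplifyRecipe on every recipe in the plan, traversing all blocks in
// reverse post order.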
void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(&CanonicalIVTy);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

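/// Replace wide and replicating recipes that are known to produce a single
/// scalar value, and whose results are only used as scalars, with
/// single-scalar VPReplicateRecipes.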
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR && !isa<VPWidenRecipe>(&R))
        continue;
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
      // Skip recipes that aren't single scalars or don't have only their
      // scalar results used. In the latter case, we would introduce extra
      // broadcasts.
      if (!vputils::isSingleScalar(RepOrWidenR) ||
          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
            return !U->usesScalars(RepOrWidenR);
          }))
        continue;

      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
                                          RepOrWidenR->operands(),
                                          true /*IsSingleScalar*/);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
    }
  }
}

/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
/// to make sure the masks are simplified.
static void simplifyBlends(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *Blend = dyn_cast<VPBlendRecipe>(&R);
      if (!Blend)
        continue;

      // Try to remove redundant blend recipes.
      SmallPtrSet<VPValue *, 4> UniqueValues;
      if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
        UniqueValues.insert(Blend->getIncomingValue(0));
      for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
        if (!match(Blend->getMask(I), m_False()))
          UniqueValues.insert(Blend->getIncomingValue(I));

      if (UniqueValues.size() == 1) {
        Blend->replaceAllUsesWith(*UniqueValues.begin());
        Blend->eraseFromParent();
        continue;
      }

      if (Blend->isNormalized())
        continue;

      // Normalize the blend so its first incoming value is used as the initial
      // value, with the others blended into it.

      unsigned StartIndex = 0;
      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        // If a value's mask is used only by the blend, then it can be removed
        // as dead code.
        // TODO: Find the most expensive mask that can be removed this way, or
        // a mask that's used by multiple blends where it can be removed from
        // them all.
        VPValue *Mask = Blend->getMask(I);
        if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
          StartIndex = I;
          break;
        }
      }

      SmallVector<VPValue *, 4> OperandsWithMask;
      OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));

      for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
        if (I == StartIndex)
          continue;
        OperandsWithMask.push_back(Blend->getIncomingValue(I));
        OperandsWithMask.push_back(Blend->getMask(I));
      }

      auto *NewBlend = new VPBlendRecipe(
          cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask);
      NewBlend->insertBefore(&R);

      VPValue *DeadMask = Blend->getMask(StartIndex);
      Blend->replaceAllUsesWith(NewBlend);
      Blend->eraseFromParent();
      recursivelyDeleteDeadRecipes(DeadMask);

      // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
      VPValue *NewMask;
      if (NewBlend->getNumOperands() == 3 &&
          match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
        VPValue *Inc0 = NewBlend->getOperand(0);
        VPValue *Inc1 = NewBlend->getOperand(1);
        VPValue *OldMask = NewBlend->getOperand(2);
        NewBlend->setOperand(0, Inc1);
        NewBlend->setOperand(1, Inc0);
        NewBlend->setOperand(2, NewMask);
        if (OldMask->getNumUsers() == 0)
          cast<VPInstruction>(OldMask)->eraseFromParent();
      }
    }
  }
}

/// Optimize the width of vector induction variables in \p Plan based on a
/// known constant trip count, \p BestVF and \p BestUF.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
                                                     ElementCount BestVF,
                                                     unsigned BestUF) {
  // Only proceed if we have not completely removed the vector region.
  if (!Plan.getVectorLoopRegion())
    return false;

  if (!Plan.getTripCount()->isLiveIn())
    return false;
  auto *TC = dyn_cast_if_present<ConstantInt>(
      Plan.getTripCount()->getUnderlyingValue());
  if (!TC || !BestVF.isFixed())
    return false;

  // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
  // and UF. Returns at least 8.
  auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
    APInt AlignedTC =
        Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
                                       APInt::Rounding::UP);
    APInt MaxVal = AlignedTC - 1;
    return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
  };
  unsigned NewBitWidth =
      ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);

  LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
  auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);

  bool MadeChange = false;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    // Currently only handle canonical IVs, as it is trivial to replace the
    // start and stop values, and we currently only perform the optimization
    // when the IV has a single use.
    if (!WideIV || !WideIV->isCanonical() ||
        WideIV->hasMoreThanOneUniqueUser() ||
        NewIVTy == WideIV->getScalarType())
      continue;

    // Currently only handle cases where the single user is a header-mask
    // comparison with the backedge-taken-count.
    using namespace VPlanPatternMatch;
    if (!match(
            *WideIV->user_begin(),
            m_Binary<Instruction::ICmp>(
                m_Specific(WideIV),
                m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
      continue;

    // Update IV operands and comparison bound to use new narrower type.
    auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
    WideIV->setStartValue(NewStart);
    auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
    WideIV->setStepValue(NewStep);
    // TODO: Remove once VPWidenIntOrFpInductionRecipe is fully expanded.
    VPInstructionWithType *OldStepVector = WideIV->getStepVector();
    assert(OldStepVector->getNumUsers() == 1 &&
           "step vector should only be used by single "
           "VPWidenIntOrFpInductionRecipe");
    auto *NewStepVector =
        new VPInstructionWithType(VPInstruction::StepVector, {}, NewIVTy, {},
                                  OldStepVector->getDebugLoc());
    NewStepVector->insertAfter(OldStepVector->getDefiningRecipe());
    OldStepVector->replaceAllUsesWith(NewStepVector);
    OldStepVector->eraseFromParent();

    auto *NewBTC = new VPWidenCastRecipe(
        Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
    Plan.getVectorPreheader()->appendRecipe(NewBTC);
    auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
    Cmp->setOperand(1, NewBTC);

    MadeChange = true;
  }

  return MadeChange;
}

1378/// Return true if \p Cond is known to be true for given \p BestVF and \p
1379/// BestUF.
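///
/// For example (illustrative sketch): for a condition of the form
///   icmp eq (CanonicalIV + VFxUF), vector-trip-count
/// with a trip count whose SCEV is the constant 16, BestVF = 8 and BestUF = 2,
/// ScalarEvolution can prove 16 == 8 * 2, so the condition is known true.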
1380static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1381 ElementCount BestVF, unsigned BestUF,
1382 ScalarEvolution &SE) {
1383 using namespace llvm::VPlanPatternMatch;
1384 if (match(V: Cond, P: m_Binary<Instruction::Or>(Op0: m_VPValue(), Op1: m_VPValue())))
1385 return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF,
1386 &SE](VPValue *C) {
1387 return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, SE);
1388 });
1389
1390 auto *CanIV = Plan.getCanonicalIV();
1391 if (!match(V: Cond, P: m_Binary<Instruction::ICmp>(
1392 Op0: m_Specific(VPV: CanIV->getBackedgeValue()),
1393 Op1: m_Specific(VPV: &Plan.getVectorTripCount()))) ||
1394 cast<VPRecipeWithIRFlags>(Val: Cond->getDefiningRecipe())->getPredicate() !=
1395 CmpInst::ICMP_EQ)
1396 return false;
1397
1398 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1399 // count is not conveniently available as SCEV so far, so we compare directly
1400 // against the original trip count. This is stricter than necessary, as we
1401 // will only return true if the trip count == vector trip count.
1402 // TODO: Use SCEV for vector trip count once available, to cover cases where
1403 // vector trip count == UF * VF, but original trip count != UF * VF.
1404 const SCEV *TripCount =
1405 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1406 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1407 "Trip count SCEV must be computable");
1408 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1409 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1410 return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: TripCount, RHS: C);
1411}
1412
1413/// Try to simplify the branch condition of \p Plan. This may restrict the
1414/// resulting plan to \p BestVF and \p BestUF.
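///
/// For example (illustrative sketch): if the latch terminator is
///   branch-on-count %canonical-iv.next, %vector-trip-count
/// and the trip count is known to be <= BestVF * BestUF, the vector loop body
/// runs exactly once, so the loop region is either removed entirely or its
/// latch terminator is replaced by branch-on-cond true.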
1415static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1416 unsigned BestUF,
1417 PredicatedScalarEvolution &PSE) {
1418 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1419 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1420 auto *Term = &ExitingVPBB->back();
1421 VPValue *Cond;
1422 ScalarEvolution &SE = *PSE.getSE();
1423 using namespace llvm::VPlanPatternMatch;
1424 if (match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) ||
1425 match(V: Term, P: m_BranchOnCond(
1426 Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) {
1427 // Try to simplify the branch condition if TC <= VF * UF when the latch
1428 // terminator is BranchOnCount or BranchOnCond where the input is
1429 // Not(ActiveLaneMask).
1430 const SCEV *TripCount =
1431 vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE);
1432 assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1433 "Trip count SCEV must be computable");
1434 ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF);
1435 const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements);
1436 if (TripCount->isZero() ||
1437 !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C))
1438 return false;
1439 } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) {
1440 // For BranchOnCond, check if we can prove the condition to be true using VF
1441 // and UF.
1442 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1443 return false;
1444 } else {
1445 return false;
1446 }
1447
1448 // The vector loop region only executes once. If possible, completely remove
1449 // the region, otherwise replace the terminator controlling the latch with
1450 // (BranchOnCond true).
1451 auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry());
1452 auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
1453 if (all_of(
1454 Range: Header->phis(),
1455 P: IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
1456 for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) {
1457 auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(Val: &HeaderR);
1458 HeaderPhiR->replaceAllUsesWith(New: HeaderPhiR->getStartValue());
1459 HeaderPhiR->eraseFromParent();
1460 }
1461
1462 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
1463 VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
1464 VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion);
1465 VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit);
1466
1467 for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry()))
1468 B->setParent(nullptr);
1469
1470 VPBlockUtils::connectBlocks(From: Preheader, To: Header);
1471 VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit);
1472 VPlanTransforms::simplifyRecipes(Plan, CanonicalIVTy&: *CanIVTy);
1473 } else {
1474 // The vector region contains header phis for which we cannot remove the
1475 // loop region yet.
1476 LLVMContext &Ctx = SE.getContext();
1477 auto *BOC = new VPInstruction(
1478 VPInstruction::BranchOnCond,
1479 {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}, Term->getDebugLoc());
1480 ExitingVPBB->appendRecipe(Recipe: BOC);
1481 }
1482
1483 Term->eraseFromParent();
1484
1485 return true;
1486}
1487
1488void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1489 unsigned BestUF,
1490 PredicatedScalarEvolution &PSE) {
1491 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1492 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1493
1494 bool MadeChange =
1495 simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1496 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1497
1498 if (MadeChange) {
1499 Plan.setVF(BestVF);
1500 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1501 }
1502 // TODO: Further simplifications are possible
1503 // 1. Replace inductions with constants.
1504 // 2. Replace vector loop region with VPBasicBlock.
1505}
1506
1507/// Sink users of \p FOR after the recipe defining the previous value \p
1508/// Previous of the recurrence. \returns true if all users of \p FOR could be
1509/// re-arranged as needed or false if it is not possible.
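///
/// For example (illustrative sketch, recipe names made up): given a recurrence
/// phi %for with previous value %prev, a user such as
///   %use = add %for, 1
/// that currently appears before %prev is moved to just after %prev, so the
/// splice of %for and %prev can later be emitted after both.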
1510static bool
1511sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
1512 VPRecipeBase *Previous,
1513 VPDominatorTree &VPDT) {
1514 // Collect recipes that need sinking.
1515 SmallVector<VPRecipeBase *> WorkList;
1516 SmallPtrSet<VPRecipeBase *, 8> Seen;
1517 Seen.insert(Ptr: Previous);
1518 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi;
    // if it did, FOR would not be a fixed-order recurrence.
1521 if (SinkCandidate == Previous)
1522 return false;
1523
1524 if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) ||
1525 !Seen.insert(Ptr: SinkCandidate).second ||
1526 VPDT.properlyDominates(A: Previous, B: SinkCandidate))
1527 return true;
1528
1529 if (SinkCandidate->mayHaveSideEffects())
1530 return false;
1531
1532 WorkList.push_back(Elt: SinkCandidate);
1533 return true;
1534 };
1535
1536 // Recursively sink users of FOR after Previous.
1537 WorkList.push_back(Elt: FOR);
1538 for (unsigned I = 0; I != WorkList.size(); ++I) {
1539 VPRecipeBase *Current = WorkList[I];
1540 assert(Current->getNumDefinedValues() == 1 &&
1541 "only recipes with a single defined value expected");
1542
1543 for (VPUser *User : Current->getVPSingleValue()->users()) {
1544 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User)))
1545 return false;
1546 }
1547 }
1548
1549 // Keep recipes to sink ordered by dominance so earlier instructions are
1550 // processed first.
1551 sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1552 return VPDT.properlyDominates(A, B);
1553 });
1554
1555 for (VPRecipeBase *SinkCandidate : WorkList) {
1556 if (SinkCandidate == FOR)
1557 continue;
1558
1559 SinkCandidate->moveAfter(MovePos: Previous);
1560 Previous = SinkCandidate;
1561 }
1562 return true;
1563}
1564
1565/// Try to hoist \p Previous and its operands before all users of \p FOR.
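///
/// For example (illustrative sketch, recipe names made up): if the previous
/// value is
///   %prev = mul %a, %b
/// and is currently defined after the first user of \p FOR, then %prev (and,
/// if needed, the defining recipes of %a and %b) is moved up to just before
/// that first user, provided the moved recipes have no side effects.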
1566static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
1567 VPRecipeBase *Previous,
1568 VPDominatorTree &VPDT) {
1569 if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory())
1570 return false;
1571
1572 // Collect recipes that need hoisting.
1573 SmallVector<VPRecipeBase *> HoistCandidates;
1574 SmallPtrSet<VPRecipeBase *, 8> Visited;
1575 VPRecipeBase *HoistPoint = nullptr;
1576 // Find the closest hoist point by looking at all users of FOR and selecting
1577 // the recipe dominating all other users.
1578 for (VPUser *U : FOR->users()) {
1579 auto *R = cast<VPRecipeBase>(Val: U);
1580 if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint))
1581 HoistPoint = R;
1582 }
1583 assert(all_of(FOR->users(),
1584 [&VPDT, HoistPoint](VPUser *U) {
1585 auto *R = cast<VPRecipeBase>(U);
1586 return HoistPoint == R ||
1587 VPDT.properlyDominates(HoistPoint, R);
1588 }) &&
1589 "HoistPoint must dominate all users of FOR");
1590
1591 auto NeedsHoisting = [HoistPoint, &VPDT,
1592 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
1593 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
1594 if (!HoistCandidate)
1595 return nullptr;
1596 VPRegionBlock *EnclosingLoopRegion =
1597 HoistCandidate->getParent()->getEnclosingLoopRegion();
1598 assert((!HoistCandidate->getParent()->getParent() ||
1599 HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) &&
1600 "CFG in VPlan should still be flat, without replicate regions");
1601 // Hoist candidate was already visited, no need to hoist.
1602 if (!Visited.insert(Ptr: HoistCandidate).second)
1603 return nullptr;
1604
    // The candidate is defined outside the loop region or is a header phi; it
    // already dominates the FOR users without hoisting.
1607 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate))
1608 return nullptr;
1609
1610 // If we reached a recipe that dominates HoistPoint, we don't need to
1611 // hoist the recipe.
1612 if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint))
1613 return nullptr;
1614 return HoistCandidate;
1615 };
1616 auto CanHoist = [&](VPRecipeBase *HoistCandidate) {
1617 // Avoid hoisting candidates with side-effects, as we do not yet analyze
1618 // associated dependencies.
1619 return !HoistCandidate->mayHaveSideEffects();
1620 };
1621
1622 if (!NeedsHoisting(Previous->getVPSingleValue()))
1623 return true;
1624
1625 // Recursively try to hoist Previous and its operands before all users of FOR.
1626 HoistCandidates.push_back(Elt: Previous);
1627
1628 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
1629 VPRecipeBase *Current = HoistCandidates[I];
1630 assert(Current->getNumDefinedValues() == 1 &&
1631 "only recipes with a single defined value expected");
1632 if (!CanHoist(Current))
1633 return false;
1634
1635 for (VPValue *Op : Current->operands()) {
1636 // If we reach FOR, it means the original Previous depends on some other
1637 // recurrence that in turn depends on FOR. If that is the case, we would
1638 // also need to hoist recipes involving the other FOR, which may break
1639 // dependencies.
1640 if (Op == FOR)
1641 return false;
1642
1643 if (auto *R = NeedsHoisting(Op))
1644 HoistCandidates.push_back(Elt: R);
1645 }
1646 }
1647
1648 // Order recipes to hoist by dominance so earlier instructions are processed
1649 // first.
1650 sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
1651 return VPDT.properlyDominates(A, B);
1652 });
1653
1654 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
1655 HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(),
1656 I: HoistPoint->getIterator());
1657 }
1658
1659 return true;
1660}
1661
1662bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
1663 VPBuilder &LoopBuilder) {
1664 VPDominatorTree VPDT;
1665 VPDT.recalculate(Func&: Plan);
1666
1667 SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
1668 for (VPRecipeBase &R :
1669 Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
1670 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
1671 RecurrencePhis.push_back(Elt: FOR);
1672
1673 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
1674 SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
1675 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
1676 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
1677 // to terminate.
1678 while (auto *PrevPhi =
1679 dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) {
1680 assert(PrevPhi->getParent() == FOR->getParent());
1681 assert(SeenPhis.insert(PrevPhi).second);
1682 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
1683 }
1684
1685 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
1686 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
1687 return false;
1688
1689 // Introduce a recipe to combine the incoming and previous values of a
1690 // fixed-order recurrence.
1691 VPBasicBlock *InsertBlock = Previous->getParent();
1692 if (isa<VPHeaderPHIRecipe>(Val: Previous))
1693 LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi());
1694 else
1695 LoopBuilder.setInsertPoint(TheBB: InsertBlock,
1696 IP: std::next(x: Previous->getIterator()));
1697
1698 auto *RecurSplice =
1699 LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice,
1700 Operands: {FOR, FOR->getBackedgeValue()});
1701
1702 FOR->replaceAllUsesWith(New: RecurSplice);
1703 // Set the first operand of RecurSplice to FOR again, after replacing
1704 // all users.
1705 RecurSplice->setOperand(I: 0, New: FOR);
1706 }
1707 return true;
1708}
1709
1710void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
1711 for (VPRecipeBase &R :
1712 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
1713 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
1714 if (!PhiR)
1715 continue;
1716 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
1717 RecurKind RK = RdxDesc.getRecurrenceKind();
1718 if (RK != RecurKind::Add && RK != RecurKind::Mul)
1719 continue;
1720
1721 for (VPUser *U : collectUsersRecursively(V: PhiR))
1722 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) {
1723 RecWithFlags->dropPoisonGeneratingFlags();
1724 }
1725 }
1726}
1727
1728/// Move loop-invariant recipes out of the vector loop region in \p Plan.
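///
/// For example (illustrative sketch): a recipe such as
///   %inv = mul %live-in.a, %live-in.b
/// whose operands are all defined outside the loop region and which does not
/// read or write memory is moved from the loop body into the vector
/// preheader.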
1729static void licm(VPlan &Plan) {
1730 VPBasicBlock *Preheader = Plan.getVectorPreheader();
1731
1732 // Return true if we do not know how to (mechanically) hoist a given recipe
1733 // out of a loop region. Does not address legality concerns such as aliasing
1734 // or speculation safety.
1735 auto CannotHoistRecipe = [](VPRecipeBase &R) {
1736 // Allocas cannot be hoisted.
1737 auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R);
1738 return RepR && RepR->getOpcode() == Instruction::Alloca;
1739 };
1740
  // Hoist any loop-invariant recipes from the vector loop region to the
  // preheader. Perform a shallow traversal of the vector loop region, to
  // exclude recipes in replicate regions.
1744 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
1745 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1746 Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) {
1747 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1748 if (CannotHoistRecipe(R))
1749 continue;
1750 // TODO: Relax checks in the future, e.g. we could also hoist reads, if
1751 // their memory location is not modified in the vector loop.
1752 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1753 any_of(Range: R.operands(), P: [](VPValue *Op) {
1754 return !Op->isDefinedOutsideLoopRegions();
1755 }))
1756 continue;
1757 R.moveBefore(BB&: *Preheader, I: Preheader->end());
1758 }
1759 }
1760}
1761
1762void VPlanTransforms::truncateToMinimalBitwidths(
1763 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
1768 DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
1769 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
1770 VPTypeAnalysis TypeInfo(CanonicalIVType);
1771 VPBasicBlock *PH = Plan.getVectorPreheader();
1772 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1773 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
1774 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
1775 if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
1776 VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(
1777 Val: &R))
1778 continue;
1779
1780 VPValue *ResultVPV = R.getVPSingleValue();
1781 auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue());
1782 unsigned NewResSizeInBits = MinBWs.lookup(Key: UI);
1783 if (!NewResSizeInBits)
1784 continue;
1785
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here. Also skip casts which do not need to be handled
      // explicitly here, as redundant casts will be removed during recipe
      // simplification.
1790 if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R))
1791 continue;
1792
1793 Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV);
1794 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
1795 assert(OldResTy->isIntegerTy() && "only integer types supported");
1796 (void)OldResSizeInBits;
1797
1798 LLVMContext &Ctx = CanonicalIVType->getContext();
1799 auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits);
1800
1801 // Any wrapping introduced by shrinking this operation shouldn't be
1802 // considered undefined behavior. So, we can't unconditionally copy
1803 // arithmetic wrapping flags to VPW.
1804 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R))
1805 VPW->dropPoisonGeneratingFlags();
1806
1807 using namespace llvm::VPlanPatternMatch;
1808 if (OldResSizeInBits != NewResSizeInBits &&
1809 !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) {
1810 // Extend result to original width.
1811 auto *Ext =
1812 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
1813 Ext->insertAfter(InsertPos: &R);
1814 ResultVPV->replaceAllUsesWith(New: Ext);
1815 Ext->setOperand(I: 0, New: ResultVPV);
1816 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
1817 } else {
1818 assert(
1819 match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
1820 "Only ICmps should not need extending the result.");
1821 }
1822
1823 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
1824 if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R))
1825 continue;
1826
1827 // Shrink operands by introducing truncates as needed.
1828 unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0;
1829 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
1830 auto *Op = R.getOperand(N: Idx);
1831 unsigned OpSizeInBits =
1832 TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits();
1833 if (OpSizeInBits == NewResSizeInBits)
1834 continue;
1835 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
1836 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op);
1837 VPWidenCastRecipe *NewOp =
1838 IterIsEmpty
1839 ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
1840 : ProcessedIter->second;
1841 R.setOperand(I: Idx, New: NewOp);
1842 if (!IterIsEmpty)
1843 continue;
1844 ProcessedIter->second = NewOp;
1845 if (!Op->isLiveIn()) {
1846 NewOp->insertBefore(InsertPos: &R);
1847 } else {
1848 PH->appendRecipe(Recipe: NewOp);
1849 }
1850 }
1851
1852 }
1853 }
1854}
1855
/// Remove BranchOnCond recipes with known true or false conditions, together
/// with the dead edges to their successors.
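///
/// For example (illustrative sketch): a block ending in branch-on-cond true
/// always takes its first successor, so the edge to the second successor is
/// removed, the corresponding incoming values are dropped from that
/// successor's phis, and the branch itself is erased.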
1858static void removeBranchOnConst(VPlan &Plan) {
1859 using namespace llvm::VPlanPatternMatch;
1860 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1861 Range: vp_depth_first_shallow(G: Plan.getEntry()))) {
1862 VPValue *Cond;
1863 if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() ||
1864 !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond))))
1865 continue;
1866
1867 unsigned RemovedIdx;
1868 if (match(V: Cond, P: m_True()))
1869 RemovedIdx = 1;
1870 else if (match(V: Cond, P: m_False()))
1871 RemovedIdx = 0;
1872 else
1873 continue;
1874
1875 VPBasicBlock *RemovedSucc =
1876 cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]);
1877 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
1878 "There must be a single edge between VPBB and its successor");
    // Values coming from VPBB into the phi recipes of RemovedSucc are removed
    // from those recipes.
1881 for (VPRecipeBase &R : RemovedSucc->phis()) {
1882 auto *Phi = cast<VPPhiAccessors>(Val: &R);
1883 assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) &&
1884 "VPIRPhis must have a single predecessor");
1885 Phi->removeIncomingValueFor(IncomingBlock: VPBB);
1886 }
1887 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
1888 // automatically on VPlan destruction if it becomes unreachable.
1889 VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc);
1890 VPBB->back().eraseFromParent();
1891 }
1892}
1893
1894void VPlanTransforms::optimize(VPlan &Plan) {
1895 runPass(Fn: removeRedundantCanonicalIVs, Plan);
1896 runPass(Fn: removeRedundantInductionCasts, Plan);
1897
1898 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1899 runPass(Fn: simplifyBlends, Plan);
1900 runPass(Fn: removeDeadRecipes, Plan);
1901 runPass(Fn: narrowToSingleScalarRecipes, Plan);
1902 runPass(Fn: legalizeAndOptimizeInductions, Plan);
1903 runPass(Fn: removeRedundantExpandSCEVRecipes, Plan);
1904 runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType());
1905 runPass(Fn: removeBranchOnConst, Plan);
1906 runPass(Fn: removeDeadRecipes, Plan);
1907
1908 runPass(Fn: createAndOptimizeReplicateRegions, Plan);
1909 runPass(Transform: mergeBlocksIntoPredecessors, Plan);
1910 runPass(Fn: licm, Plan);
1911}
1912
1913// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
1914// the loop terminator with a branch-on-cond recipe with the negated
1915// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced; all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
1919// VPActiveLaneMaskPHIRecipe.
1920//
1921// The function uses the following definitions:
1922//
1923// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
1924// calculate-trip-count-minus-VF (original TC) : original TC
1925// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
1926// CanonicalIVPhi : CanonicalIVIncrement
1927// %StartV is the canonical induction start value.
1928//
1929// The function adds the following recipes:
1930//
1931// vector.ph:
1932// %TripCount = calculate-trip-count-minus-VF (original TC)
1933// [if DataWithControlFlowWithoutRuntimeCheck]
1934// %EntryInc = canonical-iv-increment-for-part %StartV
1935// %EntryALM = active-lane-mask %EntryInc, %TripCount
1936//
1937// vector.body:
1938// ...
1939// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
1940// ...
1941// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
1942// %ALM = active-lane-mask %InLoopInc, TripCount
1943// %Negated = Not %ALM
1944// branch-on-cond %Negated
1945//
1946static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
1947 VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
1948 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
1949 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
1950 auto *CanonicalIVPHI = Plan.getCanonicalIV();
1951 VPValue *StartV = CanonicalIVPHI->getStartValue();
1952
1953 auto *CanonicalIVIncrement =
1954 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
1955 // TODO: Check if dropping the flags is needed if
1956 // !DataAndControlFlowWithoutRuntimeCheck.
1957 CanonicalIVIncrement->dropPoisonGeneratingFlags();
1958 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
1959 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
1960 // we have to take unrolling into account. Each part needs to start at
  // Part * VF.
1962 auto *VecPreheader = Plan.getVectorPreheader();
1963 VPBuilder Builder(VecPreheader);
1964
1965 // Create the ActiveLaneMask instruction using the correct start values.
1966 VPValue *TC = Plan.getTripCount();
1967
1968 VPValue *TripCount, *IncrementValue;
1969 if (!DataAndControlFlowWithoutRuntimeCheck) {
1970 // When the loop is guarded by a runtime overflow check for the loop
1971 // induction variable increment by VF, we can increment the value before
1972 // the get.active.lane mask and use the unmodified tripcount.
1973 IncrementValue = CanonicalIVIncrement;
1974 TripCount = TC;
1975 } else {
1976 // When avoiding a runtime check, the active.lane.mask inside the loop
1977 // uses a modified trip count and the induction variable increment is
1978 // done after the active.lane.mask intrinsic is called.
1979 IncrementValue = CanonicalIVPHI;
1980 TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF,
1981 Operands: {TC}, DL);
1982 }
1983 auto *EntryIncrement = Builder.createOverflowingOp(
1984 Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL,
1985 Name: "index.part.next");
1986
1987 // Create the active lane mask instruction in the VPlan preheader.
1988 auto *EntryALM =
1989 Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC},
1990 DL, Name: "active.lane.mask.entry");
1991
1992 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
1993 // preheader ActiveLaneMask instruction.
1994 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
1995 LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI);
1996
1997 // Create the active lane mask for the next iteration of the loop before the
1998 // original terminator.
1999 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2000 Builder.setInsertPoint(OriginalTerminator);
2001 auto *InLoopIncrement =
2002 Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart,
2003 Operands: {IncrementValue}, WrapFlags: {false, false}, DL);
2004 auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2005 Operands: {InLoopIncrement, TripCount}, DL,
2006 Name: "active.lane.mask.next");
2007 LaneMaskPhi->addOperand(Operand: ALM);
2008
2009 // Replace the original terminator with BranchOnCond. We have to invert the
2010 // mask here because a true condition means jumping to the exit block.
2011 auto *NotMask = Builder.createNot(Operand: ALM, DL);
2012 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL);
2013 OriginalTerminator->eraseFromParent();
2014 return LaneMaskPhi;
2015}
2016
2017/// Collect all VPValues representing a header mask through the (ICMP_ULE,
2018/// WideCanonicalIV, backedge-taken-count) pattern.
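///
/// For example (illustrative sketch): a header mask typically looks like
///   %mask = icmp ule %wide.canonical.iv, broadcast(%backedge.taken.count)
/// where %wide.canonical.iv is either a VPWidenCanonicalIVRecipe or a widened
/// canonical VPWidenIntOrFpInductionRecipe.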
2019/// TODO: Introduce explicit recipe for header-mask instead of searching
2020/// for the header-mask pattern manually.
2021static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
2022 SmallVector<VPValue *> WideCanonicalIVs;
2023 auto *FoundWidenCanonicalIVUser =
2024 find_if(Range: Plan.getCanonicalIV()->users(),
2025 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2026 assert(count_if(Plan.getCanonicalIV()->users(),
2027 [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <=
2028 1 &&
         "Must have at most one VPWidenCanonicalIVRecipe");
2030 if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) {
2031 auto *WideCanonicalIV =
2032 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2033 WideCanonicalIVs.push_back(Elt: WideCanonicalIV);
2034 }
2035
2036 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2037 // version of the canonical induction.
2038 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2039 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2040 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
2041 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2042 WideCanonicalIVs.push_back(Elt: WidenOriginalIV);
2043 }
2044
  // Walk users of wide canonical IVs and collect all compares of the form
  // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2047 SmallVector<VPValue *> HeaderMasks;
2048 for (auto *Wide : WideCanonicalIVs) {
2049 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2050 auto *HeaderMask = dyn_cast<VPInstruction>(Val: U);
2051 if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan))
2052 continue;
2053
2054 assert(HeaderMask->getOperand(0) == Wide &&
2055 "WidenCanonicalIV must be the first operand of the compare");
2056 HeaderMasks.push_back(Elt: HeaderMask);
2057 }
2058 }
2059 return HeaderMasks;
2060}
2061
2062void VPlanTransforms::addActiveLaneMask(
2063 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2064 bool DataAndControlFlowWithoutRuntimeCheck) {
2065 assert((!DataAndControlFlowWithoutRuntimeCheck ||
2066 UseActiveLaneMaskForControlFlow) &&
2067 "DataAndControlFlowWithoutRuntimeCheck implies "
2068 "UseActiveLaneMaskForControlFlow");
2069
2070 auto *FoundWidenCanonicalIVUser =
2071 find_if(Range: Plan.getCanonicalIV()->users(),
2072 P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); });
2073 assert(FoundWidenCanonicalIVUser &&
2074 "Must have widened canonical IV when tail folding!");
2075 auto *WideCanonicalIV =
2076 cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser);
2077 VPSingleDefRecipe *LaneMask;
2078 if (UseActiveLaneMaskForControlFlow) {
2079 LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
2080 Plan, DataAndControlFlowWithoutRuntimeCheck);
2081 } else {
2082 VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV);
2083 LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask,
2084 Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr,
2085 Name: "active.lane.mask");
2086 }
2087
2088 // Walk users of WideCanonicalIV and replace all compares of the form
2089 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an
2090 // active-lane-mask.
2091 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
2092 HeaderMask->replaceAllUsesWith(New: LaneMask);
2093}
2094
2095/// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
2096/// nullptr if no EVL-based recipe could be created.
2097/// \p HeaderMask Header Mask.
/// \p CurRecipe Recipe to be transformed.
2099/// \p TypeInfo VPlan-based type analysis.
2100/// \p AllOneMask The vector mask parameter of vector-predication intrinsics.
2101/// \p EVL The explicit vector length parameter of vector-predication
2102/// intrinsics.
2103/// \p PrevEVL The explicit vector length of the previous iteration. Only
2104/// required if \p CurRecipe is a VPInstruction::FirstOrderRecurrenceSplice.
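///
/// For example (illustrative sketch): a VPWidenLoadRecipe whose mask is
///   logical-and(%header.mask, %m)
/// becomes a VPWidenLoadEVLRecipe with mask %m and the EVL operand; if the
/// original mask is the header mask itself, the EVL recipe carries no mask.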
2105static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask,
2106 VPRecipeBase &CurRecipe,
2107 VPTypeAnalysis &TypeInfo,
2108 VPValue &AllOneMask, VPValue &EVL,
2109 VPValue *PrevEVL) {
2110 using namespace llvm::VPlanPatternMatch;
2111 auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
2112 assert(OrigMask && "Unmasked recipe when folding tail");
2113 // HeaderMask will be handled using EVL.
2114 VPValue *Mask;
2115 if (match(V: OrigMask, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask))))
2116 return Mask;
2117 return HeaderMask == OrigMask ? nullptr : OrigMask;
2118 };
2119
2120 return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
2121 .Case<VPWidenLoadRecipe>(caseFn: [&](VPWidenLoadRecipe *L) {
2122 VPValue *NewMask = GetNewMask(L->getMask());
2123 return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
2124 })
2125 .Case<VPWidenStoreRecipe>(caseFn: [&](VPWidenStoreRecipe *S) {
2126 VPValue *NewMask = GetNewMask(S->getMask());
2127 return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
2128 })
2129 .Case<VPReductionRecipe>(caseFn: [&](VPReductionRecipe *Red) {
2130 VPValue *NewMask = GetNewMask(Red->getCondOp());
2131 return new VPReductionEVLRecipe(*Red, EVL, NewMask);
2132 })
2133 .Case<VPWidenSelectRecipe>(caseFn: [&](VPWidenSelectRecipe *Sel) {
2134 SmallVector<VPValue *> Ops(Sel->operands());
2135 Ops.push_back(Elt: &EVL);
2136 return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops,
2137 TypeInfo.inferScalarType(Sel),
2138 Sel->getDebugLoc());
2139 })
2140 .Case<VPInstruction>(caseFn: [&](VPInstruction *VPI) -> VPRecipeBase * {
2141 if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) {
2142 assert(PrevEVL && "Fixed-order recurrences require previous EVL");
2143 VPValue *MinusOneVPV = VPI->getParent()->getPlan()->getOrAddLiveIn(
2144 V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: TypeInfo.getContext()),
2145 V: -1));
2146 SmallVector<VPValue *> Ops(VPI->operands());
2147 Ops.append(IL: {MinusOneVPV, &AllOneMask, PrevEVL, &EVL});
2148 return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splice,
2149 Ops, TypeInfo.inferScalarType(V: VPI),
2150 VPI->getDebugLoc());
2151 }
2152
2153 VPValue *LHS, *RHS;
2154 // Transform select with a header mask condition
2155 // select(header_mask, LHS, RHS)
2156 // into vector predication merge.
2157 // vp.merge(all-true, LHS, RHS, EVL)
2158 if (!match(V: VPI, P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS),
2159 Op2: m_VPValue(V&: RHS))))
2160 return nullptr;
2161 // Use all true as the condition because this transformation is
2162 // limited to selects whose condition is a header mask.
2163 return new VPWidenIntrinsicRecipe(
2164 Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL},
2165 TypeInfo.inferScalarType(V: LHS), VPI->getDebugLoc());
2166 })
2167 .Default(defaultFn: [&](VPRecipeBase *R) { return nullptr; });
2168}
2169
2170/// Replace recipes with their EVL variants.
2171static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
2172 Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
2173 VPTypeAnalysis TypeInfo(CanonicalIVType);
2174 LLVMContext &Ctx = CanonicalIVType->getContext();
2175 VPValue *AllOneMask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx));
2176 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2177 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2178
  // Create a scalar phi to track the previous EVL if the loop contains any
  // fixed-order recurrences.
2181 VPInstruction *PrevEVL = nullptr;
2182 bool ContainsFORs =
2183 any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>);
2184 if (ContainsFORs) {
2185 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
2186 VPValue *MaxEVL = &Plan.getVF();
    // Emit a VPScalarCastRecipe in the preheader if VF is not a 32-bit
    // integer.
2188 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
2189 if (unsigned VFSize =
2190 TypeInfo.inferScalarType(V: MaxEVL)->getScalarSizeInBits();
2191 VFSize != 32) {
2192 MaxEVL = Builder.createScalarCast(
2193 Opcode: VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, Op: MaxEVL,
2194 ResultTy: Type::getInt32Ty(C&: Ctx), DL: DebugLoc());
2195 }
2196 Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi());
2197 PrevEVL = Builder.createScalarPhi(IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc(), Name: "prev.evl");
2198 }
2199
2200 for (VPUser *U : to_vector(Range: Plan.getVF().users())) {
2201 if (auto *R = dyn_cast<VPVectorEndPointerRecipe>(Val: U))
2202 R->setOperand(I: 1, New: &EVL);
2203 }
2204
2205 SmallVector<VPRecipeBase *> ToErase;
2206
2207 for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
2208 for (VPUser *U : collectUsersRecursively(V: HeaderMask)) {
2209 auto *CurRecipe = cast<VPRecipeBase>(Val: U);
2210 VPRecipeBase *EVLRecipe = createEVLRecipe(
2211 HeaderMask, CurRecipe&: *CurRecipe, TypeInfo, AllOneMask&: *AllOneMask, EVL, PrevEVL);
2212 if (!EVLRecipe)
2213 continue;
2214
2215 [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
2216 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
2217 "New recipe must define the same number of values as the "
2218 "original.");
2219 assert(
2220 NumDefVal <= 1 &&
2221 "Only supports recipes with a single definition or without users.");
2222 EVLRecipe->insertBefore(InsertPos: CurRecipe);
2223 if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: EVLRecipe)) {
2224 VPValue *CurVPV = CurRecipe->getVPSingleValue();
2225 CurVPV->replaceAllUsesWith(New: EVLRecipe->getVPSingleValue());
2226 }
2227 // Defer erasing recipes till the end so that we don't invalidate the
2228 // VPTypeAnalysis cache.
2229 ToErase.push_back(Elt: CurRecipe);
2230 }
2231 }
2232
2233 for (VPRecipeBase *R : reverse(C&: ToErase)) {
2234 SmallVector<VPValue *> PossiblyDead(R->operands());
2235 R->eraseFromParent();
2236 for (VPValue *Op : PossiblyDead)
2237 recursivelyDeleteDeadRecipes(V: Op);
2238 }
2239}
2240
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and replace all
/// uses of VPCanonicalIVPHIRecipe, except for the canonical IV increment, with
/// the VPEVLBasedIVPHIRecipe. After this transformation the
/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
2245///
2246/// The function uses the following definitions:
2247/// %StartV is the canonical induction start value.
2248///
2249/// The function adds the following recipes:
2250///
2251/// vector.ph:
2252/// ...
2253///
2254/// vector.body:
2255/// ...
2256/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2257/// [ %NextEVLIV, %vector.body ]
2258/// %AVL = sub original TC, %EVLPhi
2259/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
2260/// ...
/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2262/// ...
2263///
2264/// If MaxSafeElements is provided, the function adds the following recipes:
2265/// vector.ph:
2266/// ...
2267///
2268/// vector.body:
2269/// ...
2270/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
2271/// [ %NextEVLIV, %vector.body ]
2272/// %AVL = sub original TC, %EVLPhi
2273/// %cmp = cmp ult %AVL, MaxSafeElements
2274/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
2275/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
2276/// ...
2277/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
2278/// ...
2279///
2280bool VPlanTransforms::tryAddExplicitVectorLength(
2281 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
2282 VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2283 // The transform updates all users of inductions to work based on EVL, instead
2284 // of the VF directly. At the moment, widened inductions cannot be updated, so
2285 // bail out if the plan contains any.
2286 bool ContainsWidenInductions = any_of(
2287 Range: Header->phis(),
2288 P: IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>);
2289 if (ContainsWidenInductions)
2290 return false;
2291
2292 auto *CanonicalIVPHI = Plan.getCanonicalIV();
2293 VPValue *StartV = CanonicalIVPHI->getStartValue();
2294
2295 // Create the ExplicitVectorLengthPhi recipe in the main loop.
2296 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
2297 EVLPhi->insertAfter(InsertPos: CanonicalIVPHI);
2298 VPBuilder Builder(Header, Header->getFirstNonPhi());
2299 // Compute original TC - IV as the AVL (application vector length).
2300 VPValue *AVL = Builder.createNaryOp(
2301 Opcode: Instruction::Sub, Operands: {Plan.getTripCount(), EVLPhi}, DL: DebugLoc(), Name: "avl");
2302 if (MaxSafeElements) {
    // Clamp the AVL to MaxSafeElements to honor the maximum safe dependence
    // distance.
2304 VPValue *AVLSafe = Plan.getOrAddLiveIn(
2305 V: ConstantInt::get(Ty: CanonicalIVPHI->getScalarType(), V: *MaxSafeElements));
2306 VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe);
2307 AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc(), Name: "safe_avl");
2308 }
2309 auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL,
2310 DL: DebugLoc());
2311
2312 auto *CanonicalIVIncrement =
2313 cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue());
2314 Builder.setInsertPoint(CanonicalIVIncrement);
2315 VPSingleDefRecipe *OpVPEVL = VPEVL;
2316 if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
2317 IVSize != 32) {
2318 OpVPEVL = Builder.createScalarCast(
2319 Opcode: IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, Op: OpVPEVL,
2320 ResultTy: CanonicalIVPHI->getScalarType(), DL: CanonicalIVIncrement->getDebugLoc());
2321 }
2322 auto *NextEVLIV = Builder.createOverflowingOp(
2323 Opcode: Instruction::Add, Operands: {OpVPEVL, EVLPhi},
2324 WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(),
2325 CanonicalIVIncrement->hasNoSignedWrap()},
2326 DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next");
2327 EVLPhi->addOperand(Operand: NextEVLIV);
2328
2329 transformRecipestoEVLRecipes(Plan, EVL&: *VPEVL);
2330
2331 // Replace all uses of VPCanonicalIVPHIRecipe by
2332 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
2333 CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi);
2334 CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI);
2335 // TODO: support unroll factor > 1.
2336 Plan.setUF(1);
2337 return true;
2338}
2339
2340void VPlanTransforms::dropPoisonGeneratingRecipes(
2341 VPlan &Plan,
2342 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
2343 // Collect recipes in the backward slice of `Root` that may generate a poison
2344 // value that is used after vectorization.
2345 SmallPtrSet<VPRecipeBase *, 16> Visited;
2346 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
2347 SmallVector<VPRecipeBase *, 16> Worklist;
2348 Worklist.push_back(Elt: Root);
2349
2350 // Traverse the backward slice of Root through its use-def chain.
2351 while (!Worklist.empty()) {
2352 VPRecipeBase *CurRec = Worklist.pop_back_val();
2353
2354 if (!Visited.insert(Ptr: CurRec).second)
2355 continue;
2356
2357 // Prune search if we find another recipe generating a widen memory
2358 // instruction. Widen memory instructions involved in address computation
2359 // will lead to gather/scatter instructions, which don't need to be
2360 // handled.
2361 if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe,
2362 VPHeaderPHIRecipe>(Val: CurRec))
2363 continue;
2364
2365 // This recipe contributes to the address computation of a widen
2366 // load/store. If the underlying instruction has poison-generating flags,
2367 // drop them directly.
2368 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) {
2369 VPValue *A, *B;
2370 using namespace llvm::VPlanPatternMatch;
2371 // Dropping disjoint from an OR may yield incorrect results, as some
2372 // analysis may have converted it to an Add implicitly (e.g. SCEV used
2373 // for dependence analysis). Instead, replace it with an equivalent Add.
2374 // This is possible as all users of the disjoint OR only access lanes
2375 // where the operands are disjoint or poison otherwise.
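        // For example (illustrative sketch): (or disjoint %a, %b) feeding an
        // address computation is rewritten to (add %a, %b) without wrap
        // flags, rather than merely dropping the disjoint flag.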
2376 if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) &&
2377 RecWithFlags->isDisjoint()) {
2378 VPBuilder Builder(RecWithFlags);
2379 VPInstruction *New = Builder.createOverflowingOp(
2380 Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false},
2381 DL: RecWithFlags->getDebugLoc());
2382 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
2383 RecWithFlags->replaceAllUsesWith(New);
2384 RecWithFlags->eraseFromParent();
2385 CurRec = New;
2386 } else
2387 RecWithFlags->dropPoisonGeneratingFlags();
2388 } else {
2389 Instruction *Instr = dyn_cast_or_null<Instruction>(
2390 Val: CurRec->getVPSingleValue()->getUnderlyingValue());
2391 (void)Instr;
2392 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
2393 "found instruction with poison generating flags not covered by "
2394 "VPRecipeWithIRFlags");
2395 }
2396
2397 // Add new definitions to the worklist.
2398 for (VPValue *Operand : CurRec->operands())
2399 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
2400 Worklist.push_back(Elt: OpDef);
2401 }
2402 });
2403
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryRecipe or VPInterleaveRecipe.
2407 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
2408 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2409 for (VPRecipeBase &Recipe : *VPBB) {
2410 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) {
2411 Instruction &UnderlyingInstr = WidenRec->getIngredient();
2412 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
2413 if (AddrDef && WidenRec->isConsecutive() &&
2414 BlockNeedsPredication(UnderlyingInstr.getParent()))
2415 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2416 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) {
2417 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
2418 if (AddrDef) {
2419 // Check if any member of the interleave group needs predication.
2420 const InterleaveGroup<Instruction> *InterGroup =
2421 InterleaveRec->getInterleaveGroup();
2422 bool NeedPredication = false;
2423 for (int I = 0, NumMembers = InterGroup->getNumMembers();
2424 I < NumMembers; ++I) {
2425 Instruction *Member = InterGroup->getMember(Index: I);
2426 if (Member)
2427 NeedPredication |= BlockNeedsPredication(Member->getParent());
2428 }
2429
2430 if (NeedPredication)
2431 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
2432 }
2433 }
2434 }
2435 }
2436}
2437
2438void VPlanTransforms::createInterleaveGroups(
2439 VPlan &Plan,
2440 const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
2441 &InterleaveGroups,
2442 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
2443 if (InterleaveGroups.empty())
2444 return;
2445
2446 // Interleave memory: for each Interleave Group we marked earlier as relevant
2447 // for this VPlan, replace the Recipes widening its memory instructions with a
2448 // single VPInterleaveRecipe at its insertion point.
2449 VPDominatorTree VPDT;
2450 VPDT.recalculate(Func&: Plan);
2451 for (const auto *IG : InterleaveGroups) {
2452 SmallVector<VPValue *, 4> StoredValues;
2453 for (unsigned i = 0; i < IG->getFactor(); ++i)
2454 if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) {
2455 auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI));
2456 StoredValues.push_back(Elt: StoreR->getStoredValue());
2457 }
2458
2459 bool NeedsMaskForGaps =
2460 IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
2461
2462 Instruction *IRInsertPos = IG->getInsertPos();
2463 auto *InsertPos =
2464 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos));
2465
2466 // Get or create the start address for the interleave group.
2467 auto *Start =
2468 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0)));
2469 VPValue *Addr = Start->getAddr();
2470 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
2471 if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) {
2472 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
2473 // InsertPos or sink loads above zero members to join it.
2474 bool InBounds = false;
2475 if (auto *Gep = dyn_cast<GetElementPtrInst>(
2476 Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts()))
2477 InBounds = Gep->isInBounds();
2478
2479 // We cannot re-use the address of member zero because it does not
2480 // dominate the insert position. Instead, use the address of the insert
2481 // position and create a PtrAdd adjusting it to the address of member
2482 // zero.
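      // For example (illustrative sketch): if the insert position is member 2
      // of a group of i32 accesses, the offset is 4 * 2 = 8 bytes and the
      // group address becomes ptradd(address of insert position, -8), i.e. the
      // address of member zero.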
2483 assert(IG->getIndex(IRInsertPos) != 0 &&
2484 "index of insert position shouldn't be zero");
2485 auto &DL = IRInsertPos->getDataLayout();
2486 APInt Offset(32,
2487 DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) *
2488 IG->getIndex(Instr: IRInsertPos),
2489 /*IsSigned=*/true);
2490 VPValue *OffsetVPV = Plan.getOrAddLiveIn(
2491 V: ConstantInt::get(Context&: IRInsertPos->getParent()->getContext(), V: -Offset));
2492 VPBuilder B(InsertPos);
2493 Addr = InBounds ? B.createInBoundsPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV)
2494 : B.createPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV);
2495 }
2496 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
2497 InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
2498 VPIG->insertBefore(InsertPos);
2499
2500 unsigned J = 0;
2501 for (unsigned i = 0; i < IG->getFactor(); ++i)
2502 if (Instruction *Member = IG->getMember(Index: i)) {
2503 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
2504 if (!Member->getType()->isVoidTy()) {
2505 VPValue *OriginalV = MemberR->getVPSingleValue();
2506 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
2507 J++;
2508 }
2509 MemberR->eraseFromParent();
2510 }
2511 }
2512}
2513
2514void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
  // Replace loop regions with an explicit CFG.
2516 SmallVector<VPRegionBlock *> LoopRegions;
2517 for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>(
2518 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2519 if (!R->isReplicator())
2520 LoopRegions.push_back(Elt: R);
2521 }
2522 for (VPRegionBlock *R : LoopRegions)
2523 R->dissolveToCFGLoop();
2524}
2525
2526// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe.
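// For example (illustrative sketch): an extended reduction of the form
//   reduce.add(%chain, zext(%x))
// is expanded into a separate widen-cast followed by a plain reduction:
//   %ext = zext %x
//   reduce.add(%chain, %ext)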
2527static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) {
2528 VPWidenCastRecipe *Ext;
2529 // Only ZExt contains non-neg flags.
2530 if (ExtRed->isZExt())
2531 Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
2532 ExtRed->getResultType(), *ExtRed,
2533 ExtRed->getDebugLoc());
2534 else
2535 Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(),
2536 ExtRed->getResultType(), {},
2537 ExtRed->getDebugLoc());
2538
2539 auto *Red = new VPReductionRecipe(
2540 ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext,
2541 ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc());
2542 Ext->insertBefore(InsertPos: ExtRed);
2543 Red->insertBefore(InsertPos: ExtRed);
2544 ExtRed->replaceAllUsesWith(New: Red);
2545 ExtRed->eraseFromParent();
2546}
2547
// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) +
// VPReductionRecipe (reduce.add) + VPWidenCastRecipe (optional).
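// For example (illustrative sketch): a multiply-accumulate reduction of the
// form
//   reduce.add(%chain, mul(zext(%a), zext(%b)))
// is expanded into
//   %ext.a = zext %a
//   %ext.b = zext %b
//   %mul   = mul %ext.a, %ext.b
//   reduce.add(%chain, %mul)
// re-using a single extend when %a and %b are the same value.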
2551static void
2552expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) {
2553 // Generate inner VPWidenCastRecipes if necessary.
2554 // Note that we will drop the extend after mul which transforms
2555 // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)).
2556 VPValue *Op0, *Op1;
2557 if (MulAcc->isExtended()) {
2558 Type *RedTy = MulAcc->getResultType();
2559 if (MulAcc->isZExt())
2560 Op0 = new VPWidenCastRecipe(
2561 MulAcc->getExtOpcode(), MulAcc->getVecOp0(), RedTy,
2562 VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), MulAcc->getDebugLoc());
2563 else
2564 Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
2565 RedTy, {}, MulAcc->getDebugLoc());
2566 Op0->getDefiningRecipe()->insertBefore(InsertPos: MulAcc);
    // Prevent reduce.add(mul(ext(A), ext(A))) from generating a duplicate
    // VPWidenCastRecipe.
2569 if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) {
2570 Op1 = Op0;
2571 } else {
2572 if (MulAcc->isZExt())
2573 Op1 = new VPWidenCastRecipe(
2574 MulAcc->getExtOpcode(), MulAcc->getVecOp1(), RedTy,
2575 VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()),
2576 MulAcc->getDebugLoc());
2577 else
2578 Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
2579 RedTy, {}, MulAcc->getDebugLoc());
2580 Op1->getDefiningRecipe()->insertBefore(InsertPos: MulAcc);
2581 }
2582 } else {
2583 // No extends in this MulAccRecipe.
2584 Op0 = MulAcc->getVecOp0();
2585 Op1 = MulAcc->getVecOp1();
2586 }
2587
2588 std::array<VPValue *, 2> MulOps = {Op0, Op1};
2589 auto *Mul = new VPWidenRecipe(
2590 Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(),
2591 MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc());
2592 Mul->insertBefore(InsertPos: MulAcc);
2593
2594 auto *Red = new VPReductionRecipe(
2595 MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul,
2596 MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc());
2597 Red->insertBefore(InsertPos: MulAcc);
2598
2599 MulAcc->replaceAllUsesWith(New: Red);
2600 MulAcc->eraseFromParent();
2601}
2602
2603void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
2604 Type &CanonicalIVTy) {
2605 using namespace llvm::VPlanPatternMatch;
2606
2607 VPTypeAnalysis TypeInfo(&CanonicalIVTy);
2608 SmallVector<VPRecipeBase *> ToRemove;
2609 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2610 Range: vp_depth_first_deep(G: Plan.getEntry()))) {
2611 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2612 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) {
2613 auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2614 IncomingValues: {PhiR->getStartValue(), PhiR->getBackedgeValue()},
2615 DL: PhiR->getDebugLoc(), Name: "evl.based.iv");
2616 PhiR->replaceAllUsesWith(New: ScalarR);
2617 ToRemove.push_back(Elt: PhiR);
2618 continue;
2619 }
2620
2621 VPValue *VectorStep;
2622 VPValue *ScalarStep;
2623 if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>(
2624 Op0: m_VPValue(V&: VectorStep), Op1: m_VPValue(V&: ScalarStep))))
2625 continue;
2626
2627 // Expand WideIVStep.
2628 auto *VPI = cast<VPInstruction>(Val: &R);
2629 VPBuilder Builder(VPI);
2630 Type *IVTy = TypeInfo.inferScalarType(V: VPI);
2631 if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) {
2632 Instruction::CastOps CastOp = IVTy->isFloatingPointTy()
2633 ? Instruction::UIToFP
2634 : Instruction::Trunc;
2635 VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy);
2636 }
2637
2638 [[maybe_unused]] auto *ConstStep =
2639 ScalarStep->isLiveIn()
2640 ? dyn_cast<ConstantInt>(Val: ScalarStep->getLiveInIRValue())
2641 : nullptr;
2642 assert(!ConstStep || ConstStep->getValue() != 1);
2643 (void)ConstStep;
2644 if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) {
2645 ScalarStep =
2646 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy);
2647 }
2648
2649 VPIRFlags Flags;
2650 if (IVTy->isFloatingPointTy())
2651 Flags = {VPI->getFastMathFlags()};
2652
2653 unsigned MulOpc =
2654 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
2655 VPInstruction *Mul = Builder.createNaryOp(
2656 Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc());
2657 VectorStep = Mul;
2658 VPI->replaceAllUsesWith(New: VectorStep);
2659 ToRemove.push_back(Elt: VPI);
2660 }
2661 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
2662 if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(Val: &R)) {
2663 expandVPExtendedReduction(ExtRed);
2664 continue;
2665 }
2666 if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(Val: &R))
2667 expandVPMulAccumulateReduction(MulAcc);
2668 }
2669 }
2670
2671 for (VPRecipeBase *R : ToRemove)
2672 R->eraseFromParent();
2673}
2674
2675void VPlanTransforms::handleUncountableEarlyExit(
2676 VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
2677 VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
2678 using namespace llvm::VPlanPatternMatch;
2679
2680 VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
2681 if (!EarlyExitVPBB->getSinglePredecessor() &&
2682 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
2683 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
2684 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
2685 "unsupported early exit VPBB");
2686 // Early exit operand should always be last phi operand. If EarlyExitVPBB
2687 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
2688 // of the phis.
2689 for (VPRecipeBase &R : EarlyExitVPBB->phis())
2690 cast<VPIRPhi>(Val: &R)->swapOperands();
2691 }
2692
2693 VPBuilder Builder(LatchVPBB->getTerminator());
2694 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
2695 assert(
2696 match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
2697       "Terminator must be BranchOnCond");
2698 VPValue *CondOfEarlyExitingVPBB =
2699 EarlyExitingVPBB->getTerminator()->getOperand(N: 0);
2700 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
2701 ? CondOfEarlyExitingVPBB
2702 : Builder.createNot(Operand: CondOfEarlyExitingVPBB);
2703
2704 // Split the middle block and have it conditionally branch to the early exit
2705 // block if CondToEarlyExit.
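  // Resulting CFG sketch (block names as created below, shape approximate):
  //
  //   vector latch -> middle.split
  //   middle.split --(any-of(CondToEarlyExit))--> vector.early.exit
  //   middle.split --(otherwise)----------------> original middle block
  //   vector.early.exit -> early exit block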
2706 VPValue *IsEarlyExitTaken =
2707 Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit});
2708 VPBasicBlock *NewMiddle = Plan.createVPBasicBlock(Name: "middle.split");
2709 VPBasicBlock *VectorEarlyExitVPBB =
2710 Plan.createVPBasicBlock(Name: "vector.early.exit");
2711 VPBlockUtils::insertOnEdge(From: LatchVPBB, To: MiddleVPBB, BlockPtr: NewMiddle);
2712 VPBlockUtils::connectBlocks(From: NewMiddle, To: VectorEarlyExitVPBB);
2713 NewMiddle->swapSuccessors();
2714
2715 VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB);
2716
2717 // Update the exit phis in the early exit block.
2718 VPBuilder MiddleBuilder(NewMiddle);
2719 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
2720 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
2721 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
2722 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
2723 // a single predecessor and 1 if it has two.
2724 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
2725 if (ExitIRI->getNumOperands() != 1) {
2726 // The first of two operands corresponds to the latch exit, via MiddleVPBB
2727 // predecessor. Extract its last lane.
2728 ExitIRI->extractLastLaneOfFirstOperand(Builder&: MiddleBuilder);
2729 }
2730
2731 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx);
2732 auto IsVector = [](ElementCount VF) { return VF.isVector(); };
2733     // When the VFs are vectors, an `extract` is needed to get the incoming value
2734     // from the early exit. When the range contains a scalar VF, clamp the range
2735     // to the scalar VF to prevent mis-compilation of a range containing both
2736     // scalar and vector VFs.
2737 if (!IncomingFromEarlyExit->isLiveIn() &&
2738 LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsVector, Range)) {
2739 // Update the incoming value from the early exit.
2740 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
2741 Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit}, Inst: nullptr,
2742 Name: "first.active.lane");
2743 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
2744 Opcode: Instruction::ExtractElement, Operands: {IncomingFromEarlyExit, FirstActiveLane},
2745 Inst: nullptr, Name: "early.exit.value");
2746 ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit);
2747 }
2748 }
2749 MiddleBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {IsEarlyExitTaken});
2750
2751 // Replace the condition controlling the non-early exit from the vector loop
2752 // with one exiting if either the original condition of the vector latch is
2753 // true or the early exit has been taken.
2754 auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator());
2755 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
2756 "Unexpected terminator");
2757 auto *IsLatchExitTaken =
2758 Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0),
2759 B: LatchExitingBranch->getOperand(N: 1));
2760 auto *AnyExitTaken = Builder.createNaryOp(
2761 Opcode: Instruction::Or, Operands: {IsEarlyExitTaken, IsLatchExitTaken});
2762 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken);
2763 LatchExitingBranch->eraseFromParent();
2764}
2765
2766 /// This function tries to convert extended in-loop reductions to
2767 /// VPExtendedReductionRecipe and clamps the \p Range if doing so is beneficial
2768 /// and valid. The created recipe must be lowered to concrete
2769 /// recipes before execution.
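///
/// For illustration only (types chosen arbitrarily): a loop such as
///   int64_t sum = 0;
///   for (int i = 0; i < n; ++i) sum += (int64_t)a[i];   // a[i] is int32_t
/// yields reduce.add(sext(A)); when TTI reports the combined extended reduction
/// as cheaper than the separate extend plus reduction, the pair is replaced by
/// a single VPExtendedReductionRecipe here.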
2770static VPExtendedReductionRecipe *
2771tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx,
2772 VFRange &Range) {
2773 using namespace VPlanPatternMatch;
2774
2775 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2776 VPValue *VecOp = Red->getVecOp();
2777
2778 // Clamp the range if using extended-reduction is profitable.
2779 auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt,
2780 Type *SrcTy) -> bool {
2781 return LoopVectorizationPlanner::getDecisionAndClampRange(
2782 Predicate: [&](ElementCount VF) {
2783 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2784 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2785 InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost(
2786 Opcode, IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, FMF: Red->getFastMathFlags(),
2787 CostKind);
2788 InstructionCost ExtCost =
2789 cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx);
2790 InstructionCost RedCost = Red->computeCost(VF, Ctx);
2791 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
2792 },
2793 Range);
2794 };
2795
2796 VPValue *A;
2797   // Match reduce(ext(A)).
2798 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) &&
2799 IsExtendedRedValidAndClampRange(
2800 RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()),
2801 cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode() ==
2802 Instruction::CastOps::ZExt,
2803 Ctx.Types.inferScalarType(V: A)))
2804 return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(Val: VecOp));
2805
2806 return nullptr;
2807}
2808
2809 /// This function tries to convert in-loop reductions to
2810 /// VPMulAccumulateReductionRecipe and clamps the \p Range if doing so is
2811 /// beneficial and valid. The created VPMulAccumulateReductionRecipe must be
2812 /// lowered to concrete recipes before execution. MulAccumulateReduction patterns:
2813 ///   reduce.add(mul(...)),
2814 ///   reduce.add(mul(ext(A), ext(B))),
2815 ///   reduce.add(ext(mul(ext(A), ext(B)))).
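///
/// For illustration only (types chosen arbitrarily): a dot-product style loop
///   int32_t sum = 0;
///   for (int i = 0; i < n; ++i) sum += (int32_t)a[i] * (int32_t)b[i]; // i16 a, b
/// yields reduce.add(mul(sext(A), sext(B))); when TTI reports the fused
/// multiply-accumulate reduction as cheaper than the separate extends, multiply
/// and reduction, the chain is replaced by a single
/// VPMulAccumulateReductionRecipe here.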
2816static VPMulAccumulateReductionRecipe *
2817tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
2818 VPCostContext &Ctx, VFRange &Range) {
2819 using namespace VPlanPatternMatch;
2820
2821 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind());
2822 if (Opcode != Instruction::Add)
2823 return nullptr;
2824
2825 Type *RedTy = Ctx.Types.inferScalarType(V: Red);
2826
2827 // Clamp the range if using multiply-accumulate-reduction is profitable.
2828 auto IsMulAccValidAndClampRange =
2829 [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0,
2830 VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool {
2831 return LoopVectorizationPlanner::getDecisionAndClampRange(
2832 Predicate: [&](ElementCount VF) {
2833 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2834 Type *SrcTy =
2835 Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy;
2836 auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF));
2837 InstructionCost MulAccCost =
2838 Ctx.TTI.getMulAccReductionCost(IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, CostKind);
2839 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
2840 InstructionCost RedCost = Red->computeCost(VF, Ctx);
2841 InstructionCost ExtCost = 0;
2842 if (Ext0)
2843 ExtCost += Ext0->computeCost(VF, Ctx);
2844 if (Ext1)
2845 ExtCost += Ext1->computeCost(VF, Ctx);
2846 if (OuterExt)
2847 ExtCost += OuterExt->computeCost(VF, Ctx);
2848
2849 return MulAccCost.isValid() &&
2850 MulAccCost < ExtCost + MulCost + RedCost;
2851 },
2852 Range);
2853 };
2854
2855 VPValue *VecOp = Red->getVecOp();
2856 VPValue *A, *B;
2857 // Try to match reduce.add(mul(...)).
2858 if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) {
2859 auto *RecipeA =
2860 dyn_cast_if_present<VPWidenCastRecipe>(Val: A->getDefiningRecipe());
2861 auto *RecipeB =
2862 dyn_cast_if_present<VPWidenCastRecipe>(Val: B->getDefiningRecipe());
2863 auto *Mul = cast<VPWidenRecipe>(Val: VecOp->getDefiningRecipe());
2864
2865 // Match reduce.add(mul(ext, ext)).
2866 if (RecipeA && RecipeB &&
2867 (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) &&
2868 match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2869 match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) &&
2870 IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
2871 Instruction::CastOps::ZExt,
2872 Mul, RecipeA, RecipeB, nullptr))
2873 return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB,
2874 RecipeA->getResultType());
2875 // Match reduce.add(mul).
2876 if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
2877 return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy);
2878 }
2879 // Match reduce.add(ext(mul(ext(A), ext(B)))).
2880   // All extend recipes must have the same opcode, or A == B, in which case
2881   // the pattern can be transformed to reduce.add(zext(mul(sext(A), sext(B)))).
2882 if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()),
2883 Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) {
2884 auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp->getDefiningRecipe());
2885 auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0)->getDefiningRecipe());
2886 auto *Ext0 =
2887 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0)->getDefiningRecipe());
2888 auto *Ext1 =
2889 cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1)->getDefiningRecipe());
2890 if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
2891 Ext0->getOpcode() == Ext1->getOpcode() &&
2892 IsMulAccValidAndClampRange(Ext0->getOpcode() ==
2893 Instruction::CastOps::ZExt,
2894 Mul, Ext0, Ext1, Ext))
2895 return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1,
2896 Ext->getResultType());
2897 }
2898 return nullptr;
2899}
2900
2901 /// This function tries to create an abstract recipe from the reduction recipe
2902 /// for subsequent optimizations and cost estimation.
2903static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
2904 VPCostContext &Ctx,
2905 VFRange &Range) {
2906 VPReductionRecipe *AbstractR = nullptr;
2907
2908 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
2909 AbstractR = MulAcc;
2910 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
2911 AbstractR = ExtRed;
2912   // Cannot create an abstract in-loop reduction recipe.
2913 if (!AbstractR)
2914 return;
2915
2916 AbstractR->insertBefore(InsertPos: Red);
2917 Red->replaceAllUsesWith(New: AbstractR);
2918}
2919
2920void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
2921 VFRange &Range) {
2922 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2923 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) {
2924 for (VPRecipeBase &R : *VPBB) {
2925 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R))
2926 tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
2927 }
2928 }
2929}
2930
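// Illustrative sketch (printed form approximate): for a widened integer IV at a
// fixed VF of 4, materializeStepVectors adds
//   EMIT vp<%sv> = step-vector    ; i.e. <0, 1, 2, 3>
// to the vector preheader and appends vp<%sv> as an extra operand of the
// VPWidenIntOrFpInductionRecipe, so the step vector is modeled explicitly in
// the plan.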
2931void VPlanTransforms::materializeStepVectors(VPlan &Plan) {
2932 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
2933 auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi);
2934 if (!IVR)
2935 continue;
2936
2937 Type *Ty = IVR->getPHINode()->getType();
2938 if (TruncInst *Trunc = IVR->getTruncInst())
2939 Ty = Trunc->getType();
2940 if (Ty->isFloatingPointTy())
2941 Ty = IntegerType::get(C&: Ty->getContext(), NumBits: Ty->getScalarSizeInBits());
2942
2943 VPBuilder Builder(Plan.getVectorPreheader());
2944 VPInstruction *StepVector = Builder.createNaryOp(
2945 Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: Ty, Flags: {}, DL: IVR->getDebugLoc());
2946 assert(IVR->getNumOperands() == 3 &&
2947 "can only add step vector before unrolling");
2948 IVR->addOperand(Operand: StepVector);
2949 }
2950}
2951
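// Illustrative sketch (printed form approximate): a non-constant live-in
// ir<%x> with a widened user such as
//   WIDEN vp<%r> = add ir<%x>, ...
// gets an explicit
//   EMIT vp<%x.bc> = broadcast ir<%x>
// in the vector preheader, and every user that needs the vector form is
// rewired to vp<%x.bc>.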
2952void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
2953 if (Plan.hasScalarVFOnly())
2954 return;
2955
2956#ifndef NDEBUG
2957 VPDominatorTree VPDT;
2958 VPDT.recalculate(Func&: Plan);
2959#endif
2960
2961 SmallVector<VPValue *> VPValues;
2962 if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0)
2963 VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount());
2964 append_range(C&: VPValues, R: Plan.getLiveIns());
2965 for (VPRecipeBase &R : *Plan.getEntry())
2966 append_range(C&: VPValues, R: R.definedValues());
2967
2968 auto *VectorPreheader = Plan.getVectorPreheader();
2969 for (VPValue *VPV : VPValues) {
2970 if (all_of(Range: VPV->users(),
2971 P: [VPV](VPUser *U) { return U->usesScalars(Op: VPV); }) ||
2972 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
2973 isa<Constant>(Val: VPV->getLiveInIRValue())))
2974 continue;
2975
2976 // Add explicit broadcast at the insert point that dominates all users.
2977 VPBasicBlock *HoistBlock = VectorPreheader;
2978 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
2979 for (VPUser *User : VPV->users()) {
2980 if (User->usesScalars(Op: VPV))
2981 continue;
2982 if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader)
2983 HoistPoint = HoistBlock->begin();
2984 else
2985 assert(VPDT.dominates(VectorPreheader,
2986 cast<VPRecipeBase>(User)->getParent()) &&
2987 "All users must be in the vector preheader or dominated by it");
2988 }
2989
2990 VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint);
2991 auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV});
2992 VPV->replaceUsesWithIf(New: Broadcast,
2993 ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) {
2994 return Broadcast != &U && !U.usesScalars(Op: VPV);
2995 });
2996 }
2997}
2998
2999 /// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can be
3000 /// converted to a narrower recipe. \p V is used by a wide recipe \p WideMember
3001 /// that feeds a store interleave group at index \p Idx, and \p WideMember0 is the
3002 /// recipe feeding the same interleave group at index 0. A VPWidenLoadRecipe can
3003 /// be narrowed to an index-independent load if it feeds all wide ops at all
3004 /// indices (checked via the operands of the wide recipe at lane 0, \p
3005 /// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load if \p V
3006 /// is defined at index \p Idx of a load interleave group.
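///
/// For illustration only (shapes hypothetical): with a factor-2 store group fed
/// by WideMember0 = add(L0, X) and WideMember1 = add(L1, X), L0 and L1 can be
/// narrowed when they are members 0 and 1 of a full load interleave group, and
/// X can be narrowed when it is an unmasked wide load used at the same operand
/// position in both wide members.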
3007static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember,
3008 VPValue *V, unsigned Idx) {
3009 auto *DefR = V->getDefiningRecipe();
3010 if (!DefR)
3011 return false;
3012 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: DefR))
3013 return !W->getMask() &&
3014 all_of(Range: zip(t: WideMember0->operands(), u: WideMember->operands()),
3015 P: [V](const auto P) {
3016                    // V must appear at the same operand positions in both
3017                    // WideMember0 and WideMember.
3018 const auto &[WideMember0Op, WideMemberOp] = P;
3019 return (WideMember0Op == V) == (WideMemberOp == V);
3020 });
3021
3022 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR))
3023 return IR->getInterleaveGroup()->getFactor() ==
3024 IR->getInterleaveGroup()->getNumMembers() &&
3025 IR->getVPValue(I: Idx) == V;
3026 return false;
3027}
3028
3029/// Returns true if \p IR is a full interleave group with factor and number of
3030/// members both equal to \p VF. The interleave group must also access the full
3031/// vector width \p VectorRegWidth.
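///
/// For example, a group with i32 members at a fixed VF of 4 covers
/// 32 * 4 = 128 bits, so it only qualifies when \p VectorRegWidth is 128.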
3032static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
3033 unsigned VF, VPTypeAnalysis &TypeInfo,
3034 unsigned VectorRegWidth) {
3035 if (!InterleaveR)
3036 return false;
3037
3038 Type *GroupElementTy = nullptr;
3039 if (InterleaveR->getStoredValues().empty()) {
3040 GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0));
3041 if (!all_of(Range: InterleaveR->definedValues(),
3042 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3043 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3044 }))
3045 return false;
3046 } else {
3047 GroupElementTy =
3048 TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]);
3049 if (!all_of(Range: InterleaveR->getStoredValues(),
3050 P: [&TypeInfo, GroupElementTy](VPValue *Op) {
3051 return TypeInfo.inferScalarType(V: Op) == GroupElementTy;
3052 }))
3053 return false;
3054 }
3055
3056 unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF;
3057 auto IG = InterleaveR->getInterleaveGroup();
3058 return IG->getFactor() == VF && IG->getNumMembers() == VF &&
3059 GroupSize == VectorRegWidth;
3060}
3061
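// Illustrative sketch of the narrowing performed below (recipe spellings
// approximate), for a fixed VF of N with full load and store interleave groups:
//
//   vp<%a> = load interleave-group { A0, ..., A(N-1) }
//   store interleave-group { f(A0), ..., f(A(N-1)) }
//
// is rewritten so the plan processes one original iteration per vector
// iteration:
//
//   vp<%a> = widen load A
//   widen store f(vp<%a>)
//
// with the canonical IV stepped by UF instead of VF * UF and VF replaced by 1.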
3062void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
3063 unsigned VectorRegWidth) {
3064 using namespace llvm::VPlanPatternMatch;
3065 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
3066 if (VF.isScalable() || !VectorLoop)
3067 return;
3068
3069 VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
3070 Type *CanonicalIVType = CanonicalIV->getScalarType();
3071 VPTypeAnalysis TypeInfo(CanonicalIVType);
3072
3073 unsigned FixedVF = VF.getFixedValue();
3074 SmallVector<VPInterleaveRecipe *> StoreGroups;
3075 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
3076 if (isa<VPCanonicalIVPHIRecipe>(Val: &R) ||
3077 match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())))
3078 continue;
3079
3080 // Bail out on recipes not supported at the moment:
3081 // * phi recipes other than the canonical induction
3082 // * recipes writing to memory except interleave groups
3083 // Only support plans with a canonical induction phi.
3084 if (R.isPhi())
3085 return;
3086
3087 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R);
3088 if (R.mayWriteToMemory() && !InterleaveR)
3089 return;
3090
3091 // Do not narrow interleave groups if there are VectorPointer recipes and
3092 // the plan was unrolled. The recipe implicitly uses VF from
3093 // VPTransformState.
3094 // TODO: Remove restriction once the VF for the VectorPointer offset is
3095 // modeled explicitly as operand.
3096 if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1)
3097 return;
3098
3099 // All other ops are allowed, but we reject uses that cannot be converted
3100 // when checking all allowed consumers (store interleave groups) below.
3101 if (!InterleaveR)
3102 continue;
3103
3104 // Bail out on non-consecutive interleave groups.
3105 if (!isConsecutiveInterleaveGroup(InterleaveR, VF: FixedVF, TypeInfo,
3106 VectorRegWidth))
3107 return;
3108
3109 // Skip read interleave groups.
3110 if (InterleaveR->getStoredValues().empty())
3111 continue;
3112
3113 // For now, we only support full interleave groups storing load interleave
3114 // groups.
3115 if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) {
3116 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
3117 if (!DefR)
3118 return false;
3119 auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR);
3120 return IR &&
3121 IR->getInterleaveGroup()->getFactor() ==
3122 IR->getInterleaveGroup()->getNumMembers() &&
3123 IR->getVPValue(Op.index()) == Op.value();
3124 })) {
3125 StoreGroups.push_back(Elt: InterleaveR);
3126 continue;
3127 }
3128
3129     // Check if all values feeding InterleaveR are matching wide recipes whose
3130     // operands can be narrowed.
3131 auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
3132 Val: InterleaveR->getStoredValues()[0]->getDefiningRecipe());
3133 if (!WideMember0)
3134 return;
3135 for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) {
3136 auto *R = dyn_cast<VPWidenRecipe>(Val: V->getDefiningRecipe());
3137 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
3138 R->getNumOperands() > 2)
3139 return;
3140 if (any_of(Range: R->operands(), P: [WideMember0, Idx = I, R](VPValue *V) {
3141 return !canNarrowLoad(WideMember0, WideMember: R, V, Idx);
3142 }))
3143 return;
3144 }
3145 StoreGroups.push_back(Elt: InterleaveR);
3146 }
3147
3148 if (StoreGroups.empty())
3149 return;
3150
3151   // Narrow a load interleave group to a single wide load, or a wide load to a uniform scalar load.
3152 auto NarrowOp = [](VPRecipeBase *R) -> VPValue * {
3153 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) {
3154 // Narrow interleave group to wide load, as transformed VPlan will only
3155 // process one original iteration.
3156 auto *L = new VPWidenLoadRecipe(
3157 *cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos()),
3158 LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
3159 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
3160 L->insertBefore(InsertPos: LoadGroup);
3161 return L;
3162 }
3163
3164 auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R);
3165
3166 // Narrow wide load to uniform scalar load, as transformed VPlan will only
3167 // process one original iteration.
3168 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
3169 WideLoad->operands(), /*IsUniform*/ true,
3170 /*Mask*/ nullptr, *WideLoad);
3171 N->insertBefore(InsertPos: WideLoad);
3172 return N;
3173 };
3174
3175 // Narrow operation tree rooted at store groups.
3176 for (auto *StoreGroup : StoreGroups) {
3177 VPValue *Res = nullptr;
3178 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(
3179 Val: StoreGroup->getStoredValues()[0]->getDefiningRecipe())) {
3180 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
3181 WideMember0->setOperand(
3182 I: Idx, New: NarrowOp(WideMember0->getOperand(N: Idx)->getDefiningRecipe()));
3183 Res = WideMember0;
3184 } else {
3185 Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe());
3186 }
3187
3188 auto *S = new VPWidenStoreRecipe(
3189 *cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos()),
3190 StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
3191 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
3192 S->insertBefore(InsertPos: StoreGroup);
3193 StoreGroup->eraseFromParent();
3194 }
3195
3196 // Adjust induction to reflect that the transformed plan only processes one
3197 // original iteration.
3198 auto *CanIV = Plan.getCanonicalIV();
3199 auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue());
3200 Inc->setOperand(I: 1, New: Plan.getOrAddLiveIn(V: ConstantInt::get(
3201 Ty: CanIV->getScalarType(), V: 1 * Plan.getUF())));
3202 Plan.getVF().replaceAllUsesWith(
3203 New: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIV->getScalarType(), V: 1)));
3204 removeDeadRecipes(Plan);
3205}
3206
