//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          NewRecipe = new VPWidenCallRecipe(
              *CI, drop_end(Ingredient.operands()),
              getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
}

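/// Try to sink recipes that define scalar values used inside the conditional
/// ("then") block of a replicate region into that block, so they only execute
/// for the active lanes. Candidates with uniform-after-vectorization users
/// outside the sink target get a uniform clone left at the original position
/// for those users. Returns true if any recipe was sunk.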
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds
  // for sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isUniform())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = dyn_cast<VPRecipeBase>(U);
      if (!UI)
        return false;
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes for now.
      return NeedsDuplicating && isa<VPReplicateRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      Instruction *I = SinkCandidate->getUnderlyingInstr();
      auto *Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true);
      // TODO: add ".cloned" suffix to name of Clone's VPValue.

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
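//
// For example, given two replicate regions guarded by the same mask %M,
//
//   region1(%M) -> empty VPBB -> region2(%M)
//
// the recipes of region1's 'then' block are moved into region2's 'then'
// block, and region1 is disconnected and deleted.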
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SetVector<VPRegionBlock *> DeletedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This avoids
  // iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (DeletedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        auto *UI = dyn_cast<VPRecipeBase>(&U);
        return UI && UI->getParent() == Then2;
      });

      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Finally, remove the first region.
    for (VPBlockBase *Pred :
         make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    DeletedRegions.insert(Region1);
  }

  for (VPRegionBlock *ToDelete : DeletedRegions)
    delete ToDelete;
  return !DeletedRegions.empty();
}

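/// Build a replicate region for the predicated recipe \p PredRecipe: an
/// if-then triangle whose entry branches on the recipe's mask, whose 'then'
/// block holds an unpredicated clone of the recipe and whose exiting block
/// merges the result via a VPPredInstPHIRecipe, if the result has users:
///
///   pred.<opcode>.entry:     branch-on-mask %Mask
///   pred.<opcode>.if:        clone of \p PredRecipe without the mask operand
///   pred.<opcode>.continue:  VPPredInstPHIRecipe (only if the result is used)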
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isUniform());
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask);
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

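/// Wrap each predicated VPReplicateRecipe in \p Plan in an if-then replicate
/// region, splitting the recipe's parent block so the new region can be
/// spliced in between the two halves.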
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPBlockBase *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock);
    VPBlockUtils::connectBlocks(CurrentBlock, Region);
    VPBlockUtils::connectBlocks(Region, SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (PredVPBB && PredVPBB->getNumSuccessors() == 1)
      WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    delete VPBB;
  }
  return !WorkList.empty();
}

void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() ||
        WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

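/// Erase dead recipes from \p Plan, i.e. recipes without side effects whose
/// defined values have no users.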
static void removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

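/// Create a VPScalarIVStepsRecipe computing the scalar steps of an induction
/// with the given kind, start value and step in the header of \p Plan's
/// vector loop region, at insertion point \p IP. The base induction is
/// derived from the canonical IV (unless the canonical IV can be used
/// directly) and truncated if \p TruncI is provided; the step is truncated in
/// the preheader if its type is wider than the result type. Returns the
/// VPValue holding the scalar steps.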
static VPValue *createScalarIVSteps(VPlan &Plan,
                                    InductionDescriptor::InductionKind Kind,
                                    Instruction::BinaryOps InductionOpcode,
                                    FPMathOperator *FPBinOp,
                                    ScalarEvolution &SE, Instruction *TruncI,
                                    VPValue *StartV, VPValue *Step,
                                    VPBasicBlock::iterator IP) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = CanonicalIV;
  if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
    BaseIV = new VPDerivedIVRecipe(Kind, FPBinOp, StartV, CanonicalIV, Step);
    HeaderVPBB->insert(BaseIV, IP);
  }

  // Truncate base induction if needed.
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
                          SE.getContext());
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = new VPScalarCastRecipe(Instruction::Trunc, BaseIV, TruncTy);
    HeaderVPBB->insert(BaseIV, IP);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    Step = new VPScalarCastRecipe(Instruction::Trunc, Step, ResultTy);
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VecPreheader->appendRecipe(Step->getDefiningRecipe());
  }

  VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(
      BaseIV, Step, InductionOpcode,
      FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags());
  HeaderVPBB->insert(Steps, IP);
  return Steps;
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (note that this is different
/// from how int/fp inductions are handled). Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and updating
/// the original IV's users. This is an optional optimization to reduce the
/// need for vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
  SmallVector<VPRecipeBase *> ToRemove;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
  VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPRecipeBase *Steps =
          createScalarIVSteps(Plan, InductionDescriptor::IK_IntInduction,
                              Instruction::Add, nullptr, SE, nullptr, StartV,
                              StepV, InsertPt)
              ->getDefiningRecipe();

      auto *Recipe =
          new VPInstruction(VPInstruction::PtrAdd,
                            {PtrIV->getStartValue(), Steps->getVPSingleValue()},
                            PtrIV->getDebugLoc(), "next.gep");

      Recipe->insertAfter(Steps);
      PtrIV->replaceAllUsesWith(Recipe);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!WideIV)
      continue;
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), SE,
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        InsertPt);

    // Update scalar users of IV to use Steps instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

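/// Erase the recipe defining \p V if it is dead and recursively do the same
/// for any of its operands that become dead as a result.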
static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                         unsigned BestUF,
                                         PredicatedScalarEvolution &PSE) {
  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
  assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
  VPBasicBlock *ExitingVPBB =
      Plan.getVectorLoopRegion()->getExitingBasicBlock();
  auto *Term = &ExitingVPBB->back();
  // Try to simplify the branch condition if TC <= VF * UF when preparing to
  // execute the plan for the main vector loop. We only do this if the
  // terminator is:
  //  1. BranchOnCount, or
  //  2. BranchOnCond where the input is Not(ActiveLaneMask).
  using namespace llvm::VPlanPatternMatch;
  if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
      !match(Term,
             m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
    return;

  Type *IdxTy =
      Plan.getCanonicalIV()->getStartValue()->getLiveInIRValue()->getType();
  const SCEV *TripCount = createTripCountSCEV(IdxTy, PSE);
  ScalarEvolution &SE = *PSE.getSE();
  ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
  const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
  if (TripCount->isZero() ||
      !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
    return;

  LLVMContext &Ctx = SE.getContext();
  auto *BOC =
      new VPInstruction(VPInstruction::BranchOnCond,
                        {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))});

  SmallVector<VPValue *> PossiblyDead(Term->operands());
  Term->eraseFromParent();
  for (VPValue *Op : PossiblyDead)
    recursivelyDeleteDeadRecipes(Op);
  ExitingVPBB->appendRecipe(BOC);
  Plan.setVF(BestVF);
  Plan.setUF(BestUF);
  // TODO: Further simplifications are possible
  // 1. Replace inductions with constants.
  // 2. Replace vector loop region with VPBasicBlock.
}

#ifndef NDEBUG
static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) {
  auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
  if (Region && Region->isReplicator()) {
    assert(Region->getNumSuccessors() == 1 &&
           Region->getNumPredecessors() == 1 && "Expected SESE region!");
    assert(R->getParent()->size() == 1 &&
           "A recipe in an original replicator region must be the only "
           "recipe in its block");
    return Region;
  }
  return nullptr;
}
#endif

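/// Returns true if recipe \p A properly dominates recipe \p B. If both
/// recipes are in the same block, their order within the block decides;
/// otherwise the VPlan dominator tree \p VPDT is queried on the parent
/// blocks.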
static bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B,
                              VPDominatorTree &VPDT) {
  if (A == B)
    return false;

  auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) {
    for (auto &R : *A->getParent()) {
      if (&R == A)
        return true;
      if (&R == B)
        return false;
    }
    llvm_unreachable("recipe not found");
  };
  const VPBlockBase *ParentA = A->getParent();
  const VPBlockBase *ParentB = B->getParent();
  if (ParentA == ParentB)
    return LocalComesBefore(A, B);

  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(A)) &&
         "No replicate regions expected at this point");
  assert(!GetReplicateRegion(const_cast<VPRecipeBase *>(B)) &&
         "No replicate regions expected at this point");
  return VPDT.properlyDominates(ParentA, ParentB);
}

/// Sink users of \p FOR after the recipe defining the previous value \p
/// Previous of the recurrence. \returns true if all users of \p FOR could be
/// re-arranged as needed or false if it is not possible.
static bool
sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
                                 VPRecipeBase *Previous,
                                 VPDominatorTree &VPDT) {
  // Collect recipes that need sinking.
  SmallVector<VPRecipeBase *> WorkList;
  SmallPtrSet<VPRecipeBase *, 8> Seen;
  Seen.insert(Previous);
  auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
    // The previous value must not depend on the users of the recurrence phi.
    // In that case, FOR is not a fixed order recurrence.
    if (SinkCandidate == Previous)
      return false;

    if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
        !Seen.insert(SinkCandidate).second ||
        properlyDominates(Previous, SinkCandidate, VPDT))
      return true;

    if (SinkCandidate->mayHaveSideEffects())
      return false;

    WorkList.push_back(SinkCandidate);
    return true;
  };

  // Recursively sink users of FOR after Previous.
  WorkList.push_back(FOR);
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPRecipeBase *Current = WorkList[I];
    assert(Current->getNumDefinedValues() == 1 &&
           "only recipes with a single defined value expected");

    for (VPUser *User : Current->getVPSingleValue()->users()) {
      if (auto *R = dyn_cast<VPRecipeBase>(User))
        if (!TryToPushSinkCandidate(R))
          return false;
    }
  }

  // Keep recipes to sink ordered by dominance so earlier instructions are
  // processed first.
  sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
    return properlyDominates(A, B, VPDT);
  });

  for (VPRecipeBase *SinkCandidate : WorkList) {
    if (SinkCandidate == FOR)
      continue;

    SinkCandidate->moveAfter(Previous);
    Previous = SinkCandidate;
  }
  return true;
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
                                                  VPBuilder &Builder) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);

  SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis())
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      RecurrencePhis.push_back(FOR);

  for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
    SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
    VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
    // Fixed-order recurrences do not contain cycles, so this loop is
    // guaranteed to terminate.
    while (auto *PrevPhi =
               dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Previous)) {
      assert(PrevPhi->getParent() == FOR->getParent());
      assert(SeenPhis.insert(PrevPhi).second);
      Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
    }

    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT))
      return false;

    // Introduce a recipe to combine the incoming and previous values of a
    // fixed-order recurrence.
    VPBasicBlock *InsertBlock = Previous->getParent();
    if (isa<VPHeaderPHIRecipe>(Previous))
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {FOR, FOR->getBackedgeValue()}));

    FOR->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to FOR again, after replacing
    // all users.
    RecurSplice->setOperand(0, FOR);
  }
  return true;
}

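/// Collect the transitive users of \p V, following the values defined by each
/// user recipe but not traversing through header phis.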
static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = dyn_cast<VPRecipeBase>(Users[I]);
    if (!Cur || isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert(V->user_begin(), V->user_end());
  }
  return Users.takeVector();
}

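/// Drop nuw/nsw flags from recipes reachable from integer Add and Mul
/// reduction phis in \p Plan; the reduction is reassociated when vectorized,
/// so intermediate values may wrap even when the scalar reduction does not.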
void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
  for (VPRecipeBase &R :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
    if (!PhiR)
      continue;
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if (RK != RecurKind::Add && RK != RecurKind::Mul)
      continue;

    for (VPUser *U : collectUsersRecursively(PhiR))
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      }
  }
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;
  // Try to remove redundant blend recipes.
  if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
    VPValue *Inc0 = Blend->getIncomingValue(0);
    for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
      if (Inc0 != Blend->getIncomingValue(I) &&
          !match(Blend->getMask(I), m_False()))
        return;
    Blend->replaceAllUsesWith(Inc0);
    Blend->eraseFromParent();
    return;
  }

  VPValue *A;
  if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    VPValue *Trunc = R.getVPSingleValue();
    Type *TruncTy = TypeInfo.inferScalarType(Trunc);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Trunc->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(&R))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Trunc->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(
        R.getParent()->getPlan()->getCanonicalIV()->getScalarType(),
        TypeInfo.getContext());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = dyn_cast<VPRecipeBase>(U);
      if (!R)
        continue;
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
                            m_Mul(m_SpecificInt(1), m_VPValue(A)))))
    return R.getVPSingleValue()->replaceAllUsesWith(A);
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan, LLVMContext &Ctx) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

void VPlanTransforms::truncateToMinimalBitwidths(
    VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs,
    LLVMContext &Ctx) {
#ifndef NDEBUG
  // Count the processed recipes and cross check the count later with MinBWs
  // size, to make sure all entries in MinBWs have been handled.
  unsigned NumProcessedRecipes = 0;
#endif
  // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make other
  // uses have different types for their operands, making them invalidly
  // typed.
  DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs;
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(), Ctx);
  VPBasicBlock *PH = Plan.getEntry();
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
               VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
        continue;

      VPValue *ResultVPV = R.getVPSingleValue();
      auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
      unsigned NewResSizeInBits = MinBWs.lookup(UI);
      if (!NewResSizeInBits)
        continue;

#ifndef NDEBUG
      NumProcessedRecipes++;
#endif
      // If the value wasn't vectorized, we must maintain the original scalar
      // type. Skip those here, after incrementing NumProcessedRecipes. Also
      // skip casts which do not need to be handled explicitly here, as
      // redundant casts will be removed during recipe simplification.
      if (isa<VPReplicateRecipe, VPWidenCastRecipe>(&R)) {
#ifndef NDEBUG
        // If any of the operands is a live-in and not used by VPWidenRecipe or
        // VPWidenSelectRecipe, but in MinBWs, make sure it is counted as
        // processed as well. When MinBWs is constructed, there is currently no
        // information about whether recipes are widened or replicated, and in
        // case they are replicated the operands are not truncated. Counting
        // them here ensures we do not miss any recipes in MinBWs.
        // TODO: Remove once the analysis is done on VPlan.
        for (VPValue *Op : R.operands()) {
          if (!Op->isLiveIn())
            continue;
          auto *UV = dyn_cast_or_null<Instruction>(Op->getUnderlyingValue());
          if (UV && MinBWs.contains(UV) && !ProcessedTruncs.contains(Op) &&
              all_of(Op->users(), [](VPUser *U) {
                return !isa<VPWidenRecipe, VPWidenSelectRecipe>(U);
              })) {
            // Add an entry to ProcessedTruncs to avoid counting the same
            // operand multiple times.
            ProcessedTruncs[Op] = nullptr;
            NumProcessedRecipes += 1;
          }
        }
#endif
        continue;
      }

      Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
      unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
      assert(OldResTy->isIntegerTy() && "only integer types supported");
      (void)OldResSizeInBits;

      auto *NewResTy = IntegerType::get(Ctx, NewResSizeInBits);

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to VPW.
      if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
        VPW->dropPoisonGeneratingFlags();

      using namespace llvm::VPlanPatternMatch;
      if (OldResSizeInBits != NewResSizeInBits &&
          !match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) {
        // Extend result to original width.
        auto *Ext =
            new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
        Ext->insertAfter(&R);
        ResultVPV->replaceAllUsesWith(Ext);
        Ext->setOperand(0, ResultVPV);
        assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
      } else
        assert(
            match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) &&
            "Only ICmps should not need extending the result.");

      assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
      if (isa<VPWidenLoadRecipe>(&R))
        continue;

      // Shrink operands by introducing truncates as needed.
      unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
      for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
        auto *Op = R.getOperand(Idx);
        unsigned OpSizeInBits =
            TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
        if (OpSizeInBits == NewResSizeInBits)
          continue;
        assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
        auto [ProcessedIter, IterIsEmpty] =
            ProcessedTruncs.insert({Op, nullptr});
        VPWidenCastRecipe *NewOp =
            IterIsEmpty
                ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
                : ProcessedIter->second;
        R.setOperand(Idx, NewOp);
        if (!IterIsEmpty)
          continue;
        ProcessedIter->second = NewOp;
        if (!Op->isLiveIn()) {
          NewOp->insertBefore(&R);
        } else {
          PH->appendRecipe(NewOp);
#ifndef NDEBUG
          auto *OpInst = dyn_cast<Instruction>(Op->getLiveInIRValue());
          bool IsContained = MinBWs.contains(OpInst);
          NumProcessedRecipes += IsContained;
#endif
        }
      }
    }
  }

  assert(MinBWs.size() == NumProcessedRecipes &&
         "some entries in MinBWs haven't been processed");
}

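/// Run the sequence of VPlan-to-VPlan optimizations below: redundancy
/// elimination for canonical IVs, induction casts and SCEV expansions, recipe
/// simplification, induction legalization, dead recipe removal, replicate
/// region formation and block merging.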
void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  simplifyRecipes(Plan, SE.getContext());
  legalizeAndOptimizeInductions(Plan, SE);
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);

  removeRedundantExpandSCEVRecipes(Plan);
  mergeBlocksIntoPredecessors(Plan);
}

// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
// the loop terminator with a branch-on-cond recipe with the negated
// active-lane-mask as operand. Note that this turns the loop into an
// uncountable one. Only the existing terminator is replaced, all other
// existing recipes/users remain unchanged, except for poison-generating flags
// being dropped from the canonical IV increment. Return the created
// VPActiveLaneMaskPHIRecipe.
//
// The function uses the following definitions:
//
//  %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
//    calculate-trip-count-minus-VF (original TC) : original TC
//  %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
//     CanonicalIVPhi : CanonicalIVIncrement
//  %StartV is the canonical induction start value.
//
// The function adds the following recipes:
//
// vector.ph:
//   %TripCount = calculate-trip-count-minus-VF (original TC)
//       [if DataWithControlFlowWithoutRuntimeCheck]
//   %EntryInc = canonical-iv-increment-for-part %StartV
//   %EntryALM = active-lane-mask %EntryInc, %TripCount
//
// vector.body:
//   ...
//   %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
//   ...
//   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
//   %ALM = active-lane-mask %InLoopInc, TripCount
//   %Negated = Not %ALM
//   branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  auto *CanonicalIVPHI = Plan.getCanonicalIV();
  VPValue *StartV = CanonicalIVPHI->getStartValue();

  auto *CanonicalIVIncrement =
      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
  // TODO: Check if dropping the flags is needed if
  // !DataAndControlFlowWithoutRuntimeCheck.
  CanonicalIVIncrement->dropPoisonGeneratingFlags();
  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
  // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
  // we have to take unrolling into account. Each part needs to start at
  //   Part * VF
  auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
  VPBuilder Builder(VecPreheader);

  // Create the ActiveLaneMask instruction using the correct start values.
  VPValue *TC = Plan.getTripCount();

  VPValue *TripCount, *IncrementValue;
  if (!DataAndControlFlowWithoutRuntimeCheck) {
    // When the loop is guarded by a runtime overflow check for the loop
    // induction variable increment by VF, we can increment the value before
    // the get.active.lane.mask and use the unmodified tripcount.
    IncrementValue = CanonicalIVIncrement;
    TripCount = TC;
  } else {
    // When avoiding a runtime check, the active.lane.mask inside the loop
    // uses a modified trip count and the induction variable increment is
    // done after the active.lane.mask intrinsic is called.
    IncrementValue = CanonicalIVPHI;
    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
                                     {TC}, DL);
  }
  auto *EntryIncrement = Builder.createOverflowingOp(
      VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
      "index.part.next");

  // Create the active lane mask instruction in the VPlan preheader.
  auto *EntryALM =
      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
                           DL, "active.lane.mask.entry");

  // Now create the ActiveLaneMaskPhi recipe in the main loop using the
  // preheader ActiveLaneMask instruction.
  auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
  LaneMaskPhi->insertAfter(CanonicalIVPHI);

  // Create the active lane mask for the next iteration of the loop before the
  // original terminator.
  VPRecipeBase *OriginalTerminator = EB->getTerminator();
  Builder.setInsertPoint(OriginalTerminator);
  auto *InLoopIncrement =
      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                  {IncrementValue}, {false, false}, DL);
  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                   {InLoopIncrement, TripCount}, DL,
                                   "active.lane.mask.next");
  LaneMaskPhi->addOperand(ALM);

  // Replace the original terminator with BranchOnCond. We have to invert the
  // mask here because a true condition means jumping to the exit block.
  auto *NotMask = Builder.createNot(ALM, DL);
  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
  OriginalTerminator->eraseFromParent();
  return LaneMaskPhi;
}

1210 | /// Collect all VPValues representing a header mask through the (ICMP_ULE, |
1211 | /// WideCanonicalIV, backedge-taken-count) pattern. |
1212 | /// TODO: Introduce explicit recipe for header-mask instead of searching |
1213 | /// for the header-mask pattern manually. |
1214 | static SmallVector<VPValue *> (VPlan &Plan) { |
1215 | SmallVector<VPValue *> WideCanonicalIVs; |
1216 | auto *FoundWidenCanonicalIVUser = |
1217 | find_if(Range: Plan.getCanonicalIV()->users(), |
1218 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
1219 | assert(count_if(Plan.getCanonicalIV()->users(), |
1220 | [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= |
1221 | 1 && |
1222 | "Must have at most one VPWideCanonicalIVRecipe" ); |
1223 | if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { |
1224 | auto *WideCanonicalIV = |
1225 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
1226 | WideCanonicalIVs.push_back(Elt: WideCanonicalIV); |
1227 | } |
1228 | |
1229 | // Also include VPWidenIntOrFpInductionRecipes that represent a widened |
1230 | // version of the canonical induction. |
1231 | VPBasicBlock * = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
1232 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
1233 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
1234 | if (WidenOriginalIV && WidenOriginalIV->isCanonical()) |
1235 | WideCanonicalIVs.push_back(Elt: WidenOriginalIV); |
1236 | } |
1237 | |
1238 | // Walk users of wide canonical IVs and collect to all compares of the form |
1239 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). |
1240 | SmallVector<VPValue *> ; |
1241 | VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); |
1242 | for (auto *Wide : WideCanonicalIVs) { |
1243 | for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { |
1244 | auto * = dyn_cast<VPInstruction>(Val: U); |
1245 | if (!HeaderMask || HeaderMask->getOpcode() != Instruction::ICmp || |
1246 | HeaderMask->getPredicate() != CmpInst::ICMP_ULE || |
1247 | HeaderMask->getOperand(N: 1) != BTC) |
1248 | continue; |
1249 | |
1250 | assert(HeaderMask->getOperand(0) == Wide && |
1251 | "WidenCanonicalIV must be the first operand of the compare" ); |
1252 | HeaderMasks.push_back(Elt: HeaderMask); |
1253 | } |
1254 | } |
1255 | return HeaderMasks; |
1256 | } |
1257 | |
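/// Replace all header-mask compares in \p Plan with an active-lane-mask
/// computed from the widened canonical IV and the trip count. If
/// \p UseActiveLaneMaskForControlFlow is true, the mask also drives the exit
/// branch of the vector loop (see addVPLaneMaskPhiAndUpdateExitBranch).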
1258 | void VPlanTransforms::addActiveLaneMask( |
1259 | VPlan &Plan, bool UseActiveLaneMaskForControlFlow, |
1260 | bool DataAndControlFlowWithoutRuntimeCheck) { |
1261 | assert((!DataAndControlFlowWithoutRuntimeCheck || |
1262 | UseActiveLaneMaskForControlFlow) && |
1263 | "DataAndControlFlowWithoutRuntimeCheck implies " |
1264 | "UseActiveLaneMaskForControlFlow" ); |
1265 | |
1266 | auto FoundWidenCanonicalIVUser = |
1267 | find_if(Range: Plan.getCanonicalIV()->users(), |
1268 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
  assert(FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end() &&
         "Must have widened canonical IV when tail folding!" );
1271 | auto *WideCanonicalIV = |
1272 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
1273 | VPSingleDefRecipe *LaneMask; |
1274 | if (UseActiveLaneMaskForControlFlow) { |
1275 | LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( |
1276 | Plan, DataAndControlFlowWithoutRuntimeCheck); |
1277 | } else { |
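    // The mask is only used to predicate data recipes, so a single
    // active.lane.mask right after the wide canonical IV is sufficient; the
    // loop's exit condition is left unchanged.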
1278 | VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV); |
1279 | LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
1280 | Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr, |
1281 | Name: "active.lane.mask" ); |
1282 | } |
1283 | |
1284 | // Walk users of WideCanonicalIV and replace all compares of the form |
1285 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an |
1286 | // active-lane-mask. |
  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan))
1288 | HeaderMask->replaceAllUsesWith(New: LaneMask); |
1289 | } |
1290 | |
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and replace all
/// uses of VPCanonicalIVPHIRecipe, except for the canonical IV increment,
/// with the VPEVLBasedIVPHIRecipe. After this transformation, the
/// VPCanonicalIVPHIRecipe is used only for counting loop iterations.
1295 | /// |
1296 | /// The function uses the following definitions: |
1297 | /// %StartV is the canonical induction start value. |
1298 | /// |
1299 | /// The function adds the following recipes: |
1300 | /// |
1301 | /// vector.ph: |
1302 | /// ... |
1303 | /// |
1304 | /// vector.body: |
1305 | /// ... |
1306 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
1307 | /// [ %NextEVLIV, %vector.body ] |
1308 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC |
1309 | /// ... |
///   %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi
1311 | /// ... |
1312 | /// |
1313 | void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { |
  VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1315 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
1316 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
1317 | |
1318 | // Create the ExplicitVectorLengthPhi recipe in the main loop. |
1319 | auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); |
1320 | EVLPhi->insertAfter(InsertPos: CanonicalIVPHI); |
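  // ExplicitVectorLength computes the number of lanes to process in this
  // iteration from the remaining trip count; on EVL targets such as RISC-V it
  // is expected to lower to the llvm.experimental.get.vector.length
  // intrinsic, which returns an i32.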
1321 | auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, |
1322 | {EVLPhi, Plan.getTripCount()}); |
1323 | VPEVL->insertBefore(BB&: *Header, IP: Header->getFirstNonPhi()); |
1324 | |
1325 | auto *CanonicalIVIncrement = |
1326 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
1327 | VPSingleDefRecipe *OpVPEVL = VPEVL; |
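  // The EVL is an i32; widen or truncate it to the canonical IV's type when
  // the widths differ, so the IV update below type-checks.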
1328 | if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); |
1329 | IVSize != 32) { |
1330 | OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc |
1331 | : Instruction::ZExt, |
1332 | OpVPEVL, CanonicalIVPHI->getScalarType()); |
1333 | OpVPEVL->insertBefore(InsertPos: CanonicalIVIncrement); |
1334 | } |
1335 | auto *NextEVLIV = |
1336 | new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, |
1337 | {CanonicalIVIncrement->hasNoUnsignedWrap(), |
1338 | CanonicalIVIncrement->hasNoSignedWrap()}, |
1339 | CanonicalIVIncrement->getDebugLoc(), "index.evl.next" ); |
1340 | NextEVLIV->insertBefore(InsertPos: CanonicalIVIncrement); |
1341 | EVLPhi->addOperand(Operand: NextEVLIV); |
1342 | |
  for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1344 | for (VPUser *U : collectUsersRecursively(V: HeaderMask)) { |
1345 | auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U); |
1346 | if (!MemR) |
1347 | continue; |
1348 | assert(!MemR->isReverse() && |
1349 | "Reversed memory operations not supported yet." ); |
1350 | VPValue *OrigMask = MemR->getMask(); |
1351 | assert(OrigMask && "Unmasked widen memory recipe when folding tail" ); |
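      // The EVL already limits processing to the remaining trip count, so the
      // header mask is redundant; keep only any additional mask.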
1352 | VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask; |
1353 | if (auto *L = dyn_cast<VPWidenLoadRecipe>(Val: MemR)) { |
1354 | auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); |
1355 | N->insertBefore(InsertPos: L); |
1356 | L->replaceAllUsesWith(New: N); |
1357 | L->eraseFromParent(); |
1358 | } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(Val: MemR)) { |
1359 | auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); |
1360 | N->insertBefore(InsertPos: S); |
1361 | S->eraseFromParent(); |
1362 | } else { |
1363 | llvm_unreachable("unsupported recipe" ); |
1364 | } |
1365 | } |
1366 | recursivelyDeleteDeadRecipes(V: HeaderMask); |
1367 | } |
1368 | // Replace all uses of VPCanonicalIVPHIRecipe by |
1369 | // VPEVLBasedIVPHIRecipe except for the canonical IV increment. |
1370 | CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi); |
1371 | CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI); |
1372 | // TODO: support unroll factor > 1. |
1373 | Plan.setUF(1); |
1374 | } |
1375 | |
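/// Drop poison-generating flags (nuw/nsw, exact, inbounds, ...) from recipes
/// that contribute to the address computation of a consecutive, predicated
/// widened memory access: such flags could otherwise introduce poison for
/// lanes that the original scalar loop would never have executed.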
1376 | void VPlanTransforms::dropPoisonGeneratingRecipes( |
1377 | VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication) { |
1378 | // Collect recipes in the backward slice of `Root` that may generate a poison |
1379 | // value that is used after vectorization. |
1380 | SmallPtrSet<VPRecipeBase *, 16> Visited; |
1381 | auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
1382 | SmallVector<VPRecipeBase *, 16> Worklist; |
1383 | Worklist.push_back(Elt: Root); |
1384 | |
1385 | // Traverse the backward slice of Root through its use-def chain. |
1386 | while (!Worklist.empty()) { |
1387 | VPRecipeBase *CurRec = Worklist.back(); |
1388 | Worklist.pop_back(); |
1389 | |
1390 | if (!Visited.insert(Ptr: CurRec).second) |
1391 | continue; |
1392 | |
1393 | // Prune search if we find another recipe generating a widen memory |
1394 | // instruction. Widen memory instructions involved in address computation |
1395 | // will lead to gather/scatter instructions, which don't need to be |
1396 | // handled. |
1397 | if (isa<VPWidenMemoryRecipe>(Val: CurRec) || isa<VPInterleaveRecipe>(Val: CurRec) || |
1398 | isa<VPScalarIVStepsRecipe>(Val: CurRec) || isa<VPHeaderPHIRecipe>(Val: CurRec)) |
1399 | continue; |
1400 | |
1401 | // This recipe contributes to the address computation of a widen |
1402 | // load/store. If the underlying instruction has poison-generating flags, |
1403 | // drop them directly. |
1404 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) { |
1405 | VPValue *A, *B; |
1406 | using namespace llvm::VPlanPatternMatch; |
1407 | // Dropping disjoint from an OR may yield incorrect results, as some |
1408 | // analysis may have converted it to an Add implicitly (e.g. SCEV used |
1409 | // for dependence analysis). Instead, replace it with an equivalent Add. |
1410 | // This is possible as all users of the disjoint OR only access lanes |
1411 | // where the operands are disjoint or poison otherwise. |
1412 | if (match(V: RecWithFlags, P: m_Or(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) && |
1413 | RecWithFlags->isDisjoint()) { |
1414 | VPBuilder Builder(RecWithFlags); |
1415 | VPInstruction *New = Builder.createOverflowingOp( |
1416 | Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false}, |
1417 | DL: RecWithFlags->getDebugLoc()); |
1418 | RecWithFlags->replaceAllUsesWith(New); |
1419 | RecWithFlags->eraseFromParent(); |
1420 | CurRec = New; |
1421 | } else |
1422 | RecWithFlags->dropPoisonGeneratingFlags(); |
1423 | } else { |
1424 | Instruction *Instr = dyn_cast_or_null<Instruction>( |
1425 | Val: CurRec->getVPSingleValue()->getUnderlyingValue()); |
1426 | (void)Instr; |
1427 | assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && |
1428 | "found instruction with poison generating flags not covered by " |
1429 | "VPRecipeWithIRFlags" ); |
1430 | } |
1431 | |
1432 | // Add new definitions to the worklist. |
      for (VPValue *Operand : CurRec->operands())
        if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
1435 | Worklist.push_back(Elt: OpDef); |
1436 | } |
1437 | }); |
1438 | |
1439 | // Traverse all the recipes in the VPlan and collect the poison-generating |
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryRecipe or VPInterleaveRecipe.
1442 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
1443 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
1444 | for (VPRecipeBase &Recipe : *VPBB) { |
1445 | if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) { |
1446 | Instruction &UnderlyingInstr = WidenRec->getIngredient(); |
1447 | VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); |
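      // Only consecutive accesses in predicated blocks matter here: their
      // address is computed from potentially poison-generating recipes,
      // whereas non-consecutive accesses become gathers/scatters whose
      // inactive lanes are masked off.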
1448 | if (AddrDef && WidenRec->isConsecutive() && |
1449 | BlockNeedsPredication(UnderlyingInstr.getParent())) |
1450 | collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
1451 | } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) { |
1452 | VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); |
1453 | if (AddrDef) { |
1454 | // Check if any member of the interleave group needs predication. |
1455 | const InterleaveGroup<Instruction> *InterGroup = |
1456 | InterleaveRec->getInterleaveGroup(); |
1457 | bool NeedPredication = false; |
1458 | for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
1459 | I < NumMembers; ++I) { |
1460 | Instruction *Member = InterGroup->getMember(Index: I); |
1461 | if (Member) |
1462 | NeedPredication |= BlockNeedsPredication(Member->getParent()); |
1463 | } |
1464 | |
1465 | if (NeedPredication) |
1466 | collectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
1467 | } |
1468 | } |
1469 | } |
1470 | } |
1471 | } |
1472 | |