//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements a set of utility VPlan to VPlan transformations.
///
//===----------------------------------------------------------------------===//

#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "VPlanVerifier.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

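// Try to replace the VPInstructions wrapping the original loop's ingredients
// with widened recipes (loads, stores, GEPs, calls, selects, casts, inductions
// and generic widened ops), based on each ingredient's underlying IR
// instruction. Returns false if a call cannot be widened as an intrinsic.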
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
    VPlanPtr &Plan,
    function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
    ScalarEvolution &SE, const TargetLibraryInfo &TLI) {

  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan->getVectorLoopRegion());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    // Skip blocks outside the region.
    if (!VPBB->getParent())
      break;
    VPRecipeBase *Term = VPBB->getTerminator();
    auto EndIter = Term ? Term->getIterator() : VPBB->end();
    // Introduce each ingredient into VPlan.
    for (VPRecipeBase &Ingredient :
         make_early_inc_range(make_range(VPBB->begin(), EndIter))) {

      VPValue *VPV = Ingredient.getVPSingleValue();
      if (!VPV->getUnderlyingValue())
        continue;

      Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());

      VPRecipeBase *NewRecipe = nullptr;
      if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) {
        auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
        const auto *II = GetIntOrFpInductionDescriptor(Phi);
        if (!II)
          continue;

        VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
        VPValue *Step =
            vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
        NewRecipe = new VPWidenIntOrFpInductionRecipe(
            Phi, Start, Step, &Plan->getVF(), *II, Ingredient.getDebugLoc());
      } else {
        assert(isa<VPInstruction>(&Ingredient) &&
               "only VPInstructions expected here");
        assert(!isa<PHINode>(Inst) && "phis should be handled above");
        // Create VPWidenMemoryRecipe for loads and stores.
        if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
          NewRecipe = new VPWidenLoadRecipe(
              *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
              false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load),
              Ingredient.getDebugLoc());
        } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
          NewRecipe = new VPWidenStoreRecipe(
              *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
              nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/,
              VPIRMetadata(*Store), Ingredient.getDebugLoc());
        } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
          NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
          Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
          if (VectorID == Intrinsic::not_intrinsic)
            return false;
          NewRecipe = new VPWidenIntrinsicRecipe(
              *CI, getVectorIntrinsicIDForCall(CI, &TLI),
              {Ingredient.op_begin(), Ingredient.op_end() - 1}, CI->getType(),
              CI->getDebugLoc());
        } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
          NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
        } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
          NewRecipe = new VPWidenCastRecipe(
              CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI);
        } else {
          NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands());
        }
      }

      NewRecipe->insertBefore(&Ingredient);
      if (NewRecipe->getNumDefinedValues() == 1)
        VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
      else
        assert(NewRecipe->getNumDefinedValues() == 0 &&
               "Only recipes with zero or one defined values expected");
      Ingredient.eraseFromParent();
    }
  }
  return true;
}

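/// Sink recipes feeding recipes inside replicate regions into the region's
/// conditional ('then') block, so they only execute when the region's mask is
/// true. Candidates with users outside the target block are duplicated if
/// those users only demand the first lane. Returns true if anything was moved.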
static bool sinkScalarOperands(VPlan &Plan) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  bool Changed = false;
  // First, collect the operands of all recipes in replicate blocks as seeds for
  // sinking.
  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
  for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
    VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
    if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
      continue;
    VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(EntryVPBB->getSuccessors()[0]);
    if (!VPBB || VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
      continue;
    for (auto &Recipe : *VPBB) {
      for (VPValue *Op : Recipe.operands())
        if (auto *Def =
                dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
          WorkList.insert(std::make_pair(VPBB, Def));
    }
  }

  bool ScalarVFOnly = Plan.hasScalarVFOnly();
  // Try to sink each replicate or scalar IV steps recipe in the worklist.
  for (unsigned I = 0; I != WorkList.size(); ++I) {
    VPBasicBlock *SinkTo;
    VPSingleDefRecipe *SinkCandidate;
    std::tie(SinkTo, SinkCandidate) = WorkList[I];
    if (SinkCandidate->getParent() == SinkTo ||
        SinkCandidate->mayHaveSideEffects() ||
        SinkCandidate->mayReadOrWriteMemory())
      continue;
    if (auto *RepR = dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
      if (!ScalarVFOnly && RepR->isSingleScalar())
        continue;
    } else if (!isa<VPScalarIVStepsRecipe>(SinkCandidate))
      continue;

    bool NeedsDuplicating = false;
    // All recipe users of the sink candidate must be in the same block SinkTo,
    // or all users outside of SinkTo must be uniform-after-vectorization
    // (i.e., only the first lane is used). In the latter case, we need to
    // duplicate SinkCandidate.
    auto CanSinkWithUser = [SinkTo, &NeedsDuplicating,
                            SinkCandidate](VPUser *U) {
      auto *UI = cast<VPRecipeBase>(U);
      if (UI->getParent() == SinkTo)
        return true;
      NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate);
      // We only know how to duplicate VPReplicateRecipes and
      // VPScalarIVStepsRecipes for now.
      return NeedsDuplicating &&
             isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(SinkCandidate);
    };
    if (!all_of(SinkCandidate->users(), CanSinkWithUser))
      continue;

    if (NeedsDuplicating) {
      if (ScalarVFOnly)
        continue;
      VPSingleDefRecipe *Clone;
      if (auto *SinkCandidateRepR =
              dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
        // TODO: Handle converting to uniform recipes as separate transform,
        // then cloning should be sufficient here.
        Instruction *I = SinkCandidate->getUnderlyingInstr();
        Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
                                      nullptr /*Mask*/, *SinkCandidateRepR);
        // TODO: add ".cloned" suffix to name of Clone's VPValue.
      } else {
        Clone = SinkCandidate->clone();
      }

      Clone->insertBefore(SinkCandidate);
      SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
      });
    }
    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
    for (VPValue *Op : SinkCandidate->operands())
      if (auto *Def =
              dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()))
        WorkList.insert(std::make_pair(SinkTo, Def));
    Changed = true;
  }
  return Changed;
}

/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
/// the mask.
static VPValue *getPredicatedMask(VPRegionBlock *R) {
  auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
  if (!EntryBB || EntryBB->size() != 1 ||
      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
    return nullptr;

  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
}

/// If \p R is a triangle region, return the 'then' block of the triangle.
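/// A triangle region has the shape
///
///      Entry
///      /   \
///   Then    |
///      \   /
///    Exiting
///
/// where the entry block has two successors and exactly one of them ('Then')
/// branches directly to the other.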
static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) {
  auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
  if (EntryBB->getNumSuccessors() != 2)
    return nullptr;

  auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
  auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
  if (!Succ0 || !Succ1)
    return nullptr;

  if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
    return nullptr;
  if (Succ0->getSingleSuccessor() == Succ1)
    return Succ0;
  if (Succ1->getSingleSuccessor() == Succ0)
    return Succ1;
  return nullptr;
}

// Merge replicate regions into their successor region, if a replicate region
// is connected to a successor replicate region with the same predicate by a
// single, empty VPBasicBlock.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) {
  SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;

  // Collect replicate regions followed by an empty block, followed by another
  // replicate region with matching masks, to process up front. This is to
  // avoid iterator invalidation issues while merging regions.
  SmallVector<VPRegionBlock *, 8> WorkList;
  for (VPRegionBlock *Region1 : VPBlockUtils::blocksOnly<VPRegionBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    if (!Region1->isReplicator())
      continue;
    auto *MiddleBasicBlock =
        dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
    if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
      continue;

    auto *Region2 =
        dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
    if (!Region2 || !Region2->isReplicator())
      continue;

    VPValue *Mask1 = getPredicatedMask(Region1);
    VPValue *Mask2 = getPredicatedMask(Region2);
    if (!Mask1 || Mask1 != Mask2)
      continue;

    assert(Mask1 && Mask2 && "both regions must have conditions");
    WorkList.push_back(Region1);
  }

  // Move recipes from Region1 to its successor region, if both are triangles.
  for (VPRegionBlock *Region1 : WorkList) {
    if (TransformedRegions.contains(Region1))
      continue;
    auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
    auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());

    VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
    VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
    if (!Then1 || !Then2)
      continue;

    // Note: No fusion-preventing memory dependencies are expected in either
    // region. Such dependencies should be rejected during earlier dependence
    // checks, which guarantee accesses can be re-ordered for vectorization.
    //
    // Move recipes to the successor region.
    for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
      ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());

    auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
    auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());

    // Move VPPredInstPHIRecipes from the merge block to the successor region's
    // merge block. Update all users inside the successor region to use the
    // original values.
    for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
      VPValue *PredInst1 =
          cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
      VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
      Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
        return cast<VPRecipeBase>(&U)->getParent() == Then2;
      });

      // Remove phi recipes that are unused after merging the regions.
      if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
        Phi1ToMove.eraseFromParent();
        continue;
      }
      Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
    }

    // Remove the dead recipes in Region1's entry block.
    for (VPRecipeBase &R :
         make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
      R.eraseFromParent();

    // Finally, remove the first region.
    for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
      VPBlockUtils::disconnectBlocks(Pred, Region1);
      VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
    }
    VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
    TransformedRegions.insert(Region1);
  }

  return !TransformedRegions.empty();
}

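/// Wrap the predicated replicate recipe \p PredRecipe in a triangle if-then
/// replicate region: a BranchOnMask entry block, an '.if' block holding an
/// unmasked clone of the recipe, and a '.continue' block with a
/// VPPredInstPHIRecipe merging the result if \p PredRecipe has users. The
/// original recipe is erased and the new region is returned.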
static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
                                            VPlan &Plan) {
  Instruction *Instr = PredRecipe->getUnderlyingInstr();
  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BlockInMask = PredRecipe->getMask();
  auto *MaskDef = BlockInMask->getDefiningRecipe();
  auto *BOMRecipe = new VPBranchOnMaskRecipe(
      BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc());
  auto *Entry =
      Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);

  // Replace predicated replicate recipe with a replicate recipe without a
  // mask but in the replicate region.
  auto *RecipeWithoutMask = new VPReplicateRecipe(
      PredRecipe->getUnderlyingInstr(),
      make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())),
      PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe);
  auto *Pred =
      Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);

  VPPredInstPHIRecipe *PHIRecipe = nullptr;
  if (PredRecipe->getNumUsers() != 0) {
    PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
                                        RecipeWithoutMask->getDebugLoc());
    PredRecipe->replaceAllUsesWith(PHIRecipe);
    PHIRecipe->setOperand(0, RecipeWithoutMask);
  }
  PredRecipe->eraseFromParent();
  auto *Exiting =
      Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  VPRegionBlock *Region =
      Plan.createVPRegionBlock(Entry, Exiting, RegionName, /*IsReplicator=*/true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
  VPBlockUtils::connectBlocks(Pred, Exiting);

  return Region;
}

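/// Find all predicated VPReplicateRecipes in \p Plan and wrap each of them in
/// an if-then replicate region, splitting the containing block at the recipe
/// and inserting the new region on the resulting edge.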
static void addReplicateRegions(VPlan &Plan) {
  SmallVector<VPReplicateRecipe *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    for (VPRecipeBase &R : *VPBB)
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isPredicated())
          WorkList.push_back(RepR);
      }
  }

  unsigned BBNum = 0;
  for (VPReplicateRecipe *RepR : WorkList) {
    VPBasicBlock *CurrentBlock = RepR->getParent();
    VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());

    BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
    SplitBlock->setName(
        OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
    // Record predicated instructions for above packing optimizations.
    VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
    Region->setParent(CurrentBlock->getParent());
    VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region);

    VPRegionBlock *ParentRegion = Region->getParent();
    if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
      ParentRegion->setExiting(SplitBlock);
  }
}

/// Remove redundant VPBasicBlocks by merging them into their predecessor if
/// the predecessor has a single successor.
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
  SmallVector<VPBasicBlock *> WorkList;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_deep(Plan.getEntry()))) {
    // Don't fold the blocks in the skeleton of the Plan into their single
    // predecessors for now.
    // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
    if (!VPBB->getParent())
      continue;
    auto *PredVPBB =
        dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
    if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
        isa<VPIRBasicBlock>(PredVPBB))
      continue;
    WorkList.push_back(VPBB);
  }

  for (VPBasicBlock *VPBB : WorkList) {
    VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
    for (VPRecipeBase &R : make_early_inc_range(*VPBB))
      R.moveBefore(*PredVPBB, PredVPBB->end());
    VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
    auto *ParentRegion = VPBB->getParent();
    if (ParentRegion && ParentRegion->getExiting() == VPBB)
      ParentRegion->setExiting(PredVPBB);
    for (auto *Succ : to_vector(VPBB->successors())) {
      VPBlockUtils::disconnectBlocks(VPBB, Succ);
      VPBlockUtils::connectBlocks(PredVPBB, Succ);
    }
    // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  }
  return !WorkList.empty();
}

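// Create replicate regions for all predicated replicate recipes, then
// repeatedly sink scalar operands and merge regions/blocks until no further
// simplification is possible.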
void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
  // Convert masked VPReplicateRecipes to if-then region blocks.
  addReplicateRegions(Plan);

  bool ShouldSimplify = true;
  while (ShouldSimplify) {
    ShouldSimplify = sinkScalarOperands(Plan);
    ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
    ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
  }
}

/// Remove redundant casts of inductions.
///
/// Such redundant casts are casts of induction variables that can be ignored,
/// because we already proved that the casted phi is equal to the uncasted phi
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan) {
  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (!IV || IV->getTruncInst())
      continue;

    // A sequence of IR Casts has potentially been recorded for IV, which
    // *must be bypassed* when the IV is vectorized, because the vectorized IV
    // will produce the desired casted value. This sequence forms a def-use
    // chain and is provided in reverse order, ending with the cast that uses
    // the IV phi. Search for the recipe of the last cast in the chain and
    // replace it with the original IV. Note that only the final cast is
    // expected to have users outside the cast-chain and the dead casts left
    // over will be cleaned up later.
    auto &Casts = IV->getInductionDescriptor().getCastInsts();
    VPValue *FindMyCast = IV;
    for (Instruction *IRCast : reverse(Casts)) {
      VPSingleDefRecipe *FoundUserCast = nullptr;
      for (auto *U : FindMyCast->users()) {
        auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
        if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
          FoundUserCast = UserCast;
          break;
        }
      }
      FindMyCast = FoundUserCast;
    }
    FindMyCast->replaceAllUsesWith(IV);
  }
}

/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
/// recipe, if it exists.
static void removeRedundantCanonicalIVs(VPlan &Plan) {
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
  for (VPUser *U : CanonicalIV->users()) {
    WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
    if (WidenNewIV)
      break;
  }

  if (!WidenNewIV)
    return;

  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);

    if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
      continue;

    // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
    // everything WidenNewIV's users need. That is, WidenOriginalIV will
    // generate a vector phi or all users of WidenNewIV demand the first lane
    // only.
    if (any_of(WidenOriginalIV->users(),
               [WidenOriginalIV](VPUser *U) {
                 return !U->usesScalars(WidenOriginalIV);
               }) ||
        vputils::onlyFirstLaneUsed(WidenNewIV)) {
      WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
      WidenNewIV->eraseFromParent();
      return;
    }
  }
}

/// Returns true if \p R is dead and can be removed.
static bool isDeadRecipe(VPRecipeBase &R) {
  using namespace llvm::PatternMatch;
  // Do remove conditional assume instructions as their conditions may be
  // flattened.
  auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
  bool IsConditionalAssume =
      RepR && RepR->isPredicated() &&
      match(RepR->getUnderlyingInstr(), m_Intrinsic<Intrinsic::assume>());
  if (IsConditionalAssume)
    return true;

  if (R.mayHaveSideEffects())
    return false;

  // Recipe is dead if no user keeps the recipe alive.
  return all_of(R.definedValues(),
                [](VPValue *V) { return V->getNumUsers() == 0; });
}

void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());

  for (VPBasicBlock *VPBB :
       reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
    // The recipes in the block are processed in reverse order, to catch chains
    // of dead recipes.
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      if (isDeadRecipe(R))
        R.eraseFromParent();
    }
  }
}

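/// Build scalar IV steps for an induction of kind \p Kind with start
/// \p StartV and step \p Step, derived from the canonical IV of \p Plan. The
/// base IV and the step are truncated if needed, e.g. when the induction was
/// originally truncated via \p TruncI or is narrower than the canonical IV.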
static VPScalarIVStepsRecipe *
createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind,
                    Instruction::BinaryOps InductionOpcode,
                    FPMathOperator *FPBinOp, Instruction *TruncI,
                    VPValue *StartV, VPValue *Step, DebugLoc DL,
                    VPBuilder &Builder) {
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
  VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
      Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");

  // Truncate base induction if needed.
  Type *CanonicalIVType = CanonicalIV->getScalarType();
  VPTypeAnalysis TypeInfo(CanonicalIVType);
  Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
  if (TruncI) {
    Type *TruncTy = TruncI->getType();
    assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
    BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
    ResultTy = TruncTy;
  }

  // Truncate step if needed.
  Type *StepTy = TypeInfo.inferScalarType(Step);
  if (ResultTy != StepTy) {
    assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
           "Not truncating.");
    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
    auto *VecPreheader =
        cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor());
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(VecPreheader);
    Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
  }
  return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
                                     &Plan.getVF(), DL);
}

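/// Collect the users of \p V and, transitively, the users of values defined
/// by those users, without looking through header phi recipes to avoid
/// following loop-carried back-edges.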
static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
  SetVector<VPUser *> Users(llvm::from_range, V->users());
  for (unsigned I = 0; I != Users.size(); ++I) {
    VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]);
    if (isa<VPHeaderPHIRecipe>(Cur))
      continue;
    for (VPValue *V : Cur->definedValues())
      Users.insert_range(V->users());
  }
  return Users.takeVector();
}

/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
/// VPWidenPointerInductionRecipe will generate vectors only. If some users
/// require vectors while others require scalars, the scalar uses need to
/// extract the scalars from the generated vectors (Note that this is different
/// to how int/fp inductions are handled). Legalize extract-from-ends using
/// uniform VPReplicateRecipe of wide inductions to use regular
/// VPReplicateRecipe, so the correct end value is available. Also optimize
/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
/// providing them scalar steps built on the canonical scalar IV and update the
/// original IV's users. This is an optional optimization to reduce the need
/// for vector extracts.
static void legalizeAndOptimizeInductions(VPlan &Plan) {
  using namespace llvm::VPlanPatternMatch;
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
  VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
  for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
    auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
    if (!PhiR)
      continue;

    // Try to narrow wide and replicating recipes to uniform recipes, based on
    // VPlan analysis.
    // TODO: Apply to all recipes in the future, to replace legacy uniformity
    // analysis.
    auto Users = collectUsersRecursively(PhiR);
    for (VPUser *U : reverse(Users)) {
      auto *Def = dyn_cast<VPSingleDefRecipe>(U);
      auto *RepR = dyn_cast<VPReplicateRecipe>(U);
      // Skip recipes that shouldn't be narrowed.
      if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
          Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
          (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
        continue;

      // Skip recipes that may have other lanes than their first used.
      if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
        continue;

      auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
                                          Def->operands(), /*IsUniform*/ true);
      Clone->insertAfter(Def);
      Def->replaceAllUsesWith(Clone);
    }

    // Replace wide pointer inductions which have only their scalars used by
    // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
    if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
      if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
        continue;

      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV =
          Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
          Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
          nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);

      VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
                                             PtrIV->getDebugLoc(), "next.gep");

      PtrIV->replaceAllUsesWith(PtrAdd);
      continue;
    }

    // Replace widened induction with scalar steps for users that only use
    // scalars.
    auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
    if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
          return U->usesScalars(WideIV);
        }))
      continue;

    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
        Plan, ID.getKind(), ID.getInductionOpcode(),
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
        WideIV->getDebugLoc(), Builder);

    // Update scalar users of IV to use Step instead.
    if (!HasOnlyVectorVFs)
      WideIV->replaceAllUsesWith(Steps);
    else
      WideIV->replaceUsesWithIf(Steps, [WideIV](VPUser &U, unsigned) {
        return U.usesScalars(WideIV);
      });
  }
}

/// Check if \p VPV is an untruncated wide induction, either before or after
/// the increment. If so return the header IV (before the increment), otherwise
/// return null.
static VPWidenInductionRecipe *getOptimizableIVOf(VPValue *VPV) {
  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
  if (WideIV) {
    // VPV itself is a wide induction, separately compute the end value for
    // exit users if it is not a truncated IV.
    auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
    return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
  }

  // Check if VPV is an optimizable induction increment.
  VPRecipeBase *Def = VPV->getDefiningRecipe();
  if (!Def || Def->getNumOperands() != 2)
    return nullptr;
  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
  if (!WideIV)
    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
  if (!WideIV)
    return nullptr;

  auto IsWideIVInc = [&]() {
    using namespace VPlanPatternMatch;
    auto &ID = WideIV->getInductionDescriptor();

    // Check if VPV increments the induction by the induction step.
    VPValue *IVStep = WideIV->getStepValue();
    switch (ID.getInductionOpcode()) {
    case Instruction::Add:
      return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
                                                     m_Specific(IVStep)));
    case Instruction::FAdd:
      return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
                                                      m_Specific(IVStep)));
    case Instruction::FSub:
      return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
                                                    m_Specific(IVStep)));
    case Instruction::Sub: {
      // IVStep will be the negated step of the subtraction. Check if Step ==
      // -1 * IVStep.
      VPValue *Step;
      if (!match(VPV,
                 m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
          !Step->isLiveIn() || !IVStep->isLiveIn())
        return false;
      auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
      auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
      return StepCI && IVStepCI &&
             StepCI->getValue() == (-1 * IVStepCI->getValue());
    }
    default:
      return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
             match(VPV, m_GetElementPtr(m_Specific(WideIV),
                                        m_Specific(WideIV->getStepValue())));
    }
    llvm_unreachable("should have been covered by switch above");
  };
  return IsWideIVInc() ? WideIV : nullptr;
}

/// Attempts to optimize the induction variable exit values for users in the
/// early exit block.
static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
                                               VPTypeAnalysis &TypeInfo,
                                               VPBlockBase *PredVPBB,
                                               VPValue *Op) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming, *Mask;
  if (!match(Op, m_VPInstruction<Instruction::ExtractElement>(
                     m_VPValue(Incoming),
                     m_VPInstruction<VPInstruction::FirstActiveLane>(
                         m_VPValue(Mask)))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  // Calculate the final index.
  VPValue *EndValue = Plan.getCanonicalIV();
  auto CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
  VPBuilder B(cast<VPBasicBlock>(PredVPBB));

  DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
  VPValue *FirstActiveLane =
      B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
  Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
  if (CanonicalIVType != FirstActiveLaneType) {
    Instruction::CastOps CastOp =
        CanonicalIVType->getScalarSizeInBits() <
                FirstActiveLaneType->getScalarSizeInBits()
            ? Instruction::Trunc
            : Instruction::ZExt;
    FirstActiveLane =
        B.createScalarCast(CastOp, FirstActiveLane, CanonicalIVType, DL);
  }
  EndValue = B.createNaryOp(Instruction::Add, {EndValue, FirstActiveLane}, DL);

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we need to
  // advance the end index by one iteration.
  if (Incoming != WideIV) {
    VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
    EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
  }

  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    const InductionDescriptor &ID = WideIV->getInductionDescriptor();
    VPValue *Start = WideIV->getStartValue();
    VPValue *Step = WideIV->getStepValue();
    EndValue = B.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, EndValue, Step);
  }

  return EndValue;
}

/// Attempts to optimize the induction variable exit values for users in the
/// exit block coming from the latch in the original scalar loop.
static VPValue *
optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo,
                               VPBlockBase *PredVPBB, VPValue *Op,
                               DenseMap<VPValue *, VPValue *> &EndValues) {
  using namespace VPlanPatternMatch;

  VPValue *Incoming;
  if (!match(Op, m_VPInstruction<VPInstruction::ExtractLastElement>(
                     m_VPValue(Incoming))))
    return nullptr;

  auto *WideIV = getOptimizableIVOf(Incoming);
  if (!WideIV)
    return nullptr;

  VPValue *EndValue = EndValues.lookup(WideIV);
  assert(EndValue && "end value must have been pre-computed");

  // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
  // changed it means the exit is using the incremented value, so we don't
  // need to subtract the step.
  if (Incoming != WideIV)
    return EndValue;

  // Otherwise, subtract the step from the EndValue.
  VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy())
    return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
  if (ScalarTy->isPointerTy()) {
    auto *Zero = Plan.getOrAddLiveIn(
        ConstantInt::get(Step->getLiveInIRValue()->getType(), 0));
    return B.createPtrAdd(EndValue,
                          B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
                          "ind.escape");
  }
  if (ScalarTy->isFloatingPointTy()) {
    const auto &ID = WideIV->getInductionDescriptor();
    return B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  }
  llvm_unreachable("all possible induction types must be handled");
  return nullptr;
}

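// For each exit-block phi, try to replace the extracted induction exit value
// with a directly computed end value: values incoming via the middle block use
// the pre-computed EndValues, while values incoming via early exits are
// derived from the first active lane of the exit mask.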
void VPlanTransforms::optimizeInductionExitUsers(
    VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues) {
  VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(&R);

      for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
        VPValue *Escape = nullptr;
        if (PredVPBB == MiddleVPBB)
          Escape = optimizeLatchExitInductionUser(
              Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), EndValues);
        else
          Escape = optimizeEarlyExitInductionUser(Plan, TypeInfo, PredVPBB,
                                                  ExitIRI->getOperand(Idx));
        if (Escape)
          ExitIRI->setOperand(Idx, Escape);
      }
    }
  }
}

/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
/// them with already existing recipes expanding the same SCEV expression.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan) {
  DenseMap<const SCEV *, VPValue *> SCEV2VPV;

  for (VPRecipeBase &R :
       make_early_inc_range(*Plan.getEntry()->getEntryBasicBlock())) {
    auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
    if (!ExpR)
      continue;

    auto I = SCEV2VPV.insert({ExpR->getSCEV(), ExpR});
    if (I.second)
      continue;
    ExpR->replaceAllUsesWith(I.first->second);
    ExpR->eraseFromParent();
  }
}

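/// Erase dead recipes starting from the defining recipe of \p V, following the
/// operands of erased recipes so that whole chains of recipes that become dead
/// are removed.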
static void recursivelyDeleteDeadRecipes(VPValue *V) {
  SmallVector<VPValue *> WorkList;
  SmallPtrSet<VPValue *, 8> Seen;
  WorkList.push_back(V);

  while (!WorkList.empty()) {
    VPValue *Cur = WorkList.pop_back_val();
    if (!Seen.insert(Cur).second)
      continue;
    VPRecipeBase *R = Cur->getDefiningRecipe();
    if (!R)
      continue;
    if (!isDeadRecipe(*R))
      continue;
    WorkList.append(R->op_begin(), R->op_end());
    R->eraseFromParent();
  }
}

/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
/// non-nullptr Value for a handled \p Opcode if corresponding \p Operands are
/// foldable live-ins.
static Value *tryToFoldLiveIns(const VPRecipeBase &R, unsigned Opcode,
                               ArrayRef<VPValue *> Operands,
                               const DataLayout &DL, VPTypeAnalysis &TypeInfo) {
  SmallVector<Value *, 4> Ops;
  for (VPValue *Op : Operands) {
    if (!Op->isLiveIn() || !Op->getLiveInIRValue())
      return nullptr;
    Ops.push_back(Op->getLiveInIRValue());
  }

  InstSimplifyFolder Folder(DL);
  if (Instruction::isBinaryOp(Opcode))
    return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode), Ops[0],
                            Ops[1]);
  if (Instruction::isCast(Opcode))
    return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
                           TypeInfo.inferScalarType(R.getVPSingleValue()));
  switch (Opcode) {
  case VPInstruction::LogicalAnd:
    return Folder.FoldSelect(Ops[0], Ops[1],
                             ConstantInt::getNullValue(Ops[1]->getType()));
  case VPInstruction::Not:
    return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
                            Constant::getAllOnesValue(Ops[0]->getType()));
  case Instruction::Select:
    return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
  case Instruction::ICmp:
  case Instruction::FCmp:
    return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
                          Ops[1]);
  case Instruction::GetElementPtr: {
    auto &RFlags = cast<VPRecipeWithIRFlags>(R);
    auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
    return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0], drop_begin(Ops),
                          RFlags.getGEPNoWrapFlags());
  }
  case VPInstruction::PtrAdd:
    return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()), Ops[0],
                          Ops[1],
                          cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
  case Instruction::InsertElement:
    return Folder.FoldInsertElement(Ops[0], Ops[1], Ops[2]);
  case Instruction::ExtractElement:
    return Folder.FoldExtractElement(Ops[0], Ops[1]);
  }
  return nullptr;
}

/// Try to simplify recipe \p R.
static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  using namespace llvm::VPlanPatternMatch;
  VPlan *Plan = R.getParent()->getPlan();

  auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
  if (!Def)
    return;

  // Simplification of live-in IR values for SingleDef recipes using
  // InstSimplifyFolder.
  if (TypeSwitch<VPRecipeBase *, bool>(&R)
          .Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
                VPReplicateRecipe>([&](auto *I) {
            const DataLayout &DL =
                Plan->getScalarHeader()->getIRBasicBlock()->getDataLayout();
            Value *V = tryToFoldLiveIns(*I, I->getOpcode(), I->operands(), DL,
                                        TypeInfo);
            if (V)
              I->replaceAllUsesWith(Plan->getOrAddLiveIn(V));
            return V;
          })
          .Default([](auto *) { return false; }))
    return;

  // Fold PredPHI LiveIn -> LiveIn.
  if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(&R)) {
    VPValue *Op = PredPHI->getOperand(0);
    if (Op->isLiveIn())
      PredPHI->replaceAllUsesWith(Op);
  }

  VPValue *A;
  if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
    Type *TruncTy = TypeInfo.inferScalarType(Def);
    Type *ATy = TypeInfo.inferScalarType(A);
    if (TruncTy == ATy) {
      Def->replaceAllUsesWith(A);
    } else {
      // Don't replace a scalarizing recipe with a widened cast.
      if (isa<VPReplicateRecipe>(Def))
        return;
      if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {

        unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue()))
                                 ? Instruction::SExt
                                 : Instruction::ZExt;
        auto *VPC =
            new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy);
        if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) {
          // UnderlyingExt has distinct return type, used to retain legacy cost.
          VPC->setUnderlyingValue(UnderlyingExt);
        }
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
        auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy);
        VPC->insertBefore(&R);
        Def->replaceAllUsesWith(VPC);
      }
    }
#ifndef NDEBUG
    // Verify that the cached type info for both A and its users is still
    // accurate by comparing it to freshly computed types.
    VPTypeAnalysis TypeInfo2(Plan->getCanonicalIV()->getScalarType());
    assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
    for (VPUser *U : A->users()) {
      auto *R = cast<VPRecipeBase>(U);
      for (VPValue *VPV : R->definedValues())
        assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
    }
#endif
  }

  // Simplify (X && Y) || (X && !Y) -> X.
  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
  // recipes to be visited during simplification.
  VPValue *X, *Y;
  if (match(Def,
            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                         m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) {
    Def->replaceAllUsesWith(X);
    Def->eraseFromParent();
    return;
  }

  // OR x, 1 -> 1.
  if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes()))) {
    Def->replaceAllUsesWith(Def->getOperand(0) == X ? Def->getOperand(1)
                                                    : Def->getOperand(0));
    Def->eraseFromParent();
    return;
  }

  if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
    return Def->replaceAllUsesWith(X);

  if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
    return Def->replaceAllUsesWith(A);

  if (match(Def, m_Not(m_VPValue(A)))) {
    if (match(A, m_Not(m_VPValue(A))))
      return Def->replaceAllUsesWith(A);

    // Try to fold Not into compares by adjusting the predicate in-place.
    if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
      auto *WideCmp = cast<VPWidenRecipe>(A);
      if (WideCmp->getOpcode() == Instruction::ICmp ||
          WideCmp->getOpcode() == Instruction::FCmp) {
        WideCmp->setPredicate(
            CmpInst::getInversePredicate(WideCmp->getPredicate()));
        Def->replaceAllUsesWith(WideCmp);
        // If WideCmp doesn't have a debug location, use the one from the
        // negation, to preserve the location.
        if (!WideCmp->getDebugLoc() && R.getDebugLoc())
          WideCmp->setDebugLoc(R.getDebugLoc());
      }
    }
  }

  // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
  if ((match(Def,
             m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) ||
       match(Def,
             m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) &&
      TypeInfo.inferScalarType(Def->getOperand(1)) ==
          TypeInfo.inferScalarType(Def))
    return Def->replaceAllUsesWith(Def->getOperand(1));

  if (match(Def, m_VPInstruction<VPInstruction::WideIVStep>(
                     m_VPValue(X), m_SpecificInt(1)))) {
    Type *WideStepTy = TypeInfo.inferScalarType(Def);
    if (TypeInfo.inferScalarType(X) != WideStepTy)
      X = VPBuilder(Def).createWidenCast(Instruction::Trunc, X, WideStepTy);
    Def->replaceAllUsesWith(X);
    return;
  }

  // For i1 vp.merges produced by AnyOf reductions:
  // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
  if (match(Def, m_Intrinsic<Intrinsic::vp_merge>(m_True(), m_VPValue(A),
                                                  m_VPValue(X), m_VPValue())) &&
      match(A, m_c_BinaryOr(m_Specific(X), m_VPValue(Y))) &&
      TypeInfo.inferScalarType(R.getVPSingleValue())->isIntegerTy(1)) {
    Def->setOperand(1, Def->getOperand(0));
    Def->setOperand(0, Y);
    return;
  }

  // Some simplifications can only be applied after unrolling. Perform them
  // below.
  if (!Plan->isUnrolled())
    return;

  // VPScalarIVSteps for part 0 can be replaced by their start value, if only
  // the first lane is demanded.
  if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
    if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
      Steps->replaceAllUsesWith(Steps->getOperand(0));
      return;
    }
  }
  // Simplify redundant ReductionStartVector recipes after unrolling.
  VPValue *StartV;
  if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
                     m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
    Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
      auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
      return PhiR && PhiR->isInLoop();
    });
    return;
  }
}

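// Run simplifyRecipe on every recipe in \p Plan, traversing the blocks in
// reverse post order.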
void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  VPTypeAnalysis TypeInfo(&CanonicalIVTy);
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      simplifyRecipe(R, TypeInfo);
    }
  }
}

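/// Convert replicating and widened recipes whose results are known to be
/// single scalars, and whose users only demand scalar values, into
/// single-scalar VPReplicateRecipes to avoid redundant per-lane work.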
static void narrowToSingleScalarRecipes(VPlan &Plan) {
  if (Plan.hasScalarVFOnly())
    return;

  // Try to narrow wide and replicating recipes to single scalar recipes,
  // based on VPlan analysis. Only process blocks in the loop region for now,
  // without traversing into nested regions, as recipes in replicate regions
  // cannot be converted yet.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR && !isa<VPWidenRecipe>(&R))
        continue;
      if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
        continue;

      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
      // Skip recipes that aren't single scalars or don't have only their
      // scalar results used. In the latter case, we would introduce extra
      // broadcasts.
      if (!vputils::isSingleScalar(RepOrWidenR) ||
          any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
            return !U->usesScalars(RepOrWidenR);
          }))
        continue;

      auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
                                          RepOrWidenR->operands(),
                                          true /*IsSingleScalar*/);
      Clone->insertBefore(RepOrWidenR);
      RepOrWidenR->replaceAllUsesWith(Clone);
    }
  }
}

1215 | /// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes |
1216 | /// to make sure the masks are simplified. |
1217 | static void simplifyBlends(VPlan &Plan) { |
1218 | using namespace llvm::VPlanPatternMatch; |
1219 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1220 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
1221 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1222 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: &R); |
1223 | if (!Blend) |
1224 | continue; |
1225 | |
1226 | // Try to remove redundant blend recipes. |
1227 | SmallPtrSet<VPValue *, 4> UniqueValues; |
1228 | if (Blend->isNormalized() || !match(V: Blend->getMask(Idx: 0), P: m_False())) |
1229 | UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: 0)); |
1230 | for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) |
1231 | if (!match(V: Blend->getMask(Idx: I), P: m_False())) |
1232 | UniqueValues.insert(Ptr: Blend->getIncomingValue(Idx: I)); |
1233 | |
1234 | if (UniqueValues.size() == 1) { |
1235 | Blend->replaceAllUsesWith(New: *UniqueValues.begin()); |
1236 | Blend->eraseFromParent(); |
1237 | continue; |
1238 | } |
1239 | |
1240 | if (Blend->isNormalized()) |
1241 | continue; |
1242 | |
1243 | // Normalize the blend so its first incoming value is used as the initial |
1244 | // value with the others blended into it. |
1245 | |
1246 | unsigned StartIndex = 0; |
1247 | for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { |
1248 | // If a value's mask is used only by the blend then it can be removed as dead code. |
1249 | // TODO: Find the most expensive mask that can be deadcoded, or a mask |
1250 | // that's used by multiple blends where it can be removed from them all. |
1251 | VPValue *Mask = Blend->getMask(Idx: I); |
1252 | if (Mask->getNumUsers() == 1 && !match(V: Mask, P: m_False())) { |
1253 | StartIndex = I; |
1254 | break; |
1255 | } |
1256 | } |
1257 | |
1258 | SmallVector<VPValue *, 4> OperandsWithMask; |
1259 | OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: StartIndex)); |
1260 | |
1261 | for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { |
1262 | if (I == StartIndex) |
1263 | continue; |
1264 | OperandsWithMask.push_back(Elt: Blend->getIncomingValue(Idx: I)); |
1265 | OperandsWithMask.push_back(Elt: Blend->getMask(Idx: I)); |
1266 | } |
1267 | |
1268 | auto *NewBlend = new VPBlendRecipe( |
1269 | cast<PHINode>(Val: Blend->getUnderlyingValue()), OperandsWithMask); |
1270 | NewBlend->insertBefore(InsertPos: &R); |
1271 | |
1272 | VPValue *DeadMask = Blend->getMask(Idx: StartIndex); |
1273 | Blend->replaceAllUsesWith(New: NewBlend); |
1274 | Blend->eraseFromParent(); |
1275 | recursivelyDeleteDeadRecipes(V: DeadMask); |
1276 | |
1277 | // Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. |
1278 | VPValue *NewMask; |
1279 | if (NewBlend->getNumOperands() == 3 && |
1280 | match(V: NewBlend->getMask(Idx: 1), P: m_Not(Op0: m_VPValue(V&: NewMask)))) { |
1281 | VPValue *Inc0 = NewBlend->getOperand(N: 0); |
1282 | VPValue *Inc1 = NewBlend->getOperand(N: 1); |
1283 | VPValue *OldMask = NewBlend->getOperand(N: 2); |
1284 | NewBlend->setOperand(I: 0, New: Inc1); |
1285 | NewBlend->setOperand(I: 1, New: Inc0); |
1286 | NewBlend->setOperand(I: 2, New: NewMask); |
1287 | if (OldMask->getNumUsers() == 0) |
1288 | cast<VPInstruction>(Val: OldMask)->eraseFromParent(); |
1289 | } |
1290 | } |
1291 | } |
1292 | } |
1293 | |
1294 | /// Optimize the width of vector induction variables in \p Plan based on a known |
1295 | /// constant trip count, \p BestVF and \p BestUF. |
1296 | static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, |
1297 | ElementCount BestVF, |
1298 | unsigned BestUF) { |
1299 | // Only proceed if we have not completely removed the vector region. |
1300 | if (!Plan.getVectorLoopRegion()) |
1301 | return false; |
1302 | |
1303 | if (!Plan.getTripCount()->isLiveIn()) |
1304 | return false; |
1305 | auto *TC = dyn_cast_if_present<ConstantInt>( |
1306 | Val: Plan.getTripCount()->getUnderlyingValue()); |
1307 | if (!TC || !BestVF.isFixed()) |
1308 | return false; |
1309 | |
1310 | // Calculate the minimum power-of-2 bit width that can fit the known TC, VF |
1311 | // and UF. Returns at least 8. |
1312 | auto ComputeBitWidth = [](APInt TC, uint64_t Align) { |
1313 | APInt AlignedTC = |
1314 | Align * APIntOps::RoundingUDiv(A: TC, B: APInt(TC.getBitWidth(), Align), |
1315 | RM: APInt::Rounding::UP); |
1316 | APInt MaxVal = AlignedTC - 1; |
1317 | return std::max<unsigned>(a: PowerOf2Ceil(A: MaxVal.getActiveBits()), b: 8); |
1318 | }; |
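 | // Worked example (illustrative): TC = 20 and VF * UF = 8 give |
 | // AlignedTC = 8 * ceil(20 / 8) = 24 and MaxVal = 23 (5 active bits), so the |
 | // narrowed IV type is max(PowerOf2Ceil(5), 8) = i8. |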
1319 | unsigned NewBitWidth = |
1320 | ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF); |
1321 | |
1322 | LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); |
1323 | auto *NewIVTy = IntegerType::get(C&: Ctx, NumBits: NewBitWidth); |
1324 | |
1325 | bool MadeChange = false; |
1326 | |
1327 | VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
1328 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
1329 | auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
1330 | |
1331 | // Currently only handle canonical IVs as it is trivial to replace the start |
1332 | // and stop values, and we currently only perform the optimization when the |
1333 | // IV has a single use. |
1334 | if (!WideIV || !WideIV->isCanonical() || |
1335 | WideIV->hasMoreThanOneUniqueUser() || |
1336 | NewIVTy == WideIV->getScalarType()) |
1337 | continue; |
1338 | |
1339 | // Currently only handle cases where the single user is a header-mask |
1340 | // comparison with the backedge-taken-count. |
1341 | using namespace VPlanPatternMatch; |
1342 | if (!match( |
1343 | U: *WideIV->user_begin(), |
1344 | P: m_Binary<Instruction::ICmp>( |
1345 | Op0: m_Specific(VPV: WideIV), |
1346 | Op1: m_Broadcast(Op0: m_Specific(VPV: Plan.getOrCreateBackedgeTakenCount()))))) |
1347 | continue; |
1348 | |
1349 | // Update IV operands and comparison bound to use new narrower type. |
1350 | auto *NewStart = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 0)); |
1351 | WideIV->setStartValue(NewStart); |
1352 | auto *NewStep = Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: NewIVTy, V: 1)); |
1353 | WideIV->setStepValue(NewStep); |
1354 | // TODO: Remove once VPWidenIntOrFpInductionRecipe is fully expanded. |
1355 | VPInstructionWithType *OldStepVector = WideIV->getStepVector(); |
1356 | assert(OldStepVector->getNumUsers() == 1 && |
1357 | "step vector should only be used by single " |
1358 | "VPWidenIntOrFpInductionRecipe"); |
1359 | auto *NewStepVector = |
1360 | new VPInstructionWithType(VPInstruction::StepVector, {}, NewIVTy, {}, |
1361 | OldStepVector->getDebugLoc()); |
1362 | NewStepVector->insertAfter(InsertPos: OldStepVector->getDefiningRecipe()); |
1363 | OldStepVector->replaceAllUsesWith(New: NewStepVector); |
1364 | OldStepVector->eraseFromParent(); |
1365 | |
1366 | auto *NewBTC = new VPWidenCastRecipe( |
1367 | Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); |
1368 | Plan.getVectorPreheader()->appendRecipe(Recipe: NewBTC); |
1369 | auto *Cmp = cast<VPInstruction>(Val: *WideIV->user_begin()); |
1370 | Cmp->setOperand(I: 1, New: NewBTC); |
1371 | |
1372 | MadeChange = true; |
1373 | } |
1374 | |
1375 | return MadeChange; |
1376 | } |
1377 | |
1378 | /// Return true if \p Cond is known to be true for given \p BestVF and \p |
1379 | /// BestUF. |
1380 | static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, |
1381 | ElementCount BestVF, unsigned BestUF, |
1382 | ScalarEvolution &SE) { |
1383 | using namespace llvm::VPlanPatternMatch; |
1384 | if (match(V: Cond, P: m_Binary<Instruction::Or>(Op0: m_VPValue(), Op1: m_VPValue()))) |
1385 | return any_of(Range: Cond->getDefiningRecipe()->operands(), P: [&Plan, BestVF, BestUF, |
1386 | &SE](VPValue *C) { |
1387 | return isConditionTrueViaVFAndUF(Cond: C, Plan, BestVF, BestUF, SE); |
1388 | }); |
1389 | |
1390 | auto *CanIV = Plan.getCanonicalIV(); |
1391 | if (!match(V: Cond, P: m_Binary<Instruction::ICmp>( |
1392 | Op0: m_Specific(VPV: CanIV->getBackedgeValue()), |
1393 | Op1: m_Specific(VPV: &Plan.getVectorTripCount()))) || |
1394 | cast<VPRecipeWithIRFlags>(Val: Cond->getDefiningRecipe())->getPredicate() != |
1395 | CmpInst::ICMP_EQ) |
1396 | return false; |
1397 | |
1398 | // The compare checks CanIV + VFxUF == vector trip count. The vector trip |
1399 | // count is not conveniently available as SCEV so far, so we compare directly |
1400 | // against the original trip count. This is stricter than necessary, as we |
1401 | // will only return true if the trip count == vector trip count. |
1402 | // TODO: Use SCEV for vector trip count once available, to cover cases where |
1403 | // vector trip count == UF * VF, but original trip count != UF * VF. |
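 | // For example (illustrative): with a known trip count of 8, VF = 4 and |
 | // UF = 2, TC == VF * UF, so the exit compare is known to be true on the |
 | // first iteration. |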
1404 | const SCEV *TripCount = |
1405 | vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE); |
1406 | assert(!isa<SCEVCouldNotCompute>(TripCount) && |
1407 | "Trip count SCEV must be computable"); |
1408 | ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF); |
1409 | const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements); |
1410 | return SE.isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: TripCount, RHS: C); |
1411 | } |
1412 | |
1413 | /// Try to simplify the branch condition of \p Plan. This may restrict the |
1414 | /// resulting plan to \p BestVF and \p BestUF. |
1415 | static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, |
1416 | unsigned BestUF, |
1417 | PredicatedScalarEvolution &PSE) { |
1418 | VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); |
1419 | VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); |
1420 | auto *Term = &ExitingVPBB->back(); |
1421 | VPValue *Cond; |
1422 | ScalarEvolution &SE = *PSE.getSE(); |
1423 | using namespace llvm::VPlanPatternMatch; |
1424 | if (match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue())) || |
1425 | match(V: Term, P: m_BranchOnCond( |
1426 | Op0: m_Not(Op0: m_ActiveLaneMask(Op0: m_VPValue(), Op1: m_VPValue()))))) { |
1427 | // Try to simplify the branch condition if TC <= VF * UF when the latch |
1428 | // terminator is BranchOnCount or BranchOnCond where the input is |
1429 | // Not(ActiveLaneMask). |
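 | // For example (illustrative): a known trip count of 10 with VF * UF = 16 |
 | // satisfies TC <= VF * UF, so the vector loop body executes exactly once |
 | // and the latch branch can be simplified. |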
1430 | const SCEV *TripCount = |
1431 | vputils::getSCEVExprForVPValue(V: Plan.getTripCount(), SE); |
1432 | assert(!isa<SCEVCouldNotCompute>(TripCount) && |
1433 | "Trip count SCEV must be computable"); |
1434 | ElementCount NumElements = BestVF.multiplyCoefficientBy(RHS: BestUF); |
1435 | const SCEV *C = SE.getElementCount(Ty: TripCount->getType(), EC: NumElements); |
1436 | if (TripCount->isZero() || |
1437 | !SE.isKnownPredicate(Pred: CmpInst::ICMP_ULE, LHS: TripCount, RHS: C)) |
1438 | return false; |
1439 | } else if (match(V: Term, P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) { |
1440 | // For BranchOnCond, check if we can prove the condition to be true using VF |
1441 | // and UF. |
1442 | if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE)) |
1443 | return false; |
1444 | } else { |
1445 | return false; |
1446 | } |
1447 | |
1448 | // The vector loop region only executes once. If possible, completely remove |
1449 | // the region, otherwise replace the terminator controlling the latch with |
1450 | // (BranchOnCond true). |
1451 | auto *Header = cast<VPBasicBlock>(Val: VectorRegion->getEntry()); |
1452 | auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); |
1453 | if (all_of( |
1454 | Range: Header->phis(), |
1455 | P: IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) { |
1456 | for (VPRecipeBase &HeaderR : make_early_inc_range(Range: Header->phis())) { |
1457 | auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(Val: &HeaderR); |
1458 | HeaderPhiR->replaceAllUsesWith(New: HeaderPhiR->getStartValue()); |
1459 | HeaderPhiR->eraseFromParent(); |
1460 | } |
1461 | |
1462 | VPBlockBase *Preheader = VectorRegion->getSinglePredecessor(); |
1463 | VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); |
1464 | VPBlockUtils::disconnectBlocks(From: Preheader, To: VectorRegion); |
1465 | VPBlockUtils::disconnectBlocks(From: VectorRegion, To: Exit); |
1466 | |
1467 | for (VPBlockBase *B : vp_depth_first_shallow(G: VectorRegion->getEntry())) |
1468 | B->setParent(nullptr); |
1469 | |
1470 | VPBlockUtils::connectBlocks(From: Preheader, To: Header); |
1471 | VPBlockUtils::connectBlocks(From: ExitingVPBB, To: Exit); |
1472 | VPlanTransforms::simplifyRecipes(Plan, CanonicalIVTy&: *CanIVTy); |
1473 | } else { |
1474 | // The vector region contains header phis for which we cannot remove the |
1475 | // loop region yet. |
1476 | LLVMContext &Ctx = SE.getContext(); |
1477 | auto *BOC = new VPInstruction( |
1478 | VPInstruction::BranchOnCond, |
1479 | {Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx))}, Term->getDebugLoc()); |
1480 | ExitingVPBB->appendRecipe(Recipe: BOC); |
1481 | } |
1482 | |
1483 | Term->eraseFromParent(); |
1484 | |
1485 | return true; |
1486 | } |
1487 | |
1488 | void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, |
1489 | unsigned BestUF, |
1490 | PredicatedScalarEvolution &PSE) { |
1491 | assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); |
1492 | assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); |
1493 | |
1494 | bool MadeChange = |
1495 | simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); |
1496 | MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); |
1497 | |
1498 | if (MadeChange) { |
1499 | Plan.setVF(BestVF); |
1500 | assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF"); |
1501 | } |
1502 | // TODO: Further simplifications are possible |
1503 | // 1. Replace inductions with constants. |
1504 | // 2. Replace vector loop region with VPBasicBlock. |
1505 | } |
1506 | |
1507 | /// Sink users of \p FOR after the recipe defining the previous value \p |
1508 | /// Previous of the recurrence. \returns true if all users of \p FOR could be |
1509 | /// re-arranged as needed or false if it is not possible. |
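/// For intuition (illustrative sketch): given |
 | ///   %for  = fixed-order-recurrence-phi ..., %prev |
 | ///   %use  = add %for, 1 |
 | ///   %prev = ... |
 | /// %use is moved after %prev, so that the splice created later (inserted |
 | /// right after %prev) dominates all users of %for. |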
1510 | static bool |
1511 | sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, |
1512 | VPRecipeBase *Previous, |
1513 | VPDominatorTree &VPDT) { |
1514 | // Collect recipes that need sinking. |
1515 | SmallVector<VPRecipeBase *> WorkList; |
1516 | SmallPtrSet<VPRecipeBase *, 8> Seen; |
1517 | Seen.insert(Ptr: Previous); |
1518 | auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { |
1519 | // The previous value must not depend on the users of the recurrence phi; |
1520 | // if it did, FOR would not be a fixed-order recurrence. |
1521 | if (SinkCandidate == Previous) |
1522 | return false; |
1523 | |
1524 | if (isa<VPHeaderPHIRecipe>(Val: SinkCandidate) || |
1525 | !Seen.insert(Ptr: SinkCandidate).second || |
1526 | VPDT.properlyDominates(A: Previous, B: SinkCandidate)) |
1527 | return true; |
1528 | |
1529 | if (SinkCandidate->mayHaveSideEffects()) |
1530 | return false; |
1531 | |
1532 | WorkList.push_back(Elt: SinkCandidate); |
1533 | return true; |
1534 | }; |
1535 | |
1536 | // Recursively sink users of FOR after Previous. |
1537 | WorkList.push_back(Elt: FOR); |
1538 | for (unsigned I = 0; I != WorkList.size(); ++I) { |
1539 | VPRecipeBase *Current = WorkList[I]; |
1540 | assert(Current->getNumDefinedValues() == 1 && |
1541 | "only recipes with a single defined value expected"); |
1542 | |
1543 | for (VPUser *User : Current->getVPSingleValue()->users()) { |
1544 | if (!TryToPushSinkCandidate(cast<VPRecipeBase>(Val: User))) |
1545 | return false; |
1546 | } |
1547 | } |
1548 | |
1549 | // Keep recipes to sink ordered by dominance so earlier instructions are |
1550 | // processed first. |
1551 | sort(C&: WorkList, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { |
1552 | return VPDT.properlyDominates(A, B); |
1553 | }); |
1554 | |
1555 | for (VPRecipeBase *SinkCandidate : WorkList) { |
1556 | if (SinkCandidate == FOR) |
1557 | continue; |
1558 | |
1559 | SinkCandidate->moveAfter(MovePos: Previous); |
1560 | Previous = SinkCandidate; |
1561 | } |
1562 | return true; |
1563 | } |
1564 | |
1565 | /// Try to hoist \p Previous and its operands before all users of \p FOR. |
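/// For intuition (illustrative sketch): if sinking fails, e.g. |
 | ///   %for  = fixed-order-recurrence-phi ..., %prev |
 | ///   %use  = store %for, ...   ; has side effects, cannot be sunk |
 | ///   %prev = add %x, %y |
 | /// then %prev (being free of side effects and memory reads) is hoisted above |
 | /// %use instead. |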
1566 | static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, |
1567 | VPRecipeBase *Previous, |
1568 | VPDominatorTree &VPDT) { |
1569 | if (Previous->mayHaveSideEffects() || Previous->mayReadFromMemory()) |
1570 | return false; |
1571 | |
1572 | // Collect recipes that need hoisting. |
1573 | SmallVector<VPRecipeBase *> HoistCandidates; |
1574 | SmallPtrSet<VPRecipeBase *, 8> Visited; |
1575 | VPRecipeBase *HoistPoint = nullptr; |
1576 | // Find the closest hoist point by looking at all users of FOR and selecting |
1577 | // the recipe dominating all other users. |
1578 | for (VPUser *U : FOR->users()) { |
1579 | auto *R = cast<VPRecipeBase>(Val: U); |
1580 | if (!HoistPoint || VPDT.properlyDominates(A: R, B: HoistPoint)) |
1581 | HoistPoint = R; |
1582 | } |
1583 | assert(all_of(FOR->users(), |
1584 | [&VPDT, HoistPoint](VPUser *U) { |
1585 | auto *R = cast<VPRecipeBase>(U); |
1586 | return HoistPoint == R || |
1587 | VPDT.properlyDominates(HoistPoint, R); |
1588 | }) && |
1589 | "HoistPoint must dominate all users of FOR"); |
1590 | |
1591 | auto NeedsHoisting = [HoistPoint, &VPDT, |
1592 | &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * { |
1593 | VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe(); |
1594 | if (!HoistCandidate) |
1595 | return nullptr; |
1596 | VPRegionBlock *EnclosingLoopRegion = |
1597 | HoistCandidate->getParent()->getEnclosingLoopRegion(); |
1598 | assert((!HoistCandidate->getParent()->getParent() || |
1599 | HoistCandidate->getParent()->getParent() == EnclosingLoopRegion) && |
1600 | "CFG in VPlan should still be flat, without replicate regions"); |
1601 | // Hoist candidate was already visited, no need to hoist. |
1602 | if (!Visited.insert(Ptr: HoistCandidate).second) |
1603 | return nullptr; |
1604 | |
1605 | // The candidate is defined outside the loop region or is a header phi; it |
1606 | // already dominates the users of FOR without hoisting. |
1607 | if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(Val: HoistCandidate)) |
1608 | return nullptr; |
1609 | |
1610 | // If we reached a recipe that dominates HoistPoint, we don't need to |
1611 | // hoist the recipe. |
1612 | if (VPDT.properlyDominates(A: HoistCandidate, B: HoistPoint)) |
1613 | return nullptr; |
1614 | return HoistCandidate; |
1615 | }; |
1616 | auto CanHoist = [&](VPRecipeBase *HoistCandidate) { |
1617 | // Avoid hoisting candidates with side-effects, as we do not yet analyze |
1618 | // associated dependencies. |
1619 | return !HoistCandidate->mayHaveSideEffects(); |
1620 | }; |
1621 | |
1622 | if (!NeedsHoisting(Previous->getVPSingleValue())) |
1623 | return true; |
1624 | |
1625 | // Recursively try to hoist Previous and its operands before all users of FOR. |
1626 | HoistCandidates.push_back(Elt: Previous); |
1627 | |
1628 | for (unsigned I = 0; I != HoistCandidates.size(); ++I) { |
1629 | VPRecipeBase *Current = HoistCandidates[I]; |
1630 | assert(Current->getNumDefinedValues() == 1 && |
1631 | "only recipes with a single defined value expected"); |
1632 | if (!CanHoist(Current)) |
1633 | return false; |
1634 | |
1635 | for (VPValue *Op : Current->operands()) { |
1636 | // If we reach FOR, it means the original Previous depends on some other |
1637 | // recurrence that in turn depends on FOR. If that is the case, we would |
1638 | // also need to hoist recipes involving the other FOR, which may break |
1639 | // dependencies. |
1640 | if (Op == FOR) |
1641 | return false; |
1642 | |
1643 | if (auto *R = NeedsHoisting(Op)) |
1644 | HoistCandidates.push_back(Elt: R); |
1645 | } |
1646 | } |
1647 | |
1648 | // Order recipes to hoist by dominance so earlier instructions are processed |
1649 | // first. |
1650 | sort(C&: HoistCandidates, Comp: [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) { |
1651 | return VPDT.properlyDominates(A, B); |
1652 | }); |
1653 | |
1654 | for (VPRecipeBase *HoistCandidate : HoistCandidates) { |
1655 | HoistCandidate->moveBefore(BB&: *HoistPoint->getParent(), |
1656 | I: HoistPoint->getIterator()); |
1657 | } |
1658 | |
1659 | return true; |
1660 | } |
1661 | |
1662 | bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, |
1663 | VPBuilder &LoopBuilder) { |
1664 | VPDominatorTree VPDT; |
1665 | VPDT.recalculate(Func&: Plan); |
1666 | |
1667 | SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis; |
1668 | for (VPRecipeBase &R : |
1669 | Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) |
1670 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R)) |
1671 | RecurrencePhis.push_back(Elt: FOR); |
1672 | |
1673 | for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) { |
1674 | SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis; |
1675 | VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe(); |
1676 | // Fixed-order recurrences do not contain cycles, so this loop is guaranteed |
1677 | // to terminate. |
1678 | while (auto *PrevPhi = |
1679 | dyn_cast_or_null<VPFirstOrderRecurrencePHIRecipe>(Val: Previous)) { |
1680 | assert(PrevPhi->getParent() == FOR->getParent()); |
1681 | assert(SeenPhis.insert(PrevPhi).second); |
1682 | Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe(); |
1683 | } |
1684 | |
1685 | if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) && |
1686 | !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT)) |
1687 | return false; |
1688 | |
1689 | // Introduce a recipe to combine the incoming and previous values of a |
1690 | // fixed-order recurrence. |
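 | // Conceptually (illustrative), the splice produces |
 | //   %splice = first-order-recurrence-splice %for, %for.backedge |
 | // i.e. the last element of the previous vector iteration concatenated with |
 | // the first VF-1 elements of the current one; all existing users of %for |
 | // are then redirected to %splice. |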
1691 | VPBasicBlock *InsertBlock = Previous->getParent(); |
1692 | if (isa<VPHeaderPHIRecipe>(Val: Previous)) |
1693 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, IP: InsertBlock->getFirstNonPhi()); |
1694 | else |
1695 | LoopBuilder.setInsertPoint(TheBB: InsertBlock, |
1696 | IP: std::next(x: Previous->getIterator())); |
1697 | |
1698 | auto *RecurSplice = |
1699 | LoopBuilder.createNaryOp(Opcode: VPInstruction::FirstOrderRecurrenceSplice, |
1700 | Operands: {FOR, FOR->getBackedgeValue()}); |
1701 | |
1702 | FOR->replaceAllUsesWith(New: RecurSplice); |
1703 | // Set the first operand of RecurSplice to FOR again, after replacing |
1704 | // all users. |
1705 | RecurSplice->setOperand(I: 0, New: FOR); |
1706 | } |
1707 | return true; |
1708 | } |
1709 | |
1710 | void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { |
1711 | for (VPRecipeBase &R : |
1712 | Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
1713 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
1714 | if (!PhiR) |
1715 | continue; |
1716 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
1717 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
1718 | if (RK != RecurKind::Add && RK != RecurKind::Mul) |
1719 | continue; |
1720 | |
1721 | for (VPUser *U : collectUsersRecursively(V: PhiR)) |
1722 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: U)) { |
1723 | RecWithFlags->dropPoisonGeneratingFlags(); |
1724 | } |
1725 | } |
1726 | } |
1727 | |
1728 | /// Move loop-invariant recipes out of the vector loop region in \p Plan. |
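/// For example (illustrative): a recipe such as |
 | ///   WIDEN ir<%inv> = mul ir<%a>, ir<%b> |
 | /// whose operands are all defined outside the loop region is moved to the |
 | /// vector preheader so it executes only once. |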
1729 | static void licm(VPlan &Plan) { |
1730 | VPBasicBlock *Preheader = Plan.getVectorPreheader(); |
1731 | |
1732 | // Return true if we do not know how to (mechanically) hoist a given recipe |
1733 | // out of a loop region. Does not address legality concerns such as aliasing |
1734 | // or speculation safety. |
1735 | auto CannotHoistRecipe = [](VPRecipeBase &R) { |
1736 | // Allocas cannot be hoisted. |
1737 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
1738 | return RepR && RepR->getOpcode() == Instruction::Alloca; |
1739 | }; |
1740 | |
1741 | // Hoist any loop invariant recipes from the vector loop region to the |
1742 | // preheader. Perform a shallow traversal of the vector loop region, to |
1743 | // exclude recipes in replicate regions. |
1744 | VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); |
1745 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1746 | Range: vp_depth_first_shallow(G: LoopRegion->getEntry()))) { |
1747 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1748 | if (CannotHoistRecipe(R)) |
1749 | continue; |
1750 | // TODO: Relax checks in the future, e.g. we could also hoist reads, if |
1751 | // their memory location is not modified in the vector loop. |
1752 | if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() || |
1753 | any_of(Range: R.operands(), P: [](VPValue *Op) { |
1754 | return !Op->isDefinedOutsideLoopRegions(); |
1755 | })) |
1756 | continue; |
1757 | R.moveBefore(BB&: *Preheader, I: Preheader->end()); |
1758 | } |
1759 | } |
1760 | } |
1761 | |
1762 | void VPlanTransforms::truncateToMinimalBitwidths( |
1763 | VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) { |
1764 | // Keep track of created truncates, so they can be re-used. Note that we |
1765 | // cannot use RAUW after creating a new truncate, as this could make |
1766 | // other uses have different types for their operands, making them invalidly |
1767 | // typed. |
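 | // Conceptually (illustrative), for an i32 add known to need only 8 bits: |
 | //   %res = add i32 %a, %b |
 | // becomes |
 | //   %a.t = trunc i32 %a to i8 |
 | //   %b.t = trunc i32 %b to i8 |
 | //   %res.narrow = add i8 %a.t, %b.t |
 | //   %res = zext i8 %res.narrow to i32 |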
1768 | DenseMap<VPValue *, VPWidenCastRecipe *> ProcessedTruncs; |
1769 | Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); |
1770 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
1771 | VPBasicBlock *PH = Plan.getVectorPreheader(); |
1772 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1773 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) { |
1774 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
1775 | if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, |
1776 | VPWidenSelectRecipe, VPWidenLoadRecipe, VPWidenIntrinsicRecipe>( |
1777 | Val: &R)) |
1778 | continue; |
1779 | |
1780 | VPValue *ResultVPV = R.getVPSingleValue(); |
1781 | auto *UI = cast_or_null<Instruction>(Val: ResultVPV->getUnderlyingValue()); |
1782 | unsigned NewResSizeInBits = MinBWs.lookup(Key: UI); |
1783 | if (!NewResSizeInBits) |
1784 | continue; |
1785 | |
1786 | // If the value wasn't vectorized, we must maintain the original scalar |
1787 | // type. Skip those here, after incrementing NumProcessedRecipes. Also |
1788 | // skip casts which do not need to be handled explicitly here, as |
1789 | // redundant casts will be removed during recipe simplification. |
1790 | if (isa<VPReplicateRecipe, VPWidenCastRecipe>(Val: &R)) |
1791 | continue; |
1792 | |
1793 | Type *OldResTy = TypeInfo.inferScalarType(V: ResultVPV); |
1794 | unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits(); |
1795 | assert(OldResTy->isIntegerTy() && "only integer types supported"); |
1796 | (void)OldResSizeInBits; |
1797 | |
1798 | LLVMContext &Ctx = CanonicalIVType->getContext(); |
1799 | auto *NewResTy = IntegerType::get(C&: Ctx, NumBits: NewResSizeInBits); |
1800 | |
1801 | // Any wrapping introduced by shrinking this operation shouldn't be |
1802 | // considered undefined behavior. So, we can't unconditionally copy |
1803 | // arithmetic wrapping flags to VPW. |
1804 | if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(Val: &R)) |
1805 | VPW->dropPoisonGeneratingFlags(); |
1806 | |
1807 | using namespace llvm::VPlanPatternMatch; |
1808 | if (OldResSizeInBits != NewResSizeInBits && |
1809 | !match(V: &R, P: m_Binary<Instruction::ICmp>(Op0: m_VPValue(), Op1: m_VPValue()))) { |
1810 | // Extend result to original width. |
1811 | auto *Ext = |
1812 | new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy); |
1813 | Ext->insertAfter(InsertPos: &R); |
1814 | ResultVPV->replaceAllUsesWith(New: Ext); |
1815 | Ext->setOperand(I: 0, New: ResultVPV); |
1816 | assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?"); |
1817 | } else { |
1818 | assert( |
1819 | match(&R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) && |
1820 | "Only ICmps should not need extending the result."); |
1821 | } |
1822 | |
1823 | assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed"); |
1824 | if (isa<VPWidenLoadRecipe, VPWidenIntrinsicRecipe>(Val: &R)) |
1825 | continue; |
1826 | |
1827 | // Shrink operands by introducing truncates as needed. |
1828 | unsigned StartIdx = isa<VPWidenSelectRecipe>(Val: &R) ? 1 : 0; |
1829 | for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) { |
1830 | auto *Op = R.getOperand(N: Idx); |
1831 | unsigned OpSizeInBits = |
1832 | TypeInfo.inferScalarType(V: Op)->getScalarSizeInBits(); |
1833 | if (OpSizeInBits == NewResSizeInBits) |
1834 | continue; |
1835 | assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate"); |
1836 | auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Key: Op); |
1837 | VPWidenCastRecipe *NewOp = |
1838 | IterIsEmpty |
1839 | ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy) |
1840 | : ProcessedIter->second; |
1841 | R.setOperand(I: Idx, New: NewOp); |
1842 | if (!IterIsEmpty) |
1843 | continue; |
1844 | ProcessedIter->second = NewOp; |
1845 | if (!Op->isLiveIn()) { |
1846 | NewOp->insertBefore(InsertPos: &R); |
1847 | } else { |
1848 | PH->appendRecipe(Recipe: NewOp); |
1849 | } |
1850 | } |
1851 | |
1852 | } |
1853 | } |
1854 | } |
1855 | |
1856 | /// Remove BranchOnCond recipes with true or false conditions together with |
1857 | /// removing dead edges to their successors. |
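/// For example (illustrative): a block ending in |
 | ///   BranchOnCond true |
 | /// always takes its first successor; the edge to the second successor is |
 | /// disconnected and the corresponding incoming values are removed from that |
 | /// successor's phis. |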
1858 | static void removeBranchOnConst(VPlan &Plan) { |
1859 | using namespace llvm::VPlanPatternMatch; |
1860 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
1861 | Range: vp_depth_first_shallow(G: Plan.getEntry()))) { |
1862 | VPValue *Cond; |
1863 | if (VPBB->getNumSuccessors() != 2 || VPBB == Plan.getEntry() || |
1864 | !match(V: &VPBB->back(), P: m_BranchOnCond(Op0: m_VPValue(V&: Cond)))) |
1865 | continue; |
1866 | |
1867 | unsigned RemovedIdx; |
1868 | if (match(V: Cond, P: m_True())) |
1869 | RemovedIdx = 1; |
1870 | else if (match(V: Cond, P: m_False())) |
1871 | RemovedIdx = 0; |
1872 | else |
1873 | continue; |
1874 | |
1875 | VPBasicBlock *RemovedSucc = |
1876 | cast<VPBasicBlock>(Val: VPBB->getSuccessors()[RemovedIdx]); |
1877 | assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 && |
1878 | "There must be a single edge between VPBB and its successor"); |
1879 | // Values coming from VPBB into phi recipes of RemoveSucc are removed from |
1880 | // these recipes. |
1881 | for (VPRecipeBase &R : RemovedSucc->phis()) { |
1882 | auto *Phi = cast<VPPhiAccessors>(Val: &R); |
1883 | assert((!isa<VPIRPhi>(&R) || RemovedSucc->getNumPredecessors() == 1) && |
1884 | "VPIRPhis must have a single predecessor"); |
1885 | Phi->removeIncomingValueFor(IncomingBlock: VPBB); |
1886 | } |
1887 | // Disconnect blocks and remove the terminator. RemovedSucc will be deleted |
1888 | // automatically on VPlan destruction if it becomes unreachable. |
1889 | VPBlockUtils::disconnectBlocks(From: VPBB, To: RemovedSucc); |
1890 | VPBB->back().eraseFromParent(); |
1891 | } |
1892 | } |
1893 | |
1894 | void VPlanTransforms::optimize(VPlan &Plan) { |
1895 | runPass(Fn: removeRedundantCanonicalIVs, Plan); |
1896 | runPass(Fn: removeRedundantInductionCasts, Plan); |
1897 | |
1898 | runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType()); |
1899 | runPass(Fn: simplifyBlends, Plan); |
1900 | runPass(Fn: removeDeadRecipes, Plan); |
1901 | runPass(Fn: narrowToSingleScalarRecipes, Plan); |
1902 | runPass(Fn: legalizeAndOptimizeInductions, Plan); |
1903 | runPass(Fn: removeRedundantExpandSCEVRecipes, Plan); |
1904 | runPass(Fn: simplifyRecipes, Plan, Args&: *Plan.getCanonicalIV()->getScalarType()); |
1905 | runPass(Fn: removeBranchOnConst, Plan); |
1906 | runPass(Fn: removeDeadRecipes, Plan); |
1907 | |
1908 | runPass(Fn: createAndOptimizeReplicateRegions, Plan); |
1909 | runPass(Transform: mergeBlocksIntoPredecessors, Plan); |
1910 | runPass(Fn: licm, Plan); |
1911 | } |
1912 | |
1913 | // Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace |
1914 | // the loop terminator with a branch-on-cond recipe with the negated |
1915 | // active-lane-mask as operand. Note that this turns the loop into an |
1916 | // uncountable one. Only the existing terminator is replaced, all other existing |
1917 | // recipes/users remain unchanged, except for poison-generating flags being |
1918 | // dropped from the canonical IV increment. Return the created |
1919 | // VPActiveLaneMaskPHIRecipe. |
1920 | // |
1921 | // The function uses the following definitions: |
1922 | // |
1923 | // %TripCount = DataWithControlFlowWithoutRuntimeCheck ? |
1924 | // calculate-trip-count-minus-VF (original TC) : original TC |
1925 | // %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ? |
1926 | // CanonicalIVPhi : CanonicalIVIncrement |
1927 | // %StartV is the canonical induction start value. |
1928 | // |
1929 | // The function adds the following recipes: |
1930 | // |
1931 | // vector.ph: |
1932 | // %TripCount = calculate-trip-count-minus-VF (original TC) |
1933 | // [if DataWithControlFlowWithoutRuntimeCheck] |
1934 | // %EntryInc = canonical-iv-increment-for-part %StartV |
1935 | // %EntryALM = active-lane-mask %EntryInc, %TripCount |
1936 | // |
1937 | // vector.body: |
1938 | // ... |
1939 | // %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ] |
1940 | // ... |
1941 | // %InLoopInc = canonical-iv-increment-for-part %IncrementValue |
1942 | // %ALM = active-lane-mask %InLoopInc, TripCount |
1943 | // %Negated = Not %ALM |
1944 | // branch-on-cond %Negated |
1945 | // |
1946 | static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( |
1947 | VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { |
1948 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
1949 | VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); |
1950 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
1951 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
1952 | |
1953 | auto *CanonicalIVIncrement = |
1954 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
1955 | // TODO: Check if dropping the flags is needed if |
1956 | // !DataAndControlFlowWithoutRuntimeCheck. |
1957 | CanonicalIVIncrement->dropPoisonGeneratingFlags(); |
1958 | DebugLoc DL = CanonicalIVIncrement->getDebugLoc(); |
1959 | // We can't use StartV directly in the ActiveLaneMask VPInstruction, since |
1960 | // we have to take unrolling into account. Each part needs to start at |
1961 | // Part * VF |
1962 | auto *VecPreheader = Plan.getVectorPreheader(); |
1963 | VPBuilder Builder(VecPreheader); |
1964 | |
1965 | // Create the ActiveLaneMask instruction using the correct start values. |
1966 | VPValue *TC = Plan.getTripCount(); |
1967 | |
1968 | VPValue *TripCount, *IncrementValue; |
1969 | if (!DataAndControlFlowWithoutRuntimeCheck) { |
1970 | // When the loop is guarded by a runtime overflow check for the loop |
1971 | // induction variable increment by VF, we can increment the value before |
1972 | // the get.active.lane.mask intrinsic and use the unmodified trip count. |
1973 | IncrementValue = CanonicalIVIncrement; |
1974 | TripCount = TC; |
1975 | } else { |
1976 | // When avoiding a runtime check, the active.lane.mask inside the loop |
1977 | // uses a modified trip count and the induction variable increment is |
1978 | // done after the active.lane.mask intrinsic is called. |
1979 | IncrementValue = CanonicalIVPHI; |
1980 | TripCount = Builder.createNaryOp(Opcode: VPInstruction::CalculateTripCountMinusVF, |
1981 | Operands: {TC}, DL); |
1982 | } |
1983 | auto *EntryIncrement = Builder.createOverflowingOp( |
1984 | Opcode: VPInstruction::CanonicalIVIncrementForPart, Operands: {StartV}, WrapFlags: {false, false}, DL, |
1985 | Name: "index.part.next"); |
1986 | |
1987 | // Create the active lane mask instruction in the VPlan preheader. |
1988 | auto *EntryALM = |
1989 | Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, Operands: {EntryIncrement, TC}, |
1990 | DL, Name: "active.lane.mask.entry"); |
1991 | |
1992 | // Now create the ActiveLaneMaskPhi recipe in the main loop using the |
1993 | // preheader ActiveLaneMask instruction. |
1994 | auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); |
1995 | LaneMaskPhi->insertAfter(InsertPos: CanonicalIVPHI); |
1996 | |
1997 | // Create the active lane mask for the next iteration of the loop before the |
1998 | // original terminator. |
1999 | VPRecipeBase *OriginalTerminator = EB->getTerminator(); |
2000 | Builder.setInsertPoint(OriginalTerminator); |
2001 | auto *InLoopIncrement = |
2002 | Builder.createOverflowingOp(Opcode: VPInstruction::CanonicalIVIncrementForPart, |
2003 | Operands: {IncrementValue}, WrapFlags: {false, false}, DL); |
2004 | auto *ALM = Builder.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
2005 | Operands: {InLoopIncrement, TripCount}, DL, |
2006 | Name: "active.lane.mask.next"); |
2007 | LaneMaskPhi->addOperand(Operand: ALM); |
2008 | |
2009 | // Replace the original terminator with BranchOnCond. We have to invert the |
2010 | // mask here because a true condition means jumping to the exit block. |
2011 | auto *NotMask = Builder.createNot(Operand: ALM, DL); |
2012 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {NotMask}, DL); |
2013 | OriginalTerminator->eraseFromParent(); |
2014 | return LaneMaskPhi; |
2015 | } |
2016 | |
2017 | /// Collect all VPValues representing a header mask through the (ICMP_ULE, |
2018 | /// WideCanonicalIV, backedge-taken-count) pattern. |
2019 | /// TODO: Introduce explicit recipe for header-mask instead of searching |
2020 | /// for the header-mask pattern manually. |
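/// The collected header masks conceptually look like (illustrative): |
 | ///   %wide.iv = WIDEN-CANONICAL-INDUCTION %canonical.iv |
 | ///   %mask    = icmp ule %wide.iv, broadcast(%backedge.taken.count) |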
2021 | static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) { |
2022 | SmallVector<VPValue *> WideCanonicalIVs; |
2023 | auto *FoundWidenCanonicalIVUser = |
2024 | find_if(Range: Plan.getCanonicalIV()->users(), |
2025 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
2026 | assert(count_if(Plan.getCanonicalIV()->users(), |
2027 | [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }) <= |
2028 | 1 && |
2029 | "Must have at most one VPWideCanonicalIVRecipe"); |
2030 | if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { |
2031 | auto *WideCanonicalIV = |
2032 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
2033 | WideCanonicalIVs.push_back(Elt: WideCanonicalIV); |
2034 | } |
2035 | |
2036 | // Also include VPWidenIntOrFpInductionRecipes that represent a widened |
2037 | // version of the canonical induction. |
2038 | VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
2039 | for (VPRecipeBase &Phi : HeaderVPBB->phis()) { |
2040 | auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
2041 | if (WidenOriginalIV && WidenOriginalIV->isCanonical()) |
2042 | WideCanonicalIVs.push_back(Elt: WidenOriginalIV); |
2043 | } |
2044 | |
2045 | // Walk users of wide canonical IVs and collect to all compares of the form |
2046 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). |
2047 | SmallVector<VPValue *> HeaderMasks; |
2048 | for (auto *Wide : WideCanonicalIVs) { |
2049 | for (VPUser *U : SmallVector<VPUser *>(Wide->users())) { |
2050 | auto *HeaderMask = dyn_cast<VPInstruction>(Val: U); |
2051 | if (!HeaderMask || !vputils::isHeaderMask(V: HeaderMask, Plan)) |
2052 | continue; |
2053 | |
2054 | assert(HeaderMask->getOperand(0) == Wide && |
2055 | "WidenCanonicalIV must be the first operand of the compare"); |
2056 | HeaderMasks.push_back(Elt: HeaderMask); |
2057 | } |
2058 | } |
2059 | return HeaderMasks; |
2060 | } |
2061 | |
2062 | void VPlanTransforms::addActiveLaneMask( |
2063 | VPlan &Plan, bool UseActiveLaneMaskForControlFlow, |
2064 | bool DataAndControlFlowWithoutRuntimeCheck) { |
2065 | assert((!DataAndControlFlowWithoutRuntimeCheck || |
2066 | UseActiveLaneMaskForControlFlow) && |
2067 | "DataAndControlFlowWithoutRuntimeCheck implies " |
2068 | "UseActiveLaneMaskForControlFlow"); |
2069 | |
2070 | auto *FoundWidenCanonicalIVUser = |
2071 | find_if(Range: Plan.getCanonicalIV()->users(), |
2072 | P: [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(Val: U); }); |
2073 | assert(FoundWidenCanonicalIVUser && |
2074 | "Must have widened canonical IV when tail folding!"); |
2075 | auto *WideCanonicalIV = |
2076 | cast<VPWidenCanonicalIVRecipe>(Val: *FoundWidenCanonicalIVUser); |
2077 | VPSingleDefRecipe *LaneMask; |
2078 | if (UseActiveLaneMaskForControlFlow) { |
2079 | LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( |
2080 | Plan, DataAndControlFlowWithoutRuntimeCheck); |
2081 | } else { |
2082 | VPBuilder B = VPBuilder::getToInsertAfter(R: WideCanonicalIV); |
2083 | LaneMask = B.createNaryOp(Opcode: VPInstruction::ActiveLaneMask, |
2084 | Operands: {WideCanonicalIV, Plan.getTripCount()}, Inst: nullptr, |
2085 | Name: "active.lane.mask"); |
2086 | } |
2087 | |
2088 | // Walk users of WideCanonicalIV and replace all compares of the form |
2089 | // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an |
2090 | // active-lane-mask. |
2091 | for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) |
2092 | HeaderMask->replaceAllUsesWith(New: LaneMask); |
2093 | } |
2094 | |
2095 | /// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns |
2096 | /// nullptr if no EVL-based recipe could be created. |
2097 | /// \p HeaderMask Header Mask. |
2098 | /// \p CurRecipe Recipe to be transformed. |
2099 | /// \p TypeInfo VPlan-based type analysis. |
2100 | /// \p AllOneMask The vector mask parameter of vector-predication intrinsics. |
2101 | /// \p EVL The explicit vector length parameter of vector-predication |
2102 | /// intrinsics. |
2103 | /// \p PrevEVL The explicit vector length of the previous iteration. Only |
2104 | /// required if \p CurRecipe is a VPInstruction::FirstOrderRecurrenceSplice. |
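/// For example (illustrative, not actual VPlan syntax): a masked wide load |
 | ///   WIDEN %v = load %addr, mask = logical-and(%header.mask, %m) |
 | /// becomes an EVL-based load that only strips the header mask: |
 | ///   WIDEN %v = vp.load %addr, mask = %m, evl = %EVL |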
2105 | static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, |
2106 | VPRecipeBase &CurRecipe, |
2107 | VPTypeAnalysis &TypeInfo, |
2108 | VPValue &AllOneMask, VPValue &EVL, |
2109 | VPValue *PrevEVL) { |
2110 | using namespace llvm::VPlanPatternMatch; |
2111 | auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { |
2112 | assert(OrigMask && "Unmasked recipe when folding tail"); |
2113 | // HeaderMask will be handled using EVL. |
2114 | VPValue *Mask; |
2115 | if (match(V: OrigMask, P: m_LogicalAnd(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: Mask)))) |
2116 | return Mask; |
2117 | return HeaderMask == OrigMask ? nullptr : OrigMask; |
2118 | }; |
2119 | |
2120 | return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe) |
2121 | .Case<VPWidenLoadRecipe>(caseFn: [&](VPWidenLoadRecipe *L) { |
2122 | VPValue *NewMask = GetNewMask(L->getMask()); |
2123 | return new VPWidenLoadEVLRecipe(*L, EVL, NewMask); |
2124 | }) |
2125 | .Case<VPWidenStoreRecipe>(caseFn: [&](VPWidenStoreRecipe *S) { |
2126 | VPValue *NewMask = GetNewMask(S->getMask()); |
2127 | return new VPWidenStoreEVLRecipe(*S, EVL, NewMask); |
2128 | }) |
2129 | .Case<VPReductionRecipe>(caseFn: [&](VPReductionRecipe *Red) { |
2130 | VPValue *NewMask = GetNewMask(Red->getCondOp()); |
2131 | return new VPReductionEVLRecipe(*Red, EVL, NewMask); |
2132 | }) |
2133 | .Case<VPWidenSelectRecipe>(caseFn: [&](VPWidenSelectRecipe *Sel) { |
2134 | SmallVector<VPValue *> Ops(Sel->operands()); |
2135 | Ops.push_back(Elt: &EVL); |
2136 | return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops, |
2137 | TypeInfo.inferScalarType(Sel), |
2138 | Sel->getDebugLoc()); |
2139 | }) |
2140 | .Case<VPInstruction>(caseFn: [&](VPInstruction *VPI) -> VPRecipeBase * { |
2141 | if (VPI->getOpcode() == VPInstruction::FirstOrderRecurrenceSplice) { |
2142 | assert(PrevEVL && "Fixed-order recurrences require previous EVL"); |
2143 | VPValue *MinusOneVPV = VPI->getParent()->getPlan()->getOrAddLiveIn( |
2144 | V: ConstantInt::getSigned(Ty: Type::getInt32Ty(C&: TypeInfo.getContext()), |
2145 | V: -1)); |
2146 | SmallVector<VPValue *> Ops(VPI->operands()); |
2147 | Ops.append(IL: {MinusOneVPV, &AllOneMask, PrevEVL, &EVL}); |
2148 | return new VPWidenIntrinsicRecipe(Intrinsic::experimental_vp_splice, |
2149 | Ops, TypeInfo.inferScalarType(V: VPI), |
2150 | VPI->getDebugLoc()); |
2151 | } |
2152 | |
2153 | VPValue *LHS, *RHS; |
2154 | // Transform select with a header mask condition |
2155 | // select(header_mask, LHS, RHS) |
2156 | // into vector predication merge. |
2157 | // vp.merge(all-true, LHS, RHS, EVL) |
2158 | if (!match(V: VPI, P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V&: LHS), |
2159 | Op2: m_VPValue(V&: RHS)))) |
2160 | return nullptr; |
2161 | // Use all true as the condition because this transformation is |
2162 | // limited to selects whose condition is a header mask. |
2163 | return new VPWidenIntrinsicRecipe( |
2164 | Intrinsic::vp_merge, {&AllOneMask, LHS, RHS, &EVL}, |
2165 | TypeInfo.inferScalarType(V: LHS), VPI->getDebugLoc()); |
2166 | }) |
2167 | .Default(defaultFn: [&](VPRecipeBase *R) { return nullptr; }); |
2168 | } |
2169 | |
2170 | /// Replace recipes with their EVL variants. |
2171 | static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { |
2172 | Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); |
2173 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
2174 | LLVMContext &Ctx = CanonicalIVType->getContext(); |
2175 | VPValue *AllOneMask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(Context&: Ctx)); |
2176 | VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); |
2177 | VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); |
2178 | |
2179 | // Create a scalar phi to track the previous EVL if the plan contains a |
2180 | // fixed-order recurrence. |
2181 | VPInstruction *PrevEVL = nullptr; |
2182 | bool ContainsFORs = |
2183 | any_of(Range: Header->phis(), P: IsaPred<VPFirstOrderRecurrencePHIRecipe>); |
2184 | if (ContainsFORs) { |
2185 | // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL. |
2186 | VPValue *MaxEVL = &Plan.getVF(); |
2187 | // Emit VPScalarCastRecipe in preheader if VF is not a 32 bits integer. |
2188 | VPBuilder Builder(LoopRegion->getPreheaderVPBB()); |
2189 | if (unsigned VFSize = |
2190 | TypeInfo.inferScalarType(V: MaxEVL)->getScalarSizeInBits(); |
2191 | VFSize != 32) { |
2192 | MaxEVL = Builder.createScalarCast( |
2193 | Opcode: VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, Op: MaxEVL, |
2194 | ResultTy: Type::getInt32Ty(C&: Ctx), DL: DebugLoc()); |
2195 | } |
2196 | Builder.setInsertPoint(TheBB: Header, IP: Header->getFirstNonPhi()); |
2197 | PrevEVL = Builder.createScalarPhi(IncomingValues: {MaxEVL, &EVL}, DL: DebugLoc(), Name: "prev.evl"); |
2198 | } |
2199 | |
2200 | for (VPUser *U : to_vector(Range: Plan.getVF().users())) { |
2201 | if (auto *R = dyn_cast<VPVectorEndPointerRecipe>(Val: U)) |
2202 | R->setOperand(I: 1, New: &EVL); |
2203 | } |
2204 | |
2205 | SmallVector<VPRecipeBase *> ToErase; |
2206 | |
2207 | for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { |
2208 | for (VPUser *U : collectUsersRecursively(V: HeaderMask)) { |
2209 | auto *CurRecipe = cast<VPRecipeBase>(Val: U); |
2210 | VPRecipeBase *EVLRecipe = createEVLRecipe( |
2211 | HeaderMask, CurRecipe&: *CurRecipe, TypeInfo, AllOneMask&: *AllOneMask, EVL, PrevEVL); |
2212 | if (!EVLRecipe) |
2213 | continue; |
2214 | |
2215 | [[maybe_unused]] unsigned NumDefVal = EVLRecipe->getNumDefinedValues(); |
2216 | assert(NumDefVal == CurRecipe->getNumDefinedValues() && |
2217 | "New recipe must define the same number of values as the " |
2218 | "original."); |
2219 | assert( |
2220 | NumDefVal <= 1 && |
2221 | "Only supports recipes with a single definition or without users."); |
2222 | EVLRecipe->insertBefore(InsertPos: CurRecipe); |
2223 | if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(Val: EVLRecipe)) { |
2224 | VPValue *CurVPV = CurRecipe->getVPSingleValue(); |
2225 | CurVPV->replaceAllUsesWith(New: EVLRecipe->getVPSingleValue()); |
2226 | } |
2227 | // Defer erasing recipes till the end so that we don't invalidate the |
2228 | // VPTypeAnalysis cache. |
2229 | ToErase.push_back(Elt: CurRecipe); |
2230 | } |
2231 | } |
2232 | |
2233 | for (VPRecipeBase *R : reverse(C&: ToErase)) { |
2234 | SmallVector<VPValue *> PossiblyDead(R->operands()); |
2235 | R->eraseFromParent(); |
2236 | for (VPValue *Op : PossiblyDead) |
2237 | recursivelyDeleteDeadRecipes(V: Op); |
2238 | } |
2239 | } |
2240 | |
2241 | /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and |
2242 | /// replaces all uses except the canonical IV increment of |
2243 | /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe |
2244 | /// is used only for loop iterations counting after this transformation. |
2245 | /// |
2246 | /// The function uses the following definitions: |
2247 | /// %StartV is the canonical induction start value. |
2248 | /// |
2249 | /// The function adds the following recipes: |
2250 | /// |
2251 | /// vector.ph: |
2252 | /// ... |
2253 | /// |
2254 | /// vector.body: |
2255 | /// ... |
2256 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
2257 | /// [ %NextEVLIV, %vector.body ] |
2258 | /// %AVL = sub original TC, %EVLPhi |
2259 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL |
2260 | /// ... |
2261 | /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi |
2262 | /// ... |
2263 | /// |
2264 | /// If MaxSafeElements is provided, the function adds the following recipes: |
2265 | /// vector.ph: |
2266 | /// ... |
2267 | /// |
2268 | /// vector.body: |
2269 | /// ... |
2270 | /// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], |
2271 | /// [ %NextEVLIV, %vector.body ] |
2272 | /// %AVL = sub original TC, %EVLPhi |
2273 | /// %cmp = cmp ult %AVL, MaxSafeElements |
2274 | /// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements |
2275 | /// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL |
2276 | /// ... |
2277 | /// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi |
2278 | /// ... |
2279 | /// |
2280 | bool VPlanTransforms::tryAddExplicitVectorLength( |
2281 | VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) { |
2282 | VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
2283 | // The transform updates all users of inductions to work based on EVL, instead |
2284 | // of the VF directly. At the moment, widened inductions cannot be updated, so |
2285 | // bail out if the plan contains any. |
2286 | bool ContainsWidenInductions = any_of( |
2287 | Range: Header->phis(), |
2288 | P: IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>); |
2289 | if (ContainsWidenInductions) |
2290 | return false; |
2291 | |
2292 | auto *CanonicalIVPHI = Plan.getCanonicalIV(); |
2293 | VPValue *StartV = CanonicalIVPHI->getStartValue(); |
2294 | |
2295 | // Create the ExplicitVectorLengthPhi recipe in the main loop. |
2296 | auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); |
2297 | EVLPhi->insertAfter(InsertPos: CanonicalIVPHI); |
2298 | VPBuilder Builder(Header, Header->getFirstNonPhi()); |
2299 | // Compute original TC - IV as the AVL (application vector length). |
2300 | VPValue *AVL = Builder.createNaryOp( |
2301 | Opcode: Instruction::Sub, Operands: {Plan.getTripCount(), EVLPhi}, DL: DebugLoc(), Name: "avl"); |
2302 | if (MaxSafeElements) { |
2303 | // Clamp the AVL to MaxSafeElements to honor the maximum safe dependence distance. |
2304 | VPValue *AVLSafe = Plan.getOrAddLiveIn( |
2305 | V: ConstantInt::get(Ty: CanonicalIVPHI->getScalarType(), V: *MaxSafeElements)); |
2306 | VPValue *Cmp = Builder.createICmp(Pred: ICmpInst::ICMP_ULT, A: AVL, B: AVLSafe); |
2307 | AVL = Builder.createSelect(Cond: Cmp, TrueVal: AVL, FalseVal: AVLSafe, DL: DebugLoc(), Name: "safe_avl"); |
2308 | } |
2309 | auto *VPEVL = Builder.createNaryOp(Opcode: VPInstruction::ExplicitVectorLength, Operands: AVL, |
2310 | DL: DebugLoc()); |
2311 | |
2312 | auto *CanonicalIVIncrement = |
2313 | cast<VPInstruction>(Val: CanonicalIVPHI->getBackedgeValue()); |
2314 | Builder.setInsertPoint(CanonicalIVIncrement); |
2315 | VPSingleDefRecipe *OpVPEVL = VPEVL; |
2316 | if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); |
2317 | IVSize != 32) { |
2318 | OpVPEVL = Builder.createScalarCast( |
2319 | Opcode: IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, Op: OpVPEVL, |
2320 | ResultTy: CanonicalIVPHI->getScalarType(), DL: CanonicalIVIncrement->getDebugLoc()); |
2321 | } |
2322 | auto *NextEVLIV = Builder.createOverflowingOp( |
2323 | Opcode: Instruction::Add, Operands: {OpVPEVL, EVLPhi}, |
2324 | WrapFlags: {CanonicalIVIncrement->hasNoUnsignedWrap(), |
2325 | CanonicalIVIncrement->hasNoSignedWrap()}, |
2326 | DL: CanonicalIVIncrement->getDebugLoc(), Name: "index.evl.next"); |
2327 | EVLPhi->addOperand(Operand: NextEVLIV); |
2328 | |
2329 | transformRecipestoEVLRecipes(Plan, EVL&: *VPEVL); |
2330 | |
2331 | // Replace all uses of VPCanonicalIVPHIRecipe by |
2332 | // VPEVLBasedIVPHIRecipe except for the canonical IV increment. |
2333 | CanonicalIVPHI->replaceAllUsesWith(New: EVLPhi); |
2334 | CanonicalIVIncrement->setOperand(I: 0, New: CanonicalIVPHI); |
2335 | // TODO: support unroll factor > 1. |
2336 | Plan.setUF(1); |
2337 | return true; |
2338 | } |
2339 | |
2340 | void VPlanTransforms::dropPoisonGeneratingRecipes( |
2341 | VPlan &Plan, |
2342 | const std::function<bool(BasicBlock *)> &BlockNeedsPredication) { |
2343 | // Collect recipes in the backward slice of `Root` that may generate a poison |
2344 | // value that is used after vectorization. |
2345 | SmallPtrSet<VPRecipeBase *, 16> Visited; |
2346 | auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) { |
2347 | SmallVector<VPRecipeBase *, 16> Worklist; |
2348 | Worklist.push_back(Elt: Root); |
2349 | |
2350 | // Traverse the backward slice of Root through its use-def chain. |
2351 | while (!Worklist.empty()) { |
2352 | VPRecipeBase *CurRec = Worklist.pop_back_val(); |
2353 | |
2354 | if (!Visited.insert(Ptr: CurRec).second) |
2355 | continue; |
2356 | |
2357 | // Prune search if we find another recipe generating a widen memory |
2358 | // instruction. Widen memory instructions involved in address computation |
2359 | // will lead to gather/scatter instructions, which don't need to be |
2360 | // handled. |
2361 | if (isa<VPWidenMemoryRecipe, VPInterleaveRecipe, VPScalarIVStepsRecipe, |
2362 | VPHeaderPHIRecipe>(Val: CurRec)) |
2363 | continue; |
2364 | |
2365 | // This recipe contributes to the address computation of a widen |
2366 | // load/store. If the underlying instruction has poison-generating flags, |
2367 | // drop them directly. |
2368 | if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(Val: CurRec)) { |
2369 | VPValue *A, *B; |
2370 | using namespace llvm::VPlanPatternMatch; |
2371 | // Dropping disjoint from an OR may yield incorrect results, as some |
2372 | // analysis may have converted it to an Add implicitly (e.g. SCEV used |
2373 | // for dependence analysis). Instead, replace it with an equivalent Add. |
2374 | // This is possible as all users of the disjoint OR only access lanes |
2375 | // where the operands are disjoint or poison otherwise. |
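 | // For example (illustrative): "or disjoint %a, %b" is rewritten to |
 | // "add %a, %b" instead of merely dropping the disjoint flag. |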
2376 | if (match(V: RecWithFlags, P: m_BinaryOr(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B))) && |
2377 | RecWithFlags->isDisjoint()) { |
2378 | VPBuilder Builder(RecWithFlags); |
2379 | VPInstruction *New = Builder.createOverflowingOp( |
2380 | Opcode: Instruction::Add, Operands: {A, B}, WrapFlags: {false, false}, |
2381 | DL: RecWithFlags->getDebugLoc()); |
2382 | New->setUnderlyingValue(RecWithFlags->getUnderlyingValue()); |
2383 | RecWithFlags->replaceAllUsesWith(New); |
2384 | RecWithFlags->eraseFromParent(); |
2385 | CurRec = New; |
2386 | } else |
2387 | RecWithFlags->dropPoisonGeneratingFlags(); |
2388 | } else { |
2389 | Instruction *Instr = dyn_cast_or_null<Instruction>( |
2390 | Val: CurRec->getVPSingleValue()->getUnderlyingValue()); |
2391 | (void)Instr; |
2392 | assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && |
2393 | "found instruction with poison generating flags not covered by " |
2394 | "VPRecipeWithIRFlags"); |
2395 | } |
2396 | |
2397 | // Add new definitions to the worklist. |
2398 | for (VPValue *Operand : CurRec->operands()) |
2399 | if (VPRecipeBase *OpDef = Operand->getDefiningRecipe()) |
2400 | Worklist.push_back(Elt: OpDef); |
2401 | } |
2402 | }); |
2403 | |
2404 | // Traverse all the recipes in the VPlan and collect the poison-generating |
2405 | // recipes in the backward slice starting at the address of a VPWidenRecipe or |
2406 | // VPInterleaveRecipe. |
2407 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
2408 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
2409 | for (VPRecipeBase &Recipe : *VPBB) { |
2410 | if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(Val: &Recipe)) { |
2411 | Instruction &UnderlyingInstr = WidenRec->getIngredient(); |
2412 | VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); |
2413 | if (AddrDef && WidenRec->isConsecutive() && |
2414 | BlockNeedsPredication(UnderlyingInstr.getParent())) |
2415 | CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
2416 | } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(Val: &Recipe)) { |
2417 | VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe(); |
2418 | if (AddrDef) { |
2419 | // Check if any member of the interleave group needs predication. |
2420 | const InterleaveGroup<Instruction> *InterGroup = |
2421 | InterleaveRec->getInterleaveGroup(); |
2422 | bool NeedPredication = false; |
2423 | for (int I = 0, NumMembers = InterGroup->getNumMembers(); |
2424 | I < NumMembers; ++I) { |
2425 | Instruction *Member = InterGroup->getMember(Index: I); |
2426 | if (Member) |
2427 | NeedPredication |= BlockNeedsPredication(Member->getParent()); |
2428 | } |
2429 | |
2430 | if (NeedPredication) |
2431 | CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef); |
2432 | } |
2433 | } |
2434 | } |
2435 | } |
2436 | } |
2437 | |
2438 | void VPlanTransforms::createInterleaveGroups( |
2439 | VPlan &Plan, |
2440 | const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> |
2441 | &InterleaveGroups, |
2442 | VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) { |
2443 | if (InterleaveGroups.empty()) |
2444 | return; |
2445 | |
2446 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
2447 | // for this VPlan, replace the Recipes widening its memory instructions with a |
2448 | // single VPInterleaveRecipe at its insertion point. |
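// For example (illustrative, factor-2 group of i32 loads):
//   WIDEN ir<%a> = load ir<%gep.a>
//   WIDEN ir<%b> = load ir<%gep.b>
// is replaced by a single interleave recipe at the group's insert position,
// roughly
//   INTERLEAVE-GROUP factor 2 at ir<%gep.a>, defining vp<%a>, vp<%b>
// and all users of the original wide loads are rewired to its results.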
2449 | VPDominatorTree VPDT; |
2450 | VPDT.recalculate(Func&: Plan); |
2451 | for (const auto *IG : InterleaveGroups) { |
2452 | SmallVector<VPValue *, 4> StoredValues; |
2453 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
2454 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) { |
2455 | auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI)); |
2456 | StoredValues.push_back(Elt: StoreR->getStoredValue()); |
2457 | } |
2458 | |
2459 | bool NeedsMaskForGaps = |
2460 | IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed; |
2461 | |
2462 | Instruction *IRInsertPos = IG->getInsertPos(); |
2463 | auto *InsertPos = |
2464 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IRInsertPos)); |
2465 | |
2466 | // Get or create the start address for the interleave group. |
2467 | auto *Start = |
2468 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getMember(Index: 0))); |
2469 | VPValue *Addr = Start->getAddr(); |
2470 | VPRecipeBase *AddrDef = Addr->getDefiningRecipe(); |
2471 | if (AddrDef && !VPDT.properlyDominates(A: AddrDef, B: InsertPos)) { |
2472 | // TODO: Hoist Addr's defining recipe (and any operands as needed) to |
2473 | // InsertPos or sink loads above zero members to join it. |
2474 | bool InBounds = false; |
2475 | if (auto *Gep = dyn_cast<GetElementPtrInst>( |
2476 | Val: getLoadStorePointerOperand(V: IRInsertPos)->stripPointerCasts())) |
2477 | InBounds = Gep->isInBounds(); |
2478 | |
2479 | // We cannot re-use the address of member zero because it does not |
2480 | // dominate the insert position. Instead, use the address of the insert |
2481 | // position and create a PtrAdd adjusting it to the address of member |
2482 | // zero. |
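// For example, if the insert position is member 2 of a group of i32
// accesses, member 2 lies 2 * 4 = 8 bytes after member 0, so the start
// address is recovered as ptradd(InsertPosAddr, -8) (inbounds if the
// original GEP was inbounds).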
2483 | assert(IG->getIndex(IRInsertPos) != 0 && |
2484 | "index of insert position shouldn't be zero"); |
2485 | auto &DL = IRInsertPos->getDataLayout(); |
2486 | APInt Offset(32, |
2487 | DL.getTypeAllocSize(Ty: getLoadStoreType(I: IRInsertPos)) * |
2488 | IG->getIndex(Instr: IRInsertPos), |
2489 | /*IsSigned=*/true); |
2490 | VPValue *OffsetVPV = Plan.getOrAddLiveIn( |
2491 | V: ConstantInt::get(Context&: IRInsertPos->getParent()->getContext(), V: -Offset)); |
2492 | VPBuilder B(InsertPos); |
2493 | Addr = InBounds ? B.createInBoundsPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV) |
2494 | : B.createPtrAdd(Ptr: InsertPos->getAddr(), Offset: OffsetVPV); |
2495 | } |
2496 | auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues, |
2497 | InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc()); |
2498 | VPIG->insertBefore(InsertPos); |
2499 | |
2500 | unsigned J = 0; |
2501 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
2502 | if (Instruction *Member = IG->getMember(Index: i)) { |
2503 | VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member); |
2504 | if (!Member->getType()->isVoidTy()) { |
2505 | VPValue *OriginalV = MemberR->getVPSingleValue(); |
2506 | OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J)); |
2507 | J++; |
2508 | } |
2509 | MemberR->eraseFromParent(); |
2510 | } |
2511 | } |
2512 | } |
2513 | |
2514 | void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { |
2515 | // Replace loop regions with explicit CFG. |
2516 | SmallVector<VPRegionBlock *> LoopRegions; |
2517 | for (VPRegionBlock *R : VPBlockUtils::blocksOnly<VPRegionBlock>( |
2518 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
2519 | if (!R->isReplicator()) |
2520 | LoopRegions.push_back(Elt: R); |
2521 | } |
2522 | for (VPRegionBlock *R : LoopRegions) |
2523 | R->dissolveToCFGLoop(); |
2524 | } |
2525 | |
2526 | // Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. |
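// For example (illustrative notation), an abstract recipe matched from
//   reduce.add(zext(%vec))
// is expanded back into
//   WIDEN-CAST vp<%ext> = zext %vec to RedTy
//   REDUCE    vp<%red>  = reduce.add(chain, vp<%ext>)
// so later stages only have to handle concrete recipes.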
2527 | static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { |
2528 | VPWidenCastRecipe *Ext; |
2529 | // Only ZExt carries the non-negative (nneg) flag. |
2530 | if (ExtRed->isZExt()) |
2531 | Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), |
2532 | ExtRed->getResultType(), *ExtRed, |
2533 | ExtRed->getDebugLoc()); |
2534 | else |
2535 | Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), |
2536 | ExtRed->getResultType(), {}, |
2537 | ExtRed->getDebugLoc()); |
2538 | |
2539 | auto *Red = new VPReductionRecipe( |
2540 | ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext, |
2541 | ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc()); |
2542 | Ext->insertBefore(InsertPos: ExtRed); |
2543 | Red->insertBefore(InsertPos: ExtRed); |
2544 | ExtRed->replaceAllUsesWith(New: Red); |
2545 | ExtRed->eraseFromParent(); |
2546 | } |
2547 | |
2548 | // Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) + |
2549 | // VPReductionRecipe (reduce.add) + VPWidenCastRecipe (optional inner |
2550 | // extends). |
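// For example (illustrative), an abstract recipe matched from
//   reduce.add(ext(mul(ext(A), ext(B))))
// is expanded into
//   vp<%a.ext> = ext A to RedTy
//   vp<%b.ext> = ext B to RedTy
//   vp<%mul>   = mul vp<%a.ext>, vp<%b.ext>
//   vp<%red>   = reduce.add(chain, vp<%mul>)
// i.e. the outer extend is dropped, as noted below.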
2551 | static void |
2552 | expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { |
2553 | // Generate inner VPWidenCastRecipes if necessary. |
2554 | // Note that we drop the extend after the mul, which transforms |
2555 | // reduce.add(ext(mul(ext, ext))) into reduce.add(mul(ext, ext)). |
2556 | VPValue *Op0, *Op1; |
2557 | if (MulAcc->isExtended()) { |
2558 | Type *RedTy = MulAcc->getResultType(); |
2559 | if (MulAcc->isZExt()) |
2560 | Op0 = new VPWidenCastRecipe( |
2561 | MulAcc->getExtOpcode(), MulAcc->getVecOp0(), RedTy, |
2562 | VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), MulAcc->getDebugLoc()); |
2563 | else |
2564 | Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), |
2565 | RedTy, {}, MulAcc->getDebugLoc()); |
2566 | Op0->getDefiningRecipe()->insertBefore(InsertPos: MulAcc); |
2567 | // Prevent reduce.add(mul(ext(A), ext(A))) from generating duplicate |
2568 | // VPWidenCastRecipes for the same operand. |
2569 | if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) { |
2570 | Op1 = Op0; |
2571 | } else { |
2572 | if (MulAcc->isZExt()) |
2573 | Op1 = new VPWidenCastRecipe( |
2574 | MulAcc->getExtOpcode(), MulAcc->getVecOp1(), RedTy, |
2575 | VPIRFlags::NonNegFlagsTy(MulAcc->isNonNeg()), |
2576 | MulAcc->getDebugLoc()); |
2577 | else |
2578 | Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), |
2579 | RedTy, {}, MulAcc->getDebugLoc()); |
2580 | Op1->getDefiningRecipe()->insertBefore(InsertPos: MulAcc); |
2581 | } |
2582 | } else { |
2583 | // No extends in this MulAccRecipe. |
2584 | Op0 = MulAcc->getVecOp0(); |
2585 | Op1 = MulAcc->getVecOp1(); |
2586 | } |
2587 | |
2588 | std::array<VPValue *, 2> MulOps = {Op0, Op1}; |
2589 | auto *Mul = new VPWidenRecipe( |
2590 | Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(), |
2591 | MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc()); |
2592 | Mul->insertBefore(InsertPos: MulAcc); |
2593 | |
2594 | auto *Red = new VPReductionRecipe( |
2595 | MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul, |
2596 | MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc()); |
2597 | Red->insertBefore(InsertPos: MulAcc); |
2598 | |
2599 | MulAcc->replaceAllUsesWith(New: Red); |
2600 | MulAcc->eraseFromParent(); |
2601 | } |
2602 | |
2603 | void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, |
2604 | Type &CanonicalIVTy) { |
2605 | using namespace llvm::VPlanPatternMatch; |
2606 | |
2607 | VPTypeAnalysis TypeInfo(&CanonicalIVTy); |
2608 | SmallVector<VPRecipeBase *> ToRemove; |
2609 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
2610 | Range: vp_depth_first_deep(G: Plan.getEntry()))) { |
2611 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
2612 | if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(Val: &R)) { |
2613 | auto *ScalarR = VPBuilder(PhiR).createScalarPhi( |
2614 | IncomingValues: {PhiR->getStartValue(), PhiR->getBackedgeValue()}, |
2615 | DL: PhiR->getDebugLoc(), Name: "evl.based.iv"); |
2616 | PhiR->replaceAllUsesWith(New: ScalarR); |
2617 | ToRemove.push_back(Elt: PhiR); |
2618 | continue; |
2619 | } |
2620 | |
2621 | VPValue *VectorStep; |
2622 | VPValue *ScalarStep; |
2623 | if (!match(V: &R, P: m_VPInstruction<VPInstruction::WideIVStep>( |
2624 | Op0: m_VPValue(V&: VectorStep), Op1: m_VPValue(V&: ScalarStep)))) |
2625 | continue; |
2626 | |
2627 | // Expand WideIVStep. |
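// Conceptually, WideIVStep(VectorStep, ScalarStep) is lowered to
//   VectorStep' = cast VectorStep to IVTy    (if the types differ)
//   ScalarStep' = trunc ScalarStep to IVTy   (if the types differ)
//   Step        = (f)mul VectorStep', ScalarStep'
// using fmul with the instruction's fast-math flags for floating-point IVs.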
2628 | auto *VPI = cast<VPInstruction>(Val: &R); |
2629 | VPBuilder Builder(VPI); |
2630 | Type *IVTy = TypeInfo.inferScalarType(V: VPI); |
2631 | if (TypeInfo.inferScalarType(V: VectorStep) != IVTy) { |
2632 | Instruction::CastOps CastOp = IVTy->isFloatingPointTy() |
2633 | ? Instruction::UIToFP |
2634 | : Instruction::Trunc; |
2635 | VectorStep = Builder.createWidenCast(Opcode: CastOp, Op: VectorStep, ResultTy: IVTy); |
2636 | } |
2637 | |
2638 | [[maybe_unused]] auto *ConstStep = |
2639 | ScalarStep->isLiveIn() |
2640 | ? dyn_cast<ConstantInt>(Val: ScalarStep->getLiveInIRValue()) |
2641 | : nullptr; |
2642 | assert(!ConstStep || ConstStep->getValue() != 1); |
2643 | (void)ConstStep; |
2644 | if (TypeInfo.inferScalarType(V: ScalarStep) != IVTy) { |
2645 | ScalarStep = |
2646 | Builder.createWidenCast(Opcode: Instruction::Trunc, Op: ScalarStep, ResultTy: IVTy); |
2647 | } |
2648 | |
2649 | VPIRFlags Flags; |
2650 | if (IVTy->isFloatingPointTy()) |
2651 | Flags = {VPI->getFastMathFlags()}; |
2652 | |
2653 | unsigned MulOpc = |
2654 | IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul; |
2655 | VPInstruction *Mul = Builder.createNaryOp( |
2656 | Opcode: MulOpc, Operands: {VectorStep, ScalarStep}, Flags, DL: R.getDebugLoc()); |
2657 | VectorStep = Mul; |
2658 | VPI->replaceAllUsesWith(New: VectorStep); |
2659 | ToRemove.push_back(Elt: VPI); |
2660 | } |
2661 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
2662 | if (auto *ExtRed = dyn_cast<VPExtendedReductionRecipe>(Val: &R)) { |
2663 | expandVPExtendedReduction(ExtRed); |
2664 | continue; |
2665 | } |
2666 | if (auto *MulAcc = dyn_cast<VPMulAccumulateReductionRecipe>(Val: &R)) |
2667 | expandVPMulAccumulateReduction(MulAcc); |
2668 | } |
2669 | } |
2670 | |
2671 | for (VPRecipeBase *R : ToRemove) |
2672 | R->eraseFromParent(); |
2673 | } |
2674 | |
2675 | void VPlanTransforms::handleUncountableEarlyExit( |
2676 | VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, |
2677 | VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) { |
2678 | using namespace llvm::VPlanPatternMatch; |
2679 | |
2680 | VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0]; |
2681 | if (!EarlyExitVPBB->getSinglePredecessor() && |
2682 | EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) { |
2683 | assert(EarlyExitVPBB->getNumPredecessors() == 2 && |
2684 | EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB && |
2685 | "unsupported early exit VPBB"); |
2686 | // The early-exit operand should always be the last phi operand. If |
2687 | // EarlyExitVPBB has two predecessors and EarlyExitingVPBB is the first, |
2688 | // swap the operands of the phis. |
2689 | for (VPRecipeBase &R : EarlyExitVPBB->phis()) |
2690 | cast<VPIRPhi>(Val: &R)->swapOperands(); |
2691 | } |
2692 | |
2693 | VPBuilder Builder(LatchVPBB->getTerminator()); |
2694 | VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0]; |
2695 | assert( |
2696 | match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) && |
2697 | "Terminator must be be BranchOnCond"); |
2698 | VPValue *CondOfEarlyExitingVPBB = |
2699 | EarlyExitingVPBB->getTerminator()->getOperand(N: 0); |
2700 | auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB |
2701 | ? CondOfEarlyExitingVPBB |
2702 | : Builder.createNot(Operand: CondOfEarlyExitingVPBB); |
2703 | |
2704 | // Split the middle block and have it conditionally branch to the early exit |
2705 | // block if CondToEarlyExit. |
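// Illustrative shape of the CFG built below (block names as created here):
//   latch             -> middle.split
//   middle.split      -> vector.early.exit     (if any lane took the early exit)
//   middle.split      -> original middle block (otherwise)
//   vector.early.exit -> early exit block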
2706 | VPValue *IsEarlyExitTaken = |
2707 | Builder.createNaryOp(Opcode: VPInstruction::AnyOf, Operands: {CondToEarlyExit}); |
2708 | VPBasicBlock *NewMiddle = Plan.createVPBasicBlock(Name: "middle.split"); |
2709 | VPBasicBlock *VectorEarlyExitVPBB = |
2710 | Plan.createVPBasicBlock(Name: "vector.early.exit"); |
2711 | VPBlockUtils::insertOnEdge(From: LatchVPBB, To: MiddleVPBB, BlockPtr: NewMiddle); |
2712 | VPBlockUtils::connectBlocks(From: NewMiddle, To: VectorEarlyExitVPBB); |
2713 | NewMiddle->swapSuccessors(); |
2714 | |
2715 | VPBlockUtils::connectBlocks(From: VectorEarlyExitVPBB, To: EarlyExitVPBB); |
2716 | |
2717 | // Update the exit phis in the early exit block. |
2718 | VPBuilder MiddleBuilder(NewMiddle); |
2719 | VPBuilder EarlyExitB(VectorEarlyExitVPBB); |
2720 | for (VPRecipeBase &R : EarlyExitVPBB->phis()) { |
2721 | auto *ExitIRI = cast<VPIRPhi>(Val: &R); |
2722 | // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has |
2723 | // a single predecessor and 1 if it has two. |
2724 | unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1; |
2725 | if (ExitIRI->getNumOperands() != 1) { |
2726 | // The first of the two operands corresponds to the latch exit, via the |
2727 | // MiddleVPBB predecessor. Extract its last lane. |
2728 | ExitIRI->extractLastLaneOfFirstOperand(Builder&: MiddleBuilder); |
2729 | } |
2730 | |
2731 | VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(N: EarlyExitIdx); |
2732 | auto IsVector = [](ElementCount VF) { return VF.isVector(); }; |
2733 | // When the VF is a vector, an `extract` is needed to get the incoming value |
2734 | // from the early exit. If the range also contains a scalar VF, clamp the |
2735 | // range to the scalar VF to avoid mis-compiling a range that mixes scalar |
2736 | // and vector VFs. |
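// Sketch of the recipes created below for a vector VF:
//   vp<%lane> = first-active-lane(vp<%cond.to.early.exit>)
//   vp<%val>  = extractelement vp<%incoming>, vp<%lane>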
2737 | if (!IncomingFromEarlyExit->isLiveIn() && |
2738 | LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsVector, Range)) { |
2739 | // Update the incoming value from the early exit. |
2740 | VPValue *FirstActiveLane = EarlyExitB.createNaryOp( |
2741 | Opcode: VPInstruction::FirstActiveLane, Operands: {CondToEarlyExit}, Inst: nullptr, |
2742 | Name: "first.active.lane"); |
2743 | IncomingFromEarlyExit = EarlyExitB.createNaryOp( |
2744 | Opcode: Instruction::ExtractElement, Operands: {IncomingFromEarlyExit, FirstActiveLane}, |
2745 | Inst: nullptr, Name: "early.exit.value"); |
2746 | ExitIRI->setOperand(I: EarlyExitIdx, New: IncomingFromEarlyExit); |
2747 | } |
2748 | } |
2749 | MiddleBuilder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: {IsEarlyExitTaken}); |
2750 | |
2751 | // Replace the condition controlling the non-early exit from the vector loop |
2752 | // with one exiting if either the original condition of the vector latch is |
2753 | // true or the early exit has been taken. |
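// Conceptually the latch terminator becomes
//   vp<%latch.taken> = icmp eq vp<%iv.next>, vp<%trip.count>
//   vp<%any.exit>    = or vp<%early.exit.taken>, vp<%latch.taken>
//   branch-on-cond vp<%any.exit>
// replacing the previous branch-on-count.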
2754 | auto *LatchExitingBranch = cast<VPInstruction>(Val: LatchVPBB->getTerminator()); |
2755 | assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && |
2756 | "Unexpected terminator"); |
2757 | auto *IsLatchExitTaken = |
2758 | Builder.createICmp(Pred: CmpInst::ICMP_EQ, A: LatchExitingBranch->getOperand(N: 0), |
2759 | B: LatchExitingBranch->getOperand(N: 1)); |
2760 | auto *AnyExitTaken = Builder.createNaryOp( |
2761 | Opcode: Instruction::Or, Operands: {IsEarlyExitTaken, IsLatchExitTaken}); |
2762 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, Operands: AnyExitTaken); |
2763 | LatchExitingBranch->eraseFromParent(); |
2764 | } |
2765 | |
2766 | /// This function tries to convert extended in-loop reductions to |
2767 | /// VPExtendedReductionRecipe and clamps the \p Range if doing so is |
2768 | /// beneficial and valid. The created recipe must be lowered to concrete |
2769 | /// recipes before execution. |
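/// For example (illustrative), an in-loop reduction of the form
///   reduce.add(zext(%a))
/// is turned into a single VPExtendedReductionRecipe when the target reports
/// that the combined extended reduction is cheaper than the separate extend
/// plus reduction for the clamped VF range.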
2770 | static VPExtendedReductionRecipe * |
2771 | tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, |
2772 | VFRange &Range) { |
2773 | using namespace VPlanPatternMatch; |
2774 | |
2775 | Type *RedTy = Ctx.Types.inferScalarType(V: Red); |
2776 | VPValue *VecOp = Red->getVecOp(); |
2777 | |
2778 | // Clamp the range if using extended-reduction is profitable. |
2779 | auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, |
2780 | Type *SrcTy) -> bool { |
2781 | return LoopVectorizationPlanner::getDecisionAndClampRange( |
2782 | Predicate: [&](ElementCount VF) { |
2783 | auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF)); |
2784 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2785 | InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( |
2786 | Opcode, IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, FMF: Red->getFastMathFlags(), |
2787 | CostKind); |
2788 | InstructionCost ExtCost = |
2789 | cast<VPWidenCastRecipe>(Val: VecOp)->computeCost(VF, Ctx); |
2790 | InstructionCost RedCost = Red->computeCost(VF, Ctx); |
2791 | return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; |
2792 | }, |
2793 | Range); |
2794 | }; |
2795 | |
2796 | VPValue *A; |
2797 | // Match reduce(ext(A)). |
2798 | if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_VPValue(V&: A))) && |
2799 | IsExtendedRedValidAndClampRange( |
2800 | RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()), |
2801 | cast<VPWidenCastRecipe>(Val: VecOp)->getOpcode() == |
2802 | Instruction::CastOps::ZExt, |
2803 | Ctx.Types.inferScalarType(V: A))) |
2804 | return new VPExtendedReductionRecipe(Red, cast<VPWidenCastRecipe>(Val: VecOp)); |
2805 | |
2806 | return nullptr; |
2807 | } |
2808 | |
2809 | /// This function tries to convert extended in-loop reductions to |
2810 | /// VPMulAccumulateReductionRecipe and clamps the \p Range if beneficial and |
2811 | /// valid. The created VPMulAccumulateReductionRecipe must be lowered to |
2812 | /// concrete recipes before execution. Patterns of MulAccumulateReduction: |
2813 | /// reduce.add(mul(...)), |
2814 | /// reduce.add(mul(ext(A), ext(B))), |
2815 | /// reduce.add(ext(mul(ext(A), ext(B)))). |
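/// For example (illustrative), reduce.add(mul(sext(%a), sext(%b))) becomes a
/// single abstract recipe when the target's multiply-accumulate reduction
/// cost is valid and lower than the summed costs of the extends, the mul and
/// the reduction for the clamped VF range.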
2816 | static VPMulAccumulateReductionRecipe * |
2817 | tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, |
2818 | VPCostContext &Ctx, VFRange &Range) { |
2819 | using namespace VPlanPatternMatch; |
2820 | |
2821 | unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: Red->getRecurrenceKind()); |
2822 | if (Opcode != Instruction::Add) |
2823 | return nullptr; |
2824 | |
2825 | Type *RedTy = Ctx.Types.inferScalarType(V: Red); |
2826 | |
2827 | // Clamp the range if using multiply-accumulate-reduction is profitable. |
2828 | auto IsMulAccValidAndClampRange = |
2829 | [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, |
2830 | VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { |
2831 | return LoopVectorizationPlanner::getDecisionAndClampRange( |
2832 | Predicate: [&](ElementCount VF) { |
2833 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2834 | Type *SrcTy = |
2835 | Ext0 ? Ctx.Types.inferScalarType(V: Ext0->getOperand(N: 0)) : RedTy; |
2836 | auto *SrcVecTy = cast<VectorType>(Val: toVectorTy(Scalar: SrcTy, EC: VF)); |
2837 | InstructionCost MulAccCost = |
2838 | Ctx.TTI.getMulAccReductionCost(IsUnsigned: isZExt, ResTy: RedTy, Ty: SrcVecTy, CostKind); |
2839 | InstructionCost MulCost = Mul->computeCost(VF, Ctx); |
2840 | InstructionCost RedCost = Red->computeCost(VF, Ctx); |
2841 | InstructionCost ExtCost = 0; |
2842 | if (Ext0) |
2843 | ExtCost += Ext0->computeCost(VF, Ctx); |
2844 | if (Ext1) |
2845 | ExtCost += Ext1->computeCost(VF, Ctx); |
2846 | if (OuterExt) |
2847 | ExtCost += OuterExt->computeCost(VF, Ctx); |
2848 | |
2849 | return MulAccCost.isValid() && |
2850 | MulAccCost < ExtCost + MulCost + RedCost; |
2851 | }, |
2852 | Range); |
2853 | }; |
2854 | |
2855 | VPValue *VecOp = Red->getVecOp(); |
2856 | VPValue *A, *B; |
2857 | // Try to match reduce.add(mul(...)). |
2858 | if (match(V: VecOp, P: m_Mul(Op0: m_VPValue(V&: A), Op1: m_VPValue(V&: B)))) { |
2859 | auto *RecipeA = |
2860 | dyn_cast_if_present<VPWidenCastRecipe>(Val: A->getDefiningRecipe()); |
2861 | auto *RecipeB = |
2862 | dyn_cast_if_present<VPWidenCastRecipe>(Val: B->getDefiningRecipe()); |
2863 | auto *Mul = cast<VPWidenRecipe>(Val: VecOp->getDefiningRecipe()); |
2864 | |
2865 | // Match reduce.add(mul(ext, ext)). |
2866 | if (RecipeA && RecipeB && |
2867 | (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && |
2868 | match(V: RecipeA, P: m_ZExtOrSExt(Op0: m_VPValue())) && |
2869 | match(V: RecipeB, P: m_ZExtOrSExt(Op0: m_VPValue())) && |
2870 | IsMulAccValidAndClampRange(RecipeA->getOpcode() == |
2871 | Instruction::CastOps::ZExt, |
2872 | Mul, RecipeA, RecipeB, nullptr)) |
2873 | return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, |
2874 | RecipeA->getResultType()); |
2875 | // Match reduce.add(mul). |
2876 | if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) |
2877 | return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy); |
2878 | } |
2879 | // Match reduce.add(ext(mul(ext(A), ext(B)))). |
2880 | // All extend recipes must have the same opcode, or A == B, which can be |
2881 | // transformed to reduce.add(zext(mul(sext(A), sext(B)))). |
2882 | if (match(V: VecOp, P: m_ZExtOrSExt(Op0: m_Mul(Op0: m_ZExtOrSExt(Op0: m_VPValue()), |
2883 | Op1: m_ZExtOrSExt(Op0: m_VPValue()))))) { |
2884 | auto *Ext = cast<VPWidenCastRecipe>(Val: VecOp->getDefiningRecipe()); |
2885 | auto *Mul = cast<VPWidenRecipe>(Val: Ext->getOperand(N: 0)->getDefiningRecipe()); |
2886 | auto *Ext0 = |
2887 | cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 0)->getDefiningRecipe()); |
2888 | auto *Ext1 = |
2889 | cast<VPWidenCastRecipe>(Val: Mul->getOperand(N: 1)->getDefiningRecipe()); |
2890 | if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && |
2891 | Ext0->getOpcode() == Ext1->getOpcode() && |
2892 | IsMulAccValidAndClampRange(Ext0->getOpcode() == |
2893 | Instruction::CastOps::ZExt, |
2894 | Mul, Ext0, Ext1, Ext)) |
2895 | return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1, |
2896 | Ext->getResultType()); |
2897 | } |
2898 | return nullptr; |
2899 | } |
2900 | |
2901 | /// This function tries to create abstract recipes from the reduction recipe for |
2902 | /// following optimizations and cost estimation. |
2903 | static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, |
2904 | VPCostContext &Ctx, |
2905 | VFRange &Range) { |
2906 | VPReductionRecipe *AbstractR = nullptr; |
2907 | |
2908 | if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range)) |
2909 | AbstractR = MulAcc; |
2910 | else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range)) |
2911 | AbstractR = ExtRed; |
2912 | // Could not create an abstract in-loop reduction recipe. |
2913 | if (!AbstractR) |
2914 | return; |
2915 | |
2916 | AbstractR->insertBefore(InsertPos: Red); |
2917 | Red->replaceAllUsesWith(New: AbstractR); |
2918 | } |
2919 | |
2920 | void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, |
2921 | VFRange &Range) { |
2922 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
2923 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()))) { |
2924 | for (VPRecipeBase &R : *VPBB) { |
2925 | if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) |
2926 | tryToCreateAbstractReductionRecipe(Red, Ctx, Range); |
2927 | } |
2928 | } |
2929 | } |
2930 | |
2931 | void VPlanTransforms::materializeStepVectors(VPlan &Plan) { |
2932 | for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
2933 | auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &Phi); |
2934 | if (!IVR) |
2935 | continue; |
2936 | |
2937 | Type *Ty = IVR->getPHINode()->getType(); |
2938 | if (TruncInst *Trunc = IVR->getTruncInst()) |
2939 | Ty = Trunc->getType(); |
2940 | if (Ty->isFloatingPointTy()) |
2941 | Ty = IntegerType::get(C&: Ty->getContext(), NumBits: Ty->getScalarSizeInBits()); |
2942 | |
2943 | VPBuilder Builder(Plan.getVectorPreheader()); |
2944 | VPInstruction *StepVector = Builder.createNaryOp( |
2945 | Opcode: VPInstruction::StepVector, Operands: {}, ResultTy: Ty, Flags: {}, DL: IVR->getDebugLoc()); |
2946 | assert(IVR->getNumOperands() == 3 && |
2947 | "can only add step vector before unrolling"); |
2948 | IVR->addOperand(Operand: StepVector); |
2949 | } |
2950 | } |
2951 | |
2952 | void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { |
2953 | if (Plan.hasScalarVFOnly()) |
2954 | return; |
2955 | |
2956 | #ifndef NDEBUG |
2957 | VPDominatorTree VPDT; |
2958 | VPDT.recalculate(Func&: Plan); |
2959 | #endif |
2960 | |
2961 | SmallVector<VPValue *> VPValues; |
2962 | if (Plan.getOrCreateBackedgeTakenCount()->getNumUsers() > 0) |
2963 | VPValues.push_back(Elt: Plan.getOrCreateBackedgeTakenCount()); |
2964 | append_range(C&: VPValues, R: Plan.getLiveIns()); |
2965 | for (VPRecipeBase &R : *Plan.getEntry()) |
2966 | append_range(C&: VPValues, R: R.definedValues()); |
2967 | |
2968 | auto *VectorPreheader = Plan.getVectorPreheader(); |
2969 | for (VPValue *VPV : VPValues) { |
2970 | if (all_of(Range: VPV->users(), |
2971 | P: [VPV](VPUser *U) { return U->usesScalars(Op: VPV); }) || |
2972 | (VPV->isLiveIn() && VPV->getLiveInIRValue() && |
2973 | isa<Constant>(Val: VPV->getLiveInIRValue()))) |
2974 | continue; |
2975 | |
2976 | // Add explicit broadcast at the insert point that dominates all users. |
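// E.g. (illustrative) a live-in scalar ir<%n> with a vector user inside the
// loop gets
//   vp<%n.bc> = broadcast ir<%n>
// in the vector preheader, and such users are rewired to vp<%n.bc>.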
2977 | VPBasicBlock *HoistBlock = VectorPreheader; |
2978 | VPBasicBlock::iterator HoistPoint = VectorPreheader->end(); |
2979 | for (VPUser *User : VPV->users()) { |
2980 | if (User->usesScalars(Op: VPV)) |
2981 | continue; |
2982 | if (cast<VPRecipeBase>(Val: User)->getParent() == VectorPreheader) |
2983 | HoistPoint = HoistBlock->begin(); |
2984 | else |
2985 | assert(VPDT.dominates(VectorPreheader, |
2986 | cast<VPRecipeBase>(User)->getParent()) && |
2987 | "All users must be in the vector preheader or dominated by it"); |
2988 | } |
2989 | |
2990 | VPBuilder Builder(cast<VPBasicBlock>(Val: HoistBlock), HoistPoint); |
2991 | auto *Broadcast = Builder.createNaryOp(Opcode: VPInstruction::Broadcast, Operands: {VPV}); |
2992 | VPV->replaceUsesWithIf(New: Broadcast, |
2993 | ShouldReplace: [VPV, Broadcast](VPUser &U, unsigned Idx) { |
2994 | return Broadcast != &U && !U.usesScalars(Op: VPV); |
2995 | }); |
2996 | } |
2997 | } |
2998 | |
2999 | /// Returns true if \p V is a VPWidenLoadRecipe or VPInterleaveRecipe that can |
3000 | /// be converted to a narrower recipe. \p V is used by a wide recipe |
3001 | /// \p WideMember that feeds a store interleave group at index \p Idx, and |
3002 | /// \p WideMember0 is the recipe feeding the same interleave group at index 0. |
3003 | /// A VPWidenLoadRecipe can be narrowed to an index-independent load if it |
3004 | /// feeds all wide ops at all indices (checked via the operands of the wide |
3005 | /// recipe at lane 0, \p WideMember0). A VPInterleaveRecipe can be narrowed to |
3006 | /// a wide load if \p V is defined at \p Idx of a load interleave group. |
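/// For example (illustrative, factor-2 groups): if WideMember0 and WideMember
/// both compute mul(%wide.load, <other op>) and \p V is that unmasked wide
/// load, it appears at the same operand position in both and can be narrowed;
/// if \p V is instead defined by a full load interleave group, it can only be
/// narrowed when it is that group's defined value at index \p Idx.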
3007 | static bool canNarrowLoad(VPWidenRecipe *WideMember0, VPWidenRecipe *WideMember, |
3008 | VPValue *V, unsigned Idx) { |
3009 | auto *DefR = V->getDefiningRecipe(); |
3010 | if (!DefR) |
3011 | return false; |
3012 | if (auto *W = dyn_cast<VPWidenLoadRecipe>(Val: DefR)) |
3013 | return !W->getMask() && |
3014 | all_of(Range: zip(t: WideMember0->operands(), u: WideMember->operands()), |
3015 | P: [V](const auto P) { |
3016 | // V must appear at the same operand positions in both |
3017 | // WideMember0 and WideMember. |
3018 | const auto &[WideMember0Op, WideMemberOp] = P; |
3019 | return (WideMember0Op == V) == (WideMemberOp == V); |
3020 | }); |
3021 | |
3022 | if (auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR)) |
3023 | return IR->getInterleaveGroup()->getFactor() == |
3024 | IR->getInterleaveGroup()->getNumMembers() && |
3025 | IR->getVPValue(I: Idx) == V; |
3026 | return false; |
3027 | } |
3028 | |
3029 | /// Returns true if \p InterleaveR is a full interleave group with factor and |
3030 | /// number of members both equal to \p VF. The interleave group must also |
3031 | /// access the full vector width \p VectorRegWidth. |
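/// For example, a factor-4 group with 4 members of i32 at VF=4 covers
/// 4 * 32 = 128 bits and is consecutive in this sense when the vector
/// register width is 128 bits.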
3032 | static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, |
3033 | unsigned VF, VPTypeAnalysis &TypeInfo, |
3034 | unsigned VectorRegWidth) { |
3035 | if (!InterleaveR) |
3036 | return false; |
3037 | |
3038 | Type *GroupElementTy = nullptr; |
3039 | if (InterleaveR->getStoredValues().empty()) { |
3040 | GroupElementTy = TypeInfo.inferScalarType(V: InterleaveR->getVPValue(I: 0)); |
3041 | if (!all_of(Range: InterleaveR->definedValues(), |
3042 | P: [&TypeInfo, GroupElementTy](VPValue *Op) { |
3043 | return TypeInfo.inferScalarType(V: Op) == GroupElementTy; |
3044 | })) |
3045 | return false; |
3046 | } else { |
3047 | GroupElementTy = |
3048 | TypeInfo.inferScalarType(V: InterleaveR->getStoredValues()[0]); |
3049 | if (!all_of(Range: InterleaveR->getStoredValues(), |
3050 | P: [&TypeInfo, GroupElementTy](VPValue *Op) { |
3051 | return TypeInfo.inferScalarType(V: Op) == GroupElementTy; |
3052 | })) |
3053 | return false; |
3054 | } |
3055 | |
3056 | unsigned GroupSize = GroupElementTy->getScalarSizeInBits() * VF; |
3057 | auto IG = InterleaveR->getInterleaveGroup(); |
3058 | return IG->getFactor() == VF && IG->getNumMembers() == VF && |
3059 | GroupSize == VectorRegWidth; |
3060 | } |
3061 | |
3062 | void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, |
3063 | unsigned VectorRegWidth) { |
3064 | using namespace llvm::VPlanPatternMatch; |
3065 | VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); |
3066 | if (VF.isScalable() || !VectorLoop) |
3067 | return; |
3068 | |
3069 | VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); |
3070 | Type *CanonicalIVType = CanonicalIV->getScalarType(); |
3071 | VPTypeAnalysis TypeInfo(CanonicalIVType); |
3072 | |
3073 | unsigned FixedVF = VF.getFixedValue(); |
3074 | SmallVector<VPInterleaveRecipe *> StoreGroups; |
3075 | for (auto &R : *VectorLoop->getEntryBasicBlock()) { |
3076 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R) || |
3077 | match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue()))) |
3078 | continue; |
3079 | |
3080 | // Bail out on recipes not supported at the moment: |
3081 | // * phi recipes other than the canonical induction |
3082 | // * recipes writing to memory except interleave groups |
3083 | // Only support plans with a canonical induction phi. |
3084 | if (R.isPhi()) |
3085 | return; |
3086 | |
3087 | auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R); |
3088 | if (R.mayWriteToMemory() && !InterleaveR) |
3089 | return; |
3090 | |
3091 | // Do not narrow interleave groups if there are VectorPointer recipes and |
3092 | // the plan was unrolled. The recipe implicitly uses VF from |
3093 | // VPTransformState. |
3094 | // TODO: Remove restriction once the VF for the VectorPointer offset is |
3095 | // modeled explicitly as operand. |
3096 | if (isa<VPVectorPointerRecipe>(Val: &R) && Plan.getUF() > 1) |
3097 | return; |
3098 | |
3099 | // All other ops are allowed, but we reject uses that cannot be converted |
3100 | // when checking all allowed consumers (store interleave groups) below. |
3101 | if (!InterleaveR) |
3102 | continue; |
3103 | |
3104 | // Bail out on non-consecutive interleave groups. |
3105 | if (!isConsecutiveInterleaveGroup(InterleaveR, VF: FixedVF, TypeInfo, |
3106 | VectorRegWidth)) |
3107 | return; |
3108 | |
3109 | // Skip read interleave groups. |
3110 | if (InterleaveR->getStoredValues().empty()) |
3111 | continue; |
3112 | |
3113 | // For now, we only support full interleave groups storing load interleave |
3114 | // groups. |
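// For example (illustrative), a store group whose stored values are exactly
// the values defined, index for index, by a matching full load group:
//   vp<%v0>, vp<%v1> = load-interleave-group (factor 2)
//   store-interleave-group (factor 2) of vp<%v0>, vp<%v1>
// can later be narrowed to a single wide load feeding a single wide store.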
3115 | if (all_of(Range: enumerate(First: InterleaveR->getStoredValues()), P: [](auto Op) { |
3116 | VPRecipeBase *DefR = Op.value()->getDefiningRecipe(); |
3117 | if (!DefR) |
3118 | return false; |
3119 | auto *IR = dyn_cast<VPInterleaveRecipe>(Val: DefR); |
3120 | return IR && |
3121 | IR->getInterleaveGroup()->getFactor() == |
3122 | IR->getInterleaveGroup()->getNumMembers() && |
3123 | IR->getVPValue(Op.index()) == Op.value(); |
3124 | })) { |
3125 | StoreGroups.push_back(Elt: InterleaveR); |
3126 | continue; |
3127 | } |
3128 | |
3129 | // Check if all values feeding InterleaveR are matching wide recipes whose |
3130 | // operands can be narrowed. |
3131 | auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>( |
3132 | Val: InterleaveR->getStoredValues()[0]->getDefiningRecipe()); |
3133 | if (!WideMember0) |
3134 | return; |
3135 | for (const auto &[I, V] : enumerate(First: InterleaveR->getStoredValues())) { |
3136 | auto *R = dyn_cast<VPWidenRecipe>(Val: V->getDefiningRecipe()); |
3137 | if (!R || R->getOpcode() != WideMember0->getOpcode() || |
3138 | R->getNumOperands() > 2) |
3139 | return; |
3140 | if (any_of(Range: R->operands(), P: [WideMember0, Idx = I, R](VPValue *V) { |
3141 | return !canNarrowLoad(WideMember0, WideMember: R, V, Idx); |
3142 | })) |
3143 | return; |
3144 | } |
3145 | StoreGroups.push_back(Elt: InterleaveR); |
3146 | } |
3147 | |
3148 | if (StoreGroups.empty()) |
3149 | return; |
3150 | |
3151 | // Narrow \p R: convert a load interleave group to a single wide load, and a wide load to a uniform scalar load. |
3152 | auto NarrowOp = [](VPRecipeBase *R) -> VPValue * { |
3153 | if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(Val: R)) { |
3154 | // Narrow interleave group to wide load, as transformed VPlan will only |
3155 | // process one original iteration. |
3156 | auto *L = new VPWidenLoadRecipe( |
3157 | *cast<LoadInst>(Val: LoadGroup->getInterleaveGroup()->getInsertPos()), |
3158 | LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true, |
3159 | /*Reverse=*/false, {}, LoadGroup->getDebugLoc()); |
3160 | L->insertBefore(InsertPos: LoadGroup); |
3161 | return L; |
3162 | } |
3163 | |
3164 | auto *WideLoad = cast<VPWidenLoadRecipe>(Val: R); |
3165 | |
3166 | // Narrow wide load to uniform scalar load, as transformed VPlan will only |
3167 | // process one original iteration. |
3168 | auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), |
3169 | WideLoad->operands(), /*IsUniform*/ true, |
3170 | /*Mask*/ nullptr, *WideLoad); |
3171 | N->insertBefore(InsertPos: WideLoad); |
3172 | return N; |
3173 | }; |
3174 | |
3175 | // Narrow operation tree rooted at store groups. |
3176 | for (auto *StoreGroup : StoreGroups) { |
3177 | VPValue *Res = nullptr; |
3178 | if (auto *WideMember0 = dyn_cast<VPWidenRecipe>( |
3179 | Val: StoreGroup->getStoredValues()[0]->getDefiningRecipe())) { |
3180 | for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx) |
3181 | WideMember0->setOperand( |
3182 | I: Idx, New: NarrowOp(WideMember0->getOperand(N: Idx)->getDefiningRecipe())); |
3183 | Res = WideMember0; |
3184 | } else { |
3185 | Res = NarrowOp(StoreGroup->getStoredValues()[0]->getDefiningRecipe()); |
3186 | } |
3187 | |
3188 | auto *S = new VPWidenStoreRecipe( |
3189 | *cast<StoreInst>(Val: StoreGroup->getInterleaveGroup()->getInsertPos()), |
3190 | StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true, |
3191 | /*Reverse=*/false, {}, StoreGroup->getDebugLoc()); |
3192 | S->insertBefore(InsertPos: StoreGroup); |
3193 | StoreGroup->eraseFromParent(); |
3194 | } |
3195 | |
3196 | // Adjust induction to reflect that the transformed plan only processes one |
3197 | // original iteration. |
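// For example, with UF=2 the canonical IV previously advanced by 2 * VF per
// vector iteration; after narrowing, each unrolled part handles one original
// iteration, so the increment becomes 2 and uses of VF are replaced by 1.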
3198 | auto *CanIV = Plan.getCanonicalIV(); |
3199 | auto *Inc = cast<VPInstruction>(Val: CanIV->getBackedgeValue()); |
3200 | Inc->setOperand(I: 1, New: Plan.getOrAddLiveIn(V: ConstantInt::get( |
3201 | Ty: CanIV->getScalarType(), V: 1 * Plan.getUF()))); |
3202 | Plan.getVF().replaceAllUsesWith( |
3203 | New: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIV->getScalarType(), V: 1))); |
3204 | removeDeadRecipes(Plan); |
3205 | } |
3206 |