author     Jay Foad <jay.foad@amd.com>    2022-02-03 15:27:12 +0000
committer  Jay Foad <jay.foad@amd.com>    2022-02-03 16:27:48 +0000
commit     b9cf52bc3d29fc5a28015ee642d2b63354391c41 (patch)
tree       a534ab423209851e0596575cbdf478e1ab1d462c
parent     93c81f44cce802be7f2b723a96ed8e10db6101fb (diff)
[AMDGPU] Simplify AMDGPUAnnotateUniformValues::visitLoadInst
Always set uniform metadata on the pointer if it is an instruction, but
otherwise do not bother to create a trivial getelementptr instruction,
because AMDGPUInstrInfo::isUniformMMO can already detect that various
non-instruction pointers are uniform.

Most of the test case churn is from tests that used undef as a pointer,
which AMDGPUInstrInfo::isUniformMMO treats as uniform.

Differential Revision: https://reviews.llvm.org/D118909
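To make the effect concrete, here is a minimal before/after sketch of the
annotated IR, distilled from the noclobber-barrier.ll updates below (the
function name and the !0 node are illustrative, not taken verbatim from
the tests):

    ; Before: the pass materialized a trivial GEP just so it had an
    ; instruction to carry !amdgpu.uniform.
    define amdgpu_kernel void @example(i32 addrspace(1)* %arg) {
      %0 = getelementptr i32, i32 addrspace(1)* %arg, i64 0, !amdgpu.uniform !0
      %i = load i32, i32 addrspace(1)* %0, align 4, !amdgpu.noclobber !0
      ret void
    }

    ; After: AMDGPUInstrInfo::isUniformMMO recognizes the argument pointer
    ; as uniform on its own, so the load is annotated directly and no GEP
    ; is created.
    define amdgpu_kernel void @example(i32 addrspace(1)* %arg) {
      %i = load i32, i32 addrspace(1)* %arg, align 4, !amdgpu.noclobber !0
      ret void
    }

    !0 = !{}

In entry functions the pass still calls isClobberedInFunction before adding
!amdgpu.noclobber to a global-address-space load; only the GEP cloning and
the noClobberClones map go away.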
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp             |  41
-rw-r--r--   llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll  |   6
-rw-r--r--   llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll           |  60
-rw-r--r--   llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll                          | 158
-rw-r--r--   llvm/test/CodeGen/AMDGPU/fdiv.f64.ll                               |  14
-rw-r--r--   llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll                 |  62
-rw-r--r--   llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll                 |  34
-rw-r--r--   llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll                      |  42
-rw-r--r--   llvm/test/CodeGen/AMDGPU/rcp-pattern.ll                            |   8
-rw-r--r--   llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll                         |  42
-rw-r--r--   llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll                      |   2
11 files changed, 163 insertions, 306 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index b7506f2d1baa..dbbc478291b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -33,7 +33,6 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
LegacyDivergenceAnalysis *DA;
MemorySSA *MSSA;
AliasAnalysis *AA;
- DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isEntryFunc;

public:
@@ -160,44 +159,17 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
+ Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+ if (PtrI)
+ setUniformMetadata(PtrI);
+
// We're tracking up to the Function boundaries, and cannot go beyond because
// of FunctionPass restrictions. We can ensure that is memory not clobbered
// for memory operations that are live in to entry points only.
- Instruction *PtrI = dyn_cast<Instruction>(Ptr);
-
- if (!isEntryFunc) {
- if (PtrI)
- setUniformMetadata(PtrI);
+ if (!isEntryFunc)
return;
- }
-
- bool NotClobbered = false;
bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
- if (PtrI)
- NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
- else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
- if (GlobalLoad && !isClobberedInFunction(&I)) {
- NotClobbered = true;
- // Lookup for the existing GEP
- if (noClobberClones.count(Ptr)) {
- PtrI = noClobberClones[Ptr];
- } else {
- // Create GEP of the Value
- Function *F = I.getParent()->getParent();
- Value *Idx = Constant::getIntegerValue(
- Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
- // Insert GEP at the entry to make it dominate all uses
- PtrI = GetElementPtrInst::Create(I.getType(), Ptr,
- ArrayRef<Value *>(Idx), Twine(""),
- F->getEntryBlock().getFirstNonPHI());
- }
- I.replaceUsesOfWith(Ptr, PtrI);
- }
- }
-
- if (PtrI)
- setUniformMetadata(PtrI);
-
+ bool NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
if (NotClobbered)
setNoClobberMetadata(&I);
}
@@ -216,7 +188,6 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());

visit(F);
- noClobberClones.clear();
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
index edcc3033a97d..41ac2b984ef1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -1677,8 +1677,7 @@ define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out
; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
- ; HSA-VI-NEXT: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1)
- ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.1, addrspace 1)
+ ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1)
; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; HSA-VI-NEXT: S_ENDPGM 0
; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg
@@ -1692,8 +1691,7 @@ define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out
; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44
; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4)
- ; LEGACY-MESA-VI-NEXT: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1)
- ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.1, addrspace 1)
+ ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1)
; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1)
; LEGACY-MESA-VI-NEXT: S_ENDPGM 0
%in = load i32, i32 addrspace(1)* %in.byref
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index f15d00cd6881..697cee38c90f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -386,7 +386,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_signext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -444,7 +444,7 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `i1 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_zeroext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -559,7 +559,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_signext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -618,7 +618,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_zeroext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -732,7 +732,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_signext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -790,7 +790,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_zeroext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1000,7 +1000,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1121,7 +1121,7 @@ define amdgpu_kernel void @test_call_external_void_func_i48(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1181,7 +1181,7 @@ define amdgpu_kernel void @test_call_external_void_func_i48_signext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_signext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1241,7 +1241,7 @@ define amdgpu_kernel void @test_call_external_void_func_i48_zeroext(i32) #0 {
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_zeroext
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1357,7 +1357,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2p0() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: (load (<2 x p0>) from `<2 x i8*> addrspace(1)* null`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `<2 x i8*> addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p0
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -1420,7 +1420,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[DEF]](s64)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i64
@@ -1488,7 +1488,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C2]](s64)
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 3)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i64
@@ -2024,7 +2024,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2080,7 +2080,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2141,7 +2141,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x half> addrspace(1)* undef`, align 8, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `<3 x half> addrspace(1)* undef`, align 8, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2202,7 +2202,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2320,7 +2320,7 @@ define amdgpu_kernel void @test_call_external_void_func_v5i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: (load (<5 x s16>) from `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<5 x s16>) from `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2382,7 +2382,7 @@ define amdgpu_kernel void @test_call_external_void_func_v7i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: (load (<7 x s16>) from `<7 x i16> addrspace(1)* undef`, align 16, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<7 x s16>) from `<7 x i16> addrspace(1)* undef`, align 16, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v7i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2445,7 +2445,7 @@ define amdgpu_kernel void @test_call_external_void_func_v63i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: (load (<63 x s16>) from `<63 x i16> addrspace(1)* undef`, align 128, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<63 x s16>) from `<63 x i16> addrspace(1)* undef`, align 128, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v63i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2539,7 +2539,7 @@ define amdgpu_kernel void @test_call_external_void_func_v65i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: (load (<65 x s16>) from `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<65 x s16>) from `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v65i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2636,7 +2636,7 @@ define amdgpu_kernel void @test_call_external_void_func_v66i16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: (load (<66 x s16>) from `<66 x i16> addrspace(1)* undef`, align 256, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<66 x s16>) from `<66 x i16> addrspace(1)* undef`, align 256, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v66i16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2730,7 +2730,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f16
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -2786,7 +2786,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: (load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3026,7 +3026,7 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3519,7 +3519,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i32
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3619,8 +3619,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 {
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[DEF1]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY10]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY10]](p1) :: ("amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i8_i8_i16
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
@@ -3730,8 +3730,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 {
; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
- ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: (load (p3) from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1)
- ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: (load (p5) from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1)
+ ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: ("amdgpu-noclobber" load (p5) from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1)
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5
; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]]
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 97203031b4e0..6fa0c0a9be18 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2345,142 +2345,82 @@ bb:
define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
; SI-LABEL: cvt_f32_ubyte0_vector:
; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; SI-NEXT: s_mov_b64 s[4:5], 0
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], 0
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
-; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
-; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
+; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
+; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
-; SI-NEXT: v_fma_f32 v1, v2, v1, 0.5
-; SI-NEXT: v_cvt_u32_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5
+; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_byte v4, off, s[0:3], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: buffer_store_byte v5, off, s[0:3], 0
+; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0
; SI-NEXT: .LBB40_1: ; %for.body.i
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_branch .LBB40_1
;
; VI-LABEL: cvt_f32_ubyte0_vector:
; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; VI-NEXT: v_mov_b32_e32 v2, -1
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_mov_b64 s[4:5], exec
-; VI-NEXT: .LBB40_1: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_readfirstlane_b32 s10, v2
-; VI-NEXT: v_readfirstlane_b32 s11, v3
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; VI-NEXT: s_nop 0
-; VI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0
-; VI-NEXT: s_xor_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB40_1
-; VI-NEXT: ; %bb.2:
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], exec
-; VI-NEXT: .LBB40_3: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_readfirstlane_b32 s10, v2
-; VI-NEXT: v_readfirstlane_b32 s11, v3
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; VI-NEXT: s_nop 0
-; VI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:1
-; VI-NEXT: s_xor_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB40_3
-; VI-NEXT: ; %bb.4:
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], exec
-; VI-NEXT: .LBB40_5: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_readfirstlane_b32 s10, v2
-; VI-NEXT: v_readfirstlane_b32 s11, v3
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; VI-NEXT: s_nop 0
-; VI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:2
-; VI-NEXT: s_xor_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB40_5
-; VI-NEXT: ; %bb.6:
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: s_mov_b64 s[4:5], exec
-; VI-NEXT: .LBB40_7: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: v_readfirstlane_b32 s9, v1
-; VI-NEXT: v_readfirstlane_b32 s10, v2
-; VI-NEXT: v_readfirstlane_b32 s11, v3
-; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; VI-NEXT: s_nop 0
-; VI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:3
-; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; VI-NEXT: s_xor_b64 exec, exec, s[0:1]
-; VI-NEXT: s_cbranch_execnz .LBB40_7
-; VI-NEXT: ; %bb.8:
-; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v7
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2
+; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1
+; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0
+; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_byte v6, off, s[0:3], 0
-; VI-NEXT: buffer_store_byte v5, off, s[0:3], 0
-; VI-NEXT: buffer_store_byte v4, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; VI-NEXT: .LBB40_9: ; %for.body.i
+; VI-NEXT: .LBB40_1: ; %for.body.i
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: s_branch .LBB40_9
+; VI-NEXT: s_branch .LBB40_1
;
; GFX10-LABEL: cvt_f32_ubyte0_vector:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_clause 0x4
-; GFX10-NEXT: global_load_ubyte v2, v[0:1], off offset:3
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: global_load_ubyte v4, v[0:1], off offset:2
-; GFX10-NEXT: global_load_ubyte v5, v[0:1], off offset:1
-; GFX10-NEXT: global_load_ubyte v6, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(4)
-; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3
+; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2
+; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
+; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt vmcnt(3)
-; GFX10-NEXT: v_fma_f32 v0, v3, v0, 0.5
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5
; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: global_store_byte v[0:1], v4, off
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: global_store_byte v[0:1], v5, off
+; GFX10-NEXT: global_store_byte v[0:1], v3, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_byte v[0:1], v6, off
+; GFX10-NEXT: global_store_byte v[0:1], v4, off
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: .LBB40_1: ; %for.body.i
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
index a2277035b90b..2bf383a48bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -125,7 +125,7 @@ define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x do
}
; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
-; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5
; GCN: buffer_store_dwordx2 [[MUL]]
define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
%x = load double, double addrspace(1)* undef
@@ -135,9 +135,9 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
}
; GCN-LABEL: {{^}}div_fast_k_x_pat_f64:
-; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
-; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999
-; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; GCN: buffer_store_dwordx2 [[MUL]]
define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
%x = load double, double addrspace(1)* undef
@@ -147,9 +147,9 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
}
; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64:
-; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
-; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999
-; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; GCN: buffer_store_dwordx2 [[MUL]]
define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
%x = load double, double addrspace(1)* undef
diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
index 0e330d38cbba..254272bd81de 100644
--- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -16,9 +16,7 @@
; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8
; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], private@rel32@hi+16
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]]
define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -30,9 +28,7 @@ define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal@rel32@lo+8
; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], internal@rel32@hi+16
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]]
define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -45,11 +41,7 @@ define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], available_externally@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -62,11 +54,7 @@ define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -79,11 +67,7 @@ define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -96,11 +80,7 @@ define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], common@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -113,11 +93,7 @@ define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], extern_weak@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -130,11 +106,7 @@ define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce_odr@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce_odr@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -147,11 +119,7 @@ define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak_odr@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -164,11 +132,7 @@ define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
@@ -181,11 +145,7 @@ define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@gotpcrel32@lo+4
; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external_w_init@gotpcrel32@hi+12
; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0
-; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4
-; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0
-; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
-; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
-; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
+; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4
define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) {
%ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
%val = load i32, i32 addrspace(1)* %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 113e346ab616..767a6134367f 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -148,41 +148,43 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB1_6
; GCN-NEXT: ; %bb.1: ; %bb14.lr.ph
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
; GCN-NEXT: s_branch .LBB1_3
-; GCN-NEXT: .LBB1_2: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: s_mov_b64 s[0:1], -1
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: s_cbranch_execnz .LBB1_6
+; GCN-NEXT: .LBB1_2: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 vcc, vcc
+; GCN-NEXT: s_cbranch_vccnz .LBB1_6
; GCN-NEXT: .LBB1_3: ; %bb14
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB1_4 Depth 2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz .LBB1_2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s4, 1
+; GCN-NEXT: s_mov_b64 s[0:1], -1
+; GCN-NEXT: ; implicit-def: $sgpr4
+; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: .LBB1_4: ; %bb18
; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB1_4
; GCN-NEXT: ; %bb.5: ; %bb21
; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_lt_i32_e64 s[0:1], 8, v1
-; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GCN-NEXT: s_cbranch_vccz .LBB1_3
+; GCN-NEXT: v_cmp_lt_i32_e64 s[0:1], 8, v0
+; GCN-NEXT: s_branch .LBB1_2
; GCN-NEXT: .LBB1_6: ; %bb31
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index afff30bdb521..0b9438ab8d46 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -20,8 +20,7 @@
define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @simple_barrier(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
@@ -59,8 +58,7 @@ bb:
define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_no_clobber(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK: if.then:
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
@@ -106,8 +104,7 @@ if.end:
define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_clobber1(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK: if.then:
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
@@ -155,8 +152,7 @@ if.end:
define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @memory_phi_clobber2(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
; CHECK: if.then:
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
@@ -203,8 +199,7 @@ if.end:
define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) {
; CHECK-LABEL: @no_clobbering_loop1(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK: while.cond:
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
@@ -242,8 +237,7 @@ end:
define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
; CHECK-LABEL: @no_clobbering_loop2(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK: while.cond:
; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
@@ -286,8 +280,7 @@ end:
define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) {
; CHECK-LABEL: @clobbering_loop(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
; CHECK: while.cond:
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
@@ -325,8 +318,7 @@ end:
define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
; CHECK-LABEL: @clobber_by_atomic_load(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0
-; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0
; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0
@@ -357,12 +349,11 @@ bb:
define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_store(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: store i32 0, i32 addrspace(3)* @LDS, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
@@ -411,12 +402,11 @@ entry:
define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_volatile_store(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: store volatile i32 0, i32 addrspace(3)* @LDS, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
@@ -438,9 +428,8 @@ entry:
define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
@@ -459,12 +448,11 @@ entry:
define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
; CHECK-LABEL: @no_alias_atomic_cmpxchg(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
@@ -486,12 +474,11 @@ entry:
define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
; CHECK-LABEL: @no_alias_atomic_rmw(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
@@ -595,13 +582,12 @@ entry:
define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
; CHECK-NEXT: store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
; CHECK-NEXT: fence syncscope("workgroup") release
; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: fence syncscope("workgroup") acquire
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
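Every hunk in this file follows the same pattern: the zero-index getelementptr that used to carry !amdgpu.uniform is gone, and !amdgpu.noclobber now sits directly on the load of the raw kernel argument. A minimal sketch of the new pass output, assuming opt runs the amdgpu-annotate-uniform pass on an amdgcn target (the kernel and value names here are illustrative, not from the test file):

; Input: a uniform global load taken straight from a kernel argument.
define amdgpu_kernel void @sketch(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
  %ld = load i32, i32 addrspace(1)* %in, align 4
  store i32 %ld, i32 addrspace(1)* %out, align 4
  ret void
}
; Expected annotation after this patch, with no helper GEP created:
;   %ld = load i32, i32 addrspace(1)* %in, align 4, !amdgpu.noclobber !0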
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index d68cee3ca7e9..b162eff966a1 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -118,7 +118,7 @@ define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %
}
; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32:
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], s{{[0-9]+}}, 0.5
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
%x = load float, float addrspace(1)* undef
@@ -128,7 +128,8 @@ define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
}
; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32:
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3dcccccd
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
%x = load float, float addrspace(1)* undef
@@ -138,7 +139,8 @@ define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
}
; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32:
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0xbdcccccd
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]]
; GCN: buffer_store_dword [[MUL]]
define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 {
%x = load float, float addrspace(1)* undef
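These three checks change for the same reason: the load through the undef pointer is now treated as uniform, so it is selected as a scalar load and the multiplicand arrives in an SGPR. Reading the VOP encodings (an editorial note, not part of the patch): only src0 of a VOP2 instruction may hold an SGPR or a literal, so once the SGPR occupies src0, a 32-bit literal such as 0x3dcccccd has to be staged into a VGPR first, while the inline constant 0.5 can move to src1 by switching to the VOP3 (e64) form. A sketch of the middle case, with illustrative names:

; v_mov_b32_e32 [[V:v[0-9]+]], 0x3dcccccd           ; literal staged in a VGPR
; v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]] ; src0 holds the scalar load result
define amdgpu_kernel void @div_arcp_k_sketch(float addrspace(1)* %out) {
  %x = load float, float addrspace(1)* undef
  %mul = fdiv arcp float %x, 10.0
  store float %mul, float addrspace(1)* %out, align 4
  ret void
}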
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 9fee4732bd72..c6532a984669 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -164,25 +164,24 @@ declare float @llvm.fabs.f32(float) nounwind readnone
define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
; SI-LABEL: loop_land_info_assert:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; SI-NEXT: s_load_dword s6, s[0:1], 0x0
; SI-NEXT: s_load_dword s14, s[0:1], 0xc
-; SI-NEXT: s_brev_b32 s8, 44
+; SI-NEXT: v_bfrev_b32_e32 v0, 44
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lt_i32 s2, 1
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: s_cmp_lt_i32 s3, 4
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT: s_cmp_lt_i32 s3, 4
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_cmp_gt_i32 s3, 3
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
-; SI-NEXT: s_and_b64 s[0:1], exec, s[0:1]
+; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
+; SI-NEXT: v_cmp_lt_f32_e64 s[6:7], |s6|, v0
+; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5]
; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3]
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8
-; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_and_b64 s[4:5], exec, s[6:7]
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 3
; SI-NEXT: s_branch .LBB3_4
; SI-NEXT: .LBB3_1: ; %Flow6
@@ -240,25 +239,24 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
;
; FLAT-LABEL: loop_land_info_assert:
; FLAT: ; %bb.0: ; %entry
-; FLAT-NEXT: s_mov_b32 s7, 0xf000
-; FLAT-NEXT: s_mov_b32 s6, -1
-; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; FLAT-NEXT: s_load_dword s6, s[0:1], 0x0
; FLAT-NEXT: s_load_dword s14, s[0:1], 0x30
-; FLAT-NEXT: s_brev_b32 s8, 44
+; FLAT-NEXT: v_bfrev_b32_e32 v0, 44
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: s_cmp_lt_i32 s2, 1
-; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0
-; FLAT-NEXT: s_cmp_lt_i32 s3, 4
; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0
+; FLAT-NEXT: s_cmp_lt_i32 s3, 4
+; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0
; FLAT-NEXT: s_cmp_gt_i32 s3, 3
; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0
-; FLAT-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
-; FLAT-NEXT: s_and_b64 s[0:1], exec, s[0:1]
+; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
+; FLAT-NEXT: v_cmp_lt_f32_e64 s[6:7], |s6|, v0
+; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5]
; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3]
-; FLAT-NEXT: s_waitcnt vmcnt(0)
-; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8
-; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; FLAT-NEXT: s_and_b64 s[4:5], exec, s[6:7]
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_mov_b32_e32 v0, 3
; FLAT-NEXT: s_branch .LBB3_4
; FLAT-NEXT: .LBB3_1: ; %Flow6
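For both check prefixes the interesting delta is the first load: a buffer_load_dword through a VGPR becomes an s_load_dword into s6, the compare reads |s6| against the bit-reversed constant now materialized in v0, and the s_waitcnt vmcnt(0) disappears because the scalar load is already covered by the existing lgkmcnt(0) wait; the remaining churn is register renumbering around the s_cselect results. Condensed from the hunks above:

; Before: buffer_load_dword v0, off, s[4:7], 0 ... s_waitcnt vmcnt(0) ... v_cmp_lt_f32_e64 s[4:5], |v0|, s8
; After:  s_load_dword s6, s[0:1], 0x0 ... s_waitcnt lgkmcnt(0) ... v_cmp_lt_f32_e64 s[6:7], |s6|, v0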
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 0b6645556af3..4eb59202ece5 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -481,7 +481,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
; SI-NEXT: successors: %bb.6(0x80000000)
; SI-NEXT: {{ $}}
; SI-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
- ; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8) from `i8 addrspace(1)* null`, addrspace 1)
+ ; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `i8 addrspace(1)* null`, addrspace 1)
; SI-NEXT: S_BRANCH %bb.6
; SI-NEXT: {{ $}}
; SI-NEXT: bb.5.Flow: