diff options
author | Jay Foad <jay.foad@amd.com> | 2022-03-09 17:03:09 +0000 |
---|---|---|
committer | Jay Foad <jay.foad@amd.com> | 2022-03-09 17:05:49 +0000 |
commit | 28f67aed9d7f42529e34b2d4252b089975be3698 (patch) | |
tree | 2df0875f7f28df9e274c46d72591b71070575204 | |
parent | 041080fc9b7a245967f512923dae9d6af7cfbc57 (diff) |
[AMDGPU] Fix some confusing check prefixes. NFC.
Tahiti is SI/GFX6.
Kaveri and Hawaii are CI/GFX7.
Fiji is VI/GFX8.
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-flat.mir | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-flat.mir | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/half.ll | 1806 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/min.ll | 80 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 18 |
5 files changed, 956 insertions, 956 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-flat.mir index 683ca816fcd6..94601eb99bb8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-flat.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck %s -check-prefix=SI -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck %s -check-prefix=VI +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck %s -check-prefix=SI +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck %s -check-prefix=VI --- name: test_sextload_flat_i32_i8 body: | diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-flat.mir index 044d5fc4405b..45c8b128379c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-flat.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck %s --check-prefix=SI -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck %s --check-prefix=VI +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer -o - %s | FileCheck %s --check-prefix=SI +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -o - %s | FileCheck %s --check-prefix=VI --- name: test_zextload_flat_i32_i8 body: | diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 9669f01ea3e2..1fa785499109 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s -; half args should be promoted to float for SI and lower. +; half args should be promoted to float for CI and lower. define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { -; SI-LABEL: load_f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: flat_store_short v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: load_f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: load_f16_arg: ; VI: ; %bb.0: @@ -31,16 +31,16 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { } define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { -; SI-LABEL: load_v2f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: flat_store_dword v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: load_v2f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: load_v2f16_arg: ; VI: ; %bb.0: @@ -57,22 +57,22 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha } define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { -; SI-LABEL: load_v3f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s4, s0, 4 -; SI-NEXT: s_addc_u32 s5, s1, 0 -; SI-NEXT: v_mov_b32_e32 v2, s4 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v3, s5 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: flat_store_short v[2:3], v4 -; SI-NEXT: flat_store_dword v[0:1], v5 -; SI-NEXT: s_endpgm +; CI-LABEL: load_v3f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s4, s0, 4 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v5, s2 +; CI-NEXT: flat_store_short v[2:3], v4 +; CI-NEXT: flat_store_dword v[0:1], v5 +; CI-NEXT: s_endpgm ; ; VI-LABEL: load_v3f16_arg: ; VI: ; %bb.0: @@ -97,17 +97,17 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha ; FIXME: Why not one load? define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { -; SI-LABEL: load_v4f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; SI-NEXT: s_endpgm +; CI-LABEL: load_v4f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: load_v4f16_arg: ; VI: ; %bb.0: @@ -125,19 +125,19 @@ define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x ha } define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { -; SI-LABEL: load_v8f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v5, s7 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: load_v8f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: load_v8f16_arg: ; VI: ; %bb.0: @@ -157,18 +157,18 @@ define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x ha } define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { -; SI-LABEL: extload_v2f16_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v2f16_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s3, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v2f16_arg: ; VI: ; %bb.0: @@ -188,16 +188,16 @@ define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 } define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { -; SI-LABEL: extload_f16_to_f32_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: flat_store_dword v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: extload_f16_to_f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_f16_to_f32_arg: ; VI: ; %bb.0: @@ -215,18 +215,18 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half } define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { -; SI-LABEL: extload_v2f16_to_v2f32_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s3, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v2f16_to_v2f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s2, s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s3, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v2f16_to_v2f32_arg: ; VI: ; %bb.0: @@ -246,19 +246,19 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* } define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { -; SI-LABEL: extload_v3f16_to_v3f32_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v3, s2 -; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v3f16_to_v3f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s4, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: @@ -279,21 +279,21 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* } define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { -; SI-LABEL: extload_v4f16_to_v4f32_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: s_lshr_b32 s5, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v4f16_to_v4f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s4, s1, 16 +; CI-NEXT: s_lshr_b32 s5, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: @@ -316,33 +316,33 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* } define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { -; SI-LABEL: extload_v8f16_to_v8f32_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s1, 16 -; SI-NEXT: s_lshr_b32 s7, s0, 16 -; SI-NEXT: s_lshr_b32 s8, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: s_add_u32 s0, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; SI-NEXT: s_addc_u32 s1, s5, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v8, s0 -; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v8f16_to_v8f32_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s6, s1, 16 +; CI-NEXT: s_lshr_b32 s7, s0, 16 +; CI-NEXT: s_lshr_b32 s8, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; CI-NEXT: s_lshr_b32 s6, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v8f16_to_v8f32_arg: ; VI: ; %bb.0: @@ -377,18 +377,18 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* } define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { -; SI-LABEL: extload_f16_to_f64_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_f16_to_f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: @@ -408,21 +408,21 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal } define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { -; SI-LABEL: extload_v2f16_to_v2f64_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s1, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v2f16_to_v2f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s1, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: @@ -445,27 +445,27 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* } define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { -; SI-LABEL: extload_v3f16_to_v3f64_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; SI-NEXT: s_lshr_b32 s4, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_add_u32 s0, s2, 16 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; SI-NEXT: s_addc_u32 s1, s3, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_mov_b32_e32 v7, s1 -; SI-NEXT: v_mov_b32_e32 v6, s0 -; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v3f16_to_v3f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; CI-NEXT: s_lshr_b32 s4, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CI-NEXT: s_add_u32 s0, s2, 16 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 +; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_mov_b32_e32 v7, s1 +; CI-NEXT: v_mov_b32_e32 v6, s0 +; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: @@ -494,31 +494,31 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* } define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { -; SI-LABEL: extload_v4f16_to_v4f64_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; SI-NEXT: s_lshr_b32 s5, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; SI-NEXT: s_add_u32 s0, s2, 16 -; SI-NEXT: s_addc_u32 s1, s3, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v8, s0 -; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v4f16_to_v4f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s4, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: s_lshr_b32 s5, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: s_add_u32 s0, s2, 16 +; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: @@ -551,52 +551,52 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* } define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { -; SI-LABEL: extload_v8f16_to_v8f64_arg: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s6, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s3 -; SI-NEXT: s_lshr_b32 s7, s2, 16 -; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: s_lshr_b32 s6, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: s_add_u32 s0, s4, 48 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; SI-NEXT: s_addc_u32 s1, s5, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v17, s1 -; SI-NEXT: v_mov_b32_e32 v16, s0 -; SI-NEXT: s_add_u32 s0, s4, 32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; SI-NEXT: s_addc_u32 s1, s5, 0 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: s_add_u32 s0, s4, 16 -; SI-NEXT: s_addc_u32 s1, s5, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v8, s0 -; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: extload_v8f16_to_v8f64_arg: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s6, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 +; CI-NEXT: s_lshr_b32 s7, s2, 16 +; CI-NEXT: s_lshr_b32 s8, s1, 16 +; CI-NEXT: s_lshr_b32 s6, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 +; CI-NEXT: s_add_u32 s0, s4, 48 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: s_add_u32 s0, s4, 32 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_add_u32 s0, s4, 16 +; CI-NEXT: s_addc_u32 s1, s5, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; CI-NEXT: s_nop 0 +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v8f16_to_v8f64_arg: ; VI: ; %bb.0: @@ -742,21 +742,21 @@ define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, h } define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v2f16_to_v2f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dword v1, v[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_mov_b32_e32 v3, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v2f16_to_v2f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v1, v[0:1] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: @@ -779,22 +779,22 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1 } define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v3f16_to_v3f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] -; SI-NEXT: v_mov_b32_e32 v3, s0 -; SI-NEXT: v_mov_b32_e32 v4, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v3f16_to_v3f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s0 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: @@ -818,24 +818,24 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1 } define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v4f16_to_v4f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v4f16_to_v4f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: @@ -860,35 +860,35 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1 } define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v8f16_to_v8f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] -; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v8f16_to_v8f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: @@ -922,63 +922,63 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1 } define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v16f16_to_v16f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s4, s2, 16 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: v_mov_b32_e32 v13, s2 -; SI-NEXT: s_add_u32 s2, s0, 48 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: s_add_u32 s0, s0, 32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_mov_b32_e32 v17, s1 -; SI-NEXT: v_mov_b32_e32 v14, s2 -; SI-NEXT: v_mov_b32_e32 v16, s0 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; SI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v16f16_to_v16f32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v14, s3 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: @@ -1057,23 +1057,23 @@ define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, } define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v2f16_to_v2f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dword v0, v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v2f16_to_v2f64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: @@ -1098,30 +1098,30 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace( } define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v3f16_to_v3f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s3 -; SI-NEXT: v_mov_b32_e32 v6, s2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v3f16_to_v3f64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: @@ -1153,33 +1153,33 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace( } define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v4f16_to_v4f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: v_mov_b32_e32 v8, s0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; SI-NEXT: v_mov_b32_e32 v11, s3 -; SI-NEXT: v_mov_b32_e32 v10, s2 -; SI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v4f16_to_v4f64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 +; CI-NEXT: v_mov_b32_e32 v11, s3 +; CI-NEXT: v_mov_b32_e32 v10, s2 +; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: @@ -1213,53 +1213,53 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace( } define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v8f16_to_v8f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: s_add_u32 s2, s0, 48 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s3 -; SI-NEXT: v_mov_b32_e32 v6, s2 -; SI-NEXT: s_add_u32 s2, s0, 32 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: s_add_u32 s0, s0, 16 -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_mov_b32_e32 v14, s2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; SI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 -; SI-NEXT: v_mov_b32_e32 v17, s1 -; SI-NEXT: v_mov_b32_e32 v16, s0 -; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; SI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v8f16_to_v8f64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 +; CI-NEXT: v_mov_b32_e32 v17, s1 +; CI-NEXT: v_mov_b32_e32 v16, s0 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: @@ -1311,99 +1311,99 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace( } define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { -; SI-LABEL: global_extload_v16f16_to_v16f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: s_add_u32 s2, s2, 16 -; SI-NEXT: s_addc_u32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; SI-NEXT: s_add_u32 s2, s0, 48 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_mov_b32_e32 v14, s2 -; SI-NEXT: s_add_u32 s2, s0, 32 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v17, s3 -; SI-NEXT: v_mov_b32_e32 v16, s2 -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v19, s3 -; SI-NEXT: v_mov_b32_e32 v18, s2 -; SI-NEXT: s_add_u32 s2, s0, 0x70 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 -; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; SI-NEXT: v_mov_b32_e32 v14, s2 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: s_add_u32 s2, s0, 0x60 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] -; SI-NEXT: v_mov_b32_e32 v17, s3 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 -; SI-NEXT: v_mov_b32_e32 v16, s2 -; SI-NEXT: s_add_u32 s2, s0, 0x50 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; SI-NEXT: s_add_u32 s0, s0, 64 -; SI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] -; SI-NEXT: s_addc_u32 s1, s1, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 -; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 -; SI-NEXT: v_mov_b32_e32 v19, s3 -; SI-NEXT: v_mov_b32_e32 v13, s1 -; SI-NEXT: v_mov_b32_e32 v18, s2 -; SI-NEXT: v_mov_b32_e32 v12, s0 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; SI-NEXT: s_endpgm +; CI-LABEL: global_extload_v16f16_to_v16f64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: s_add_u32 s2, s0, 48 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x70 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; CI-NEXT: v_mov_b32_e32 v15, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: v_mov_b32_e32 v14, s2 +; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: s_add_u32 s2, s0, 0x60 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] +; CI-NEXT: v_mov_b32_e32 v17, s3 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; CI-NEXT: v_mov_b32_e32 v16, s2 +; CI-NEXT: s_add_u32 s2, s0, 0x50 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; CI-NEXT: s_add_u32 s0, s0, 64 +; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 +; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 +; CI-NEXT: v_mov_b32_e32 v19, s3 +; CI-NEXT: v_mov_b32_e32 v13, s1 +; CI-NEXT: v_mov_b32_e32 v18, s2 +; CI-NEXT: v_mov_b32_e32 v12, s0 +; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] +; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: @@ -1518,22 +1518,22 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, } define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { -; SI-LABEL: global_truncstore_v2f32_to_v2f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: flat_store_dword v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: global_truncstore_v2f32_to_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v2, v3, v2 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: @@ -1557,28 +1557,28 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace } define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { -; SI-LABEL: global_truncstore_v3f32_to_v3f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] -; SI-NEXT: s_add_u32 s2, s0, 4 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: flat_store_short v[0:1], v2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_or_b32_e32 v2, v4, v3 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: flat_store_dword v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: global_truncstore_v3f32_to_v3f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] +; CI-NEXT: s_add_u32 s2, s0, 4 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_or_b32_e32 v2, v4, v3 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: @@ -1608,26 +1608,26 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace } define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { -; SI-LABEL: global_truncstore_v4f32_to_v4f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v6 -; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: global_truncstore_v4f32_to_v4f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v2, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v6 +; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: @@ -1654,40 +1654,40 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace } define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { -; SI-LABEL: global_truncstore_v8f32_to_v8f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: s_add_u32 s2, s2, 16 -; SI-NEXT: s_addc_u32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; SI-NEXT: v_mov_b32_e32 v8, s0 -; SI-NEXT: v_mov_b32_e32 v9, s1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: v_or_b32_e32 v0, v0, v10 -; SI-NEXT: v_or_b32_e32 v3, v6, v7 -; SI-NEXT: v_or_b32_e32 v2, v4, v5 -; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_truncstore_v8f32_to_v8f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_mov_b32_e32 v9, s1 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_or_b32_e32 v1, v2, v3 +; CI-NEXT: v_or_b32_e32 v0, v0, v10 +; CI-NEXT: v_or_b32_e32 v3, v6, v7 +; CI-NEXT: v_or_b32_e32 v2, v4, v5 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: @@ -1726,73 +1726,73 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace } define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { -; SI-LABEL: global_truncstore_v16f32_to_v16f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s4, s2, 32 -; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_add_u32 s4, s2, 48 -; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v9, s3 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: v_mov_b32_e32 v8, s2 -; SI-NEXT: s_add_u32 s2, s2, 16 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: v_mov_b32_e32 v5, s5 -; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; SI-NEXT: s_addc_u32 s3, s3, 0 -; SI-NEXT: v_mov_b32_e32 v13, s3 -; SI-NEXT: v_mov_b32_e32 v12, s2 -; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; SI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_mov_b32_e32 v4, s2 -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: v_or_b32_e32 v3, v6, v2 -; SI-NEXT: v_or_b32_e32 v2, v17, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: v_or_b32_e32 v1, v10, v6 -; SI-NEXT: v_or_b32_e32 v0, v8, v7 -; SI-NEXT: v_or_b32_e32 v3, v14, v9 -; SI-NEXT: v_or_b32_e32 v2, v12, v11 -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: global_truncstore_v16f32_to_v16f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s4, s2, 32 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: s_add_u32 s4, s2, 48 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: v_mov_b32_e32 v8, s2 +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v13, s3 +; CI-NEXT: v_mov_b32_e32 v12, s2 +; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; CI-NEXT: v_or_b32_e32 v1, v2, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_or_b32_e32 v0, v0, v18 +; CI-NEXT: v_or_b32_e32 v3, v6, v2 +; CI-NEXT: v_or_b32_e32 v2, v17, v7 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_or_b32_e32 v1, v10, v6 +; CI-NEXT: v_or_b32_e32 v0, v8, v7 +; CI-NEXT: v_or_b32_e32 v3, v14, v9 +; CI-NEXT: v_or_b32_e32 v2, v12, v11 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: @@ -1861,21 +1861,21 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrsp ; FIXME: Unsafe math should fold conversions away define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { -; SI-LABEL: fadd_f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: flat_store_short v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: fadd_f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_short v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_f16: ; VI: ; %bb.0: @@ -1895,28 +1895,28 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) # } define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { -; SI-LABEL: fadd_v2f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: s_lshr_b32 s0, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v1, v2, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v0, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: flat_store_dword v[0:1], v2 -; SI-NEXT: s_endpgm +; CI-LABEL: fadd_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; CI-NEXT: s_lshr_b32 s0, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s0 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v1, v2, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: @@ -1941,42 +1941,42 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> } define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { -; SI-LABEL: fadd_v4f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s0 -; SI-NEXT: v_mov_b32_e32 v5, s1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v7, v7, v9 -; SI-NEXT: v_add_f32_e32 v6, v6, v8 -; SI-NEXT: v_add_f32_e32 v1, v1, v3 -; SI-NEXT: v_add_f32_e32 v0, v0, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_or_b32_e32 v0, v3, v0 -; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; SI-NEXT: s_endpgm +; CI-LABEL: fadd_v4f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v7, v7, v9 +; CI-NEXT: v_add_f32_e32 v6, v6, v8 +; CI-NEXT: v_add_f32_e32 v1, v1, v3 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NEXT: v_or_b32_e32 v0, v3, v0 +; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: @@ -2005,64 +2005,64 @@ define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> } define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { -; SI-LABEL: fadd_v8f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s10, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; SI-NEXT: s_lshr_b32 s0, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s0 -; SI-NEXT: s_lshr_b32 s0, s5, 16 -; SI-NEXT: s_lshr_b32 s11, s1, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; SI-NEXT: s_lshr_b32 s10, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: s_lshr_b32 s0, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; SI-NEXT: s_lshr_b32 s10, s3, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v10, s0 -; SI-NEXT: s_lshr_b32 s0, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_add_f32_e32 v1, v1, v9 -; SI-NEXT: v_add_f32_e32 v0, v0, v8 -; SI-NEXT: v_add_f32_e32 v3, v3, v11 -; SI-NEXT: v_add_f32_e32 v2, v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, v4, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v7, v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v6, v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; SI-NEXT: s_endpgm +; CI-LABEL: fadd_v8f16: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s10, s0, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 +; CI-NEXT: s_lshr_b32 s0, s5, 16 +; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 +; CI-NEXT: s_lshr_b32 s10, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 +; CI-NEXT: s_lshr_b32 s0, s6, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; CI-NEXT: s_lshr_b32 s10, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 +; CI-NEXT: s_lshr_b32 s0, s7, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; CI-NEXT: v_add_f32_e32 v1, v1, v9 +; CI-NEXT: v_add_f32_e32 v0, v0, v8 +; CI-NEXT: v_add_f32_e32 v3, v3, v11 +; CI-NEXT: v_add_f32_e32 v2, v2, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v5, v5, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_add_f32_e32 v4, v4, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v7, v7, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v6, v6, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_or_b32_e32 v1, v5, v1 +; CI-NEXT: v_or_b32_e32 v0, v4, v0 +; CI-NEXT: v_mov_b32_e32 v4, s8 +; CI-NEXT: v_or_b32_e32 v3, v7, v3 +; CI-NEXT: v_or_b32_e32 v2, v6, v2 +; CI-NEXT: v_mov_b32_e32 v5, s9 +; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index c5e83127166b..e97fece0d184 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,CI,FUNC %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,GFX8_9_10,FUNC %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9_10,GFX8_9_10,FUNC %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX9_10,GFX8_9_10,FUNC %s @@ -80,10 +80,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], ; GCN-DAG: s_load_dword s ; GCN-NOT: _load_ -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 +; CI: s_min_i32 +; CI: s_min_i32 +; CI: s_min_i32 +; CI: s_min_i32 ; VI-DAG: s_min_i32 ; VI-DAG: s_min_i32 @@ -110,12 +110,12 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 ; GCN: s_load_dwordx2 s ; GCN: s_load_dwordx2 s -; SI: s_ashr_i32 -; SI: s_sext_i32_i16 -; SI: s_ashr_i32 -; SI: s_sext_i32_i16 -; SI: s_min_i32 -; SI: s_min_i32 +; CI: s_ashr_i32 +; CI: s_sext_i32_i16 +; CI: s_ashr_i32 +; CI: s_sext_i32_i16 +; CI: s_min_i32 +; CI: s_min_i32 ; VI: s_sext_i32_i16 ; VI: s_sext_i32_i16 @@ -134,11 +134,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: -; SI-NOT: buffer_load -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 -; SI: s_min_i32 +; CI-NOT: buffer_load +; CI: s_min_i32 +; CI: s_min_i32 +; CI: s_min_i32 +; CI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrs } ; FUNC-LABEL: @v_test_imin_slt_i16 -; SI: v_min_i32_e32 +; CI: v_min_i32_e32 ; GFX8_9: v_min_i16_e32 ; GFX10: v_min_i16 @@ -286,10 +286,10 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, < ; FIXME: Reduce unused packed component to scalar ; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}} -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 -; SI: v_min_u32_e32 -; SI-NOT: v_min_u32_e32 +; CI: v_min_u32_e32 +; CI: v_min_u32_e32 +; CI: v_min_u32_e32 +; CI-NOT: v_min_u32_e32 ; VI: v_min_u16_e32 ; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 @@ -347,9 +347,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; SI: {{buffer|flat|global}}_load_ubyte -; SI: {{buffer|flat|global}}_load_ubyte -; SI: v_min_u32_e32 +; CI: {{buffer|flat|global}}_load_ubyte +; CI: {{buffer|flat|global}}_load_ubyte +; CI: v_min_u32_e32 ; GFX8_9_10: {{flat|global}}_load_ubyte ; GFX8_9_10: {{flat|global}}_load_ubyte @@ -383,11 +383,11 @@ define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i } ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use -; SI-NOT: v_min +; CI-NOT: v_min ; GCN: s_cmp_lt_u32 -; SI-NOT: v_min -; SI: v_cndmask_b32 -; SI-NOT: v_min +; CI-NOT: v_min +; CI: v_cndmask_b32 +; CI-NOT: v_min ; GCN: s_endpgm ; EG-NOT: MIN_UINT @@ -458,14 +458,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, < ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: ; GCN-NOT: {{buffer|flat|global}}_load -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 -; SI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 +; CI: s_min_u32 ; VI: s_min_u32 ; VI: s_min_u32 @@ -595,8 +595,8 @@ define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 } ; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16: -; SI: v_min_i32 -; SI: v_min_i32 +; CI: v_min_i32 +; CI: v_min_i32 ; VI: v_min_i16 ; VI: v_min_i16 @@ -620,8 +620,8 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < ; FIXME: i16 min ; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16: -; SI: v_min_u32 -; SI: v_min_u32 +; CI: v_min_u32 +; CI: v_min_u32 ; VI: v_min_u16 ; VI: v_min_u16 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index fb30f0716afb..c5ceb2333757 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -22,12 +22,12 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] ; VI: s_endpgm -; SI-DAG: s_cmp_eq_u32 -; SI-DAG: s_cselect_b64 vcc, -1, 0 -; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc -; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] -; SI: s_endpgm +; CI-DAG: s_cmp_eq_u32 +; CI-DAG: s_cselect_b64 vcc, -1, 0 +; CI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc +; CI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; CI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]] +; CI: s_endpgm define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double @@ -72,8 +72,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i6 ; FIXME: bfe and sext on VI+ ; GCN-LABEL: {{^}}s_sint_to_fp_i8_to_f64: ; GCN: s_load_dword [[VAL:s[0-9]+]] -; SI-NOT: bfe -; SI: s_sext_i32_i8 [[SEXT:s[0-9]+]], [[VAL]] +; CI-NOT: bfe +; CI: s_sext_i32_i8 [[SEXT:s[0-9]+]], [[VAL]] ; VI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x80000 ; VI: s_sext_i32_i16 [[SEXT:s[0-9]+]], [[BFE]] |