; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s

; Workitem id query. The __vgpr kernels in this file use its i32 result as the
; element index for the accumulator load.
declare i32 @llvm.amdgcn.workitem.id.x()

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x64.f16
; --------------------------------------------------------------------

; Operands as exercised by the tests below: <8 x half>, <16 x half>, a
; <4 x float> input (the checked instruction writes its result over the same
; registers), an i32 index, and two immediates that show up as the cbsz and
; abid modifiers in the checked assembly.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <4 x float> input is loaded from %arg at the workitem-id
; element, %a, %b and %idx are kernel arguments, and the call passes the
; immediates 1 and 2 (cbsz:1 abid:2 in the checked instruction). The result is
; stored to the base of %arg, not to the indexed element.
define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT:    s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT:    v_mov_b32_e32 v16, s16
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Callable-function variant: every operand is a plain (non-inreg) argument and
; both immediates are zero, so the checked instruction carries no cbsz/abid
; modifiers. Both RUN lines share one check body.
define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Callable-function variant with the immediates set to 1 and 3; the checks
; expect them to be printed as cbsz:1 abid:3 on the instruction.
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Callable-function variant with the immediates set to 3 and 1 (the reverse of
; the __flags0 test); the checks expect cbsz:3 abid:1 on the instruction.
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg. The checks expect the values to be copied out
; of SGPRs before the instruction: the SelectionDAG output puts the
; <4 x float> input in AGPRs (a[0:3]) and reads the result back afterwards,
; while the GlobalISel output keeps it in v[0:3] and returns directly.
define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x32.f16
; --------------------------------------------------------------------

; Operands as exercised by the tests below: <8 x half>, <16 x half>, a
; <16 x float> input (the checked instruction writes its result over the same
; registers), an i32 index, and two immediates that show up as the cbsz and
; abid modifiers in the checked assembly.
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <16 x float> input is loaded from %arg at the
; workitem-id element (four dwordx4 loads in the checks), %a, %b and %idx are
; kernel arguments, and the call passes the immediates 1 and 2 (cbsz:1 abid:2
; in the checked instruction). The result is stored to the base of %arg. The
; two selectors differ only in the order of the loads and stores.
define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT:    s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
; GISEL-NEXT:    v_mov_b32_e32 v28, s16
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Callable-function variant with plain arguments and both immediates zero (no
; cbsz/abid modifiers expected). The selectors differ in register assignment:
; the SelectionDAG output computes in v[12:27] and copies the result down to
; v[0:15] afterwards, whereas the GlobalISel output moves the inputs first so
; the instruction writes v[0:15] directly.
define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Callable-function variant with the immediates set to 1 and 3; both check
; bodies expect cbsz:1 abid:3 on the instruction. Register assignment differs
; between the selectors (result copied afterwards vs. inputs moved first).
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Callable-function variant with the immediates set to 3 and 1; both check
; bodies expect cbsz:3 abid:1 on the instruction. Register assignment differs
; between the selectors (result copied afterwards vs. inputs moved first).
define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; Every argument is marked inreg. In the checked output only the first six
; elements of the <16 x float> input come from SGPRs (s24-s29); the remaining
; ten elements and the index arrive in v0-v10.
; NOTE(review): presumably the inreg arguments exceed the SGPRs available for
; argument passing, so the tail spills over into VGPRs - confirm.
define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x64.bf16
; --------------------------------------------------------------------

; bfloat counterpart of the 16x16x64 f16 intrinsic: <8 x bfloat>,
; <16 x bfloat>, a <4 x float> input (the checked instruction writes its result
; over the same registers), an i32 index, and two immediates that show up as
; the cbsz and abid modifiers in the checked assembly.
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant for bfloat inputs: the <4 x float> input is loaded from %arg
; at the workitem-id element, %a, %b and %idx are kernel arguments, and the
; call passes the immediates 1 and 2 (cbsz:1 abid:2 in the checked
; instruction). The result is stored to the base of %arg.
; Both RUN lines share a single check body here.
; NOTE(review): presumably the GlobalISel run falls back to SelectionDAG for
; bfloat (the RUN line passes -global-isel-abort=2) - confirm.
define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GCN-NEXT:    v_mov_b32_e32 v16, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
; GCN-NEXT:    v_mov_b32_e32 v17, s16
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Callable-function variant for bfloat inputs: plain (non-inreg) arguments and
; both immediates zero, so the checked instruction carries no cbsz/abid
; modifiers. Both RUN lines share one check body.
define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Callable-function variant for bfloat inputs with the immediates set to 1 and
; 3; the checks expect cbsz:1 abid:3 on the instruction.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Callable-function variant for bfloat inputs with the immediates set to 3 and
; 1; the checks expect cbsz:3 abid:1 on the instruction.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg. The checks expect the values to be copied out
; of SGPRs before the instruction, with the <4 x float> input placed in AGPRs
; (a[0:3]) and the result read back into v0-v3 afterwards.
; Both RUN lines share a single check body here.
; NOTE(review): presumably the GlobalISel run falls back to SelectionDAG for
; bfloat (the RUN line passes -global-isel-abort=2) - confirm.
define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v8, s0
; GCN-NEXT:    v_mov_b32_e32 v9, s1
; GCN-NEXT:    v_mov_b32_e32 v10, s2
; GCN-NEXT:    v_mov_b32_e32 v11, s3
; GCN-NEXT:    v_mov_b32_e32 v0, s16
; GCN-NEXT:    v_mov_b32_e32 v1, s17
; GCN-NEXT:    v_mov_b32_e32 v2, s18
; GCN-NEXT:    v_mov_b32_e32 v3, s19
; GCN-NEXT:    v_mov_b32_e32 v4, s20
; GCN-NEXT:    v_mov_b32_e32 v5, s21
; GCN-NEXT:    v_mov_b32_e32 v6, s22
; GCN-NEXT:    v_mov_b32_e32 v7, s23
; GCN-NEXT:    v_accvgpr_write_b32 a0, s24
; GCN-NEXT:    v_accvgpr_write_b32 a1, s25
; GCN-NEXT:    v_accvgpr_write_b32 a2, s26
; GCN-NEXT:    v_accvgpr_write_b32 a3, s27
; GCN-NEXT:    v_mov_b32_e32 v12, s28
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x32.bf16
; --------------------------------------------------------------------

; bfloat counterpart of the 32x32x32 f16 intrinsic: <8 x bfloat>,
; <16 x bfloat>, a <16 x float> input (the checked instruction writes its
; result over the same registers), an i32 index, and two immediates that show
; up as the cbsz and abid modifiers in the checked assembly.
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant for bfloat inputs: the <16 x float> input is loaded from %arg
; at the workitem-id element (four dwordx4 loads in the checks), %a, %b and
; %idx are kernel arguments, and the call passes the immediates 1 and 2
; (cbsz:1 abid:2 in the checked instruction). The result is stored to the base
; of %arg. Both RUN lines share a single check body here.
define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
; GCN-NEXT:    v_mov_b32_e32 v28, s16
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GCN-NEXT:    v_mov_b32_e32 v16, 0
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 2
; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GCN-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; GCN-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GCN-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %a, <16 x bfloat> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Callable-function variant for bfloat inputs: plain arguments and both
; immediates zero (no cbsz/abid modifiers expected). The checked output
; computes in v[12:27] and then copies the sixteen results down to v[0:15].
; Both RUN lines share one check body.
define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    v_mov_b32_e32 v4, v16
; GCN-NEXT:    v_mov_b32_e32 v5, v17
; GCN-NEXT:    v_mov_b32_e32 v6, v18
; GCN-NEXT:    v_mov_b32_e32 v7, v19
; GCN-NEXT:    v_mov_b32_e32 v8, v20
; GCN-NEXT:    v_mov_b32_e32 v9, v21
; GCN-NEXT:    v_mov_b32_e32 v10, v22
; GCN-NEXT:    v_mov_b32_e32 v11, v23
; GCN-NEXT:    v_mov_b32_e32 v12, v24
; GCN-NEXT:    v_mov_b32_e32 v13, v25
; GCN-NEXT:    v_mov_b32_e32 v14, v26
; GCN-NEXT:    v_mov_b32_e32 v15, v27
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Same shape as the plain non-kernel test, but the trailing immediates are
; 1 and 3; the expected instruction must print them as cbsz:1 abid:3.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    v_mov_b32_e32 v4, v16
; GCN-NEXT:    v_mov_b32_e32 v5, v17
; GCN-NEXT:    v_mov_b32_e32 v6, v18
; GCN-NEXT:    v_mov_b32_e32 v7, v19
; GCN-NEXT:    v_mov_b32_e32 v8, v20
; GCN-NEXT:    v_mov_b32_e32 v9, v21
; GCN-NEXT:    v_mov_b32_e32 v10, v22
; GCN-NEXT:    v_mov_b32_e32 v11, v23
; GCN-NEXT:    v_mov_b32_e32 v12, v24
; GCN-NEXT:    v_mov_b32_e32 v13, v25
; GCN-NEXT:    v_mov_b32_e32 v14, v26
; GCN-NEXT:    v_mov_b32_e32 v15, v27
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Counterpart of __flags0 with the immediates swapped (3 and 1); the expected
; instruction must print cbsz:3 abid:1, showing the two operands are not
; interchanged during selection.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    v_mov_b32_e32 v4, v16
; GCN-NEXT:    v_mov_b32_e32 v5, v17
; GCN-NEXT:    v_mov_b32_e32 v6, v18
; GCN-NEXT:    v_mov_b32_e32 v7, v19
; GCN-NEXT:    v_mov_b32_e32 v8, v20
; GCN-NEXT:    v_mov_b32_e32 v9, v21
; GCN-NEXT:    v_mov_b32_e32 v10, v22
; GCN-NEXT:    v_mov_b32_e32 v11, v23
; GCN-NEXT:    v_mov_b32_e32 v12, v24
; GCN-NEXT:    v_mov_b32_e32 v13, v25
; GCN-NEXT:    v_mov_b32_e32 v14, v26
; GCN-NEXT:    v_mov_b32_e32 v15, v27
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; All four arguments are marked inreg. The expected output copies the values
; that arrive in SGPRs (s0-s3, s16-s29) into VGPRs, takes the remaining
; accumulator elements and the index from v0-v10, and only then issues the
; instruction, whose operands are all VGPRs. Immediates are 0, so no
; cbsz/abid modifiers are printed.
define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v36, s0
; GCN-NEXT:    v_mov_b32_e32 v37, s1
; GCN-NEXT:    v_mov_b32_e32 v38, s2
; GCN-NEXT:    v_mov_b32_e32 v39, s3
; GCN-NEXT:    v_mov_b32_e32 v13, s25
; GCN-NEXT:    v_mov_b32_e32 v14, s26
; GCN-NEXT:    v_mov_b32_e32 v15, s27
; GCN-NEXT:    v_mov_b32_e32 v16, s28
; GCN-NEXT:    v_mov_b32_e32 v17, s29
; GCN-NEXT:    v_mov_b32_e32 v28, s16
; GCN-NEXT:    v_mov_b32_e32 v29, s17
; GCN-NEXT:    v_mov_b32_e32 v30, s18
; GCN-NEXT:    v_mov_b32_e32 v31, s19
; GCN-NEXT:    v_mov_b32_e32 v32, s20
; GCN-NEXT:    v_mov_b32_e32 v33, s21
; GCN-NEXT:    v_mov_b32_e32 v34, s22
; GCN-NEXT:    v_mov_b32_e32 v35, s23
; GCN-NEXT:    v_mov_b32_e32 v12, s24
; GCN-NEXT:    v_mov_b32_e32 v18, v0
; GCN-NEXT:    v_mov_b32_e32 v19, v1
; GCN-NEXT:    v_mov_b32_e32 v20, v2
; GCN-NEXT:    v_mov_b32_e32 v21, v3
; GCN-NEXT:    v_mov_b32_e32 v22, v4
; GCN-NEXT:    v_mov_b32_e32 v23, v5
; GCN-NEXT:    v_mov_b32_e32 v24, v6
; GCN-NEXT:    v_mov_b32_e32 v25, v7
; GCN-NEXT:    v_mov_b32_e32 v26, v8
; GCN-NEXT:    v_mov_b32_e32 v27, v9
; GCN-NEXT:    s_nop 1
; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    s_nop 3
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    v_mov_b32_e32 v4, v16
; GCN-NEXT:    v_mov_b32_e32 v5, v17
; GCN-NEXT:    v_mov_b32_e32 v6, v18
; GCN-NEXT:    v_mov_b32_e32 v7, v19
; GCN-NEXT:    v_mov_b32_e32 v8, v20
; GCN-NEXT:    v_mov_b32_e32 v9, v21
; GCN-NEXT:    v_mov_b32_e32 v10, v22
; GCN-NEXT:    v_mov_b32_e32 v11, v23
; GCN-NEXT:    v_mov_b32_e32 v12, v24
; GCN-NEXT:    v_mov_b32_e32 v13, v25
; GCN-NEXT:    v_mov_b32_e32 v14, v26
; GCN-NEXT:    v_mov_b32_e32 v15, v27
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.i32.16x16x128.i8
; --------------------------------------------------------------------

; The two trailing operands are printed as cbsz/abid on the instruction and
; must be compile-time constants, hence immarg (as on the f16 declaration).
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32 immarg, i32 immarg)

; Kernel variant: the accumulator is loaded from the <4 x i32> element of %arg
; selected by the workitem id, the A/B/index operands come from kernel
; arguments, and the result is stored back to the start of %arg. The
; immediates 1 and 2 must appear as cbsz:1 abid:2. The two instruction
; selectors schedule the argument loads differently, so each has its own
; check block.
define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s8
; SDAG-NEXT:    v_mov_b32_e32 v13, s9
; SDAG-NEXT:    v_mov_b32_e32 v14, s10
; SDAG-NEXT:    v_mov_b32_e32 v15, s11
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s0
; SDAG-NEXT:    v_mov_b32_e32 v5, s1
; SDAG-NEXT:    v_mov_b32_e32 v6, s2
; SDAG-NEXT:    v_mov_b32_e32 v7, s3
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v16, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x i32>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %a, <8 x i32> %b, <4 x i32> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel variant with plain arguments and both immediates 0: the expected
; instruction has no cbsz/abid modifiers, writes v[12:15], and the result is
; then copied to v[0:3] for the return. Both selectors share these checks.
define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x i32> %result
}

; Passes immediates 1 and 3; the expected instruction must print
; cbsz:1 abid:3.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x i32> %result
}

; Passes immediates 3 and 1 (the reverse of __flags0); the expected
; instruction must print cbsz:3 abid:1.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x i32> %result
}

; All arguments are marked inreg, so the inputs arrive in SGPRs and must be
; copied into vector registers before the instruction. The two selectors
; diverge here: the SelectionDAG output places the accumulator in AGPRs
; a[0:3] and reads it back with v_accvgpr_read, while the GlobalISel output
; keeps the accumulator in v[0:3] and returns immediately after the
; instruction.
define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x i32> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.i32.32x32x64.i8
; --------------------------------------------------------------------

; The two trailing operands are printed as cbsz/abid on the instruction and
; must be compile-time constants, hence immarg (as on the f16 declaration).
declare <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32>, <8 x i32>, <16 x i32>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <16 x i32> accumulator is loaded (as four dwordx4 loads)
; from the element of %arg selected by the workitem id, A/B/index come from
; kernel arguments, and the result is stored to the start of %arg. The
; immediates 1 and 2 must appear as cbsz:1 abid:2. The selectors order the
; loads, copies and stores differently, so each has its own check block.
define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v24, s8
; SDAG-NEXT:    v_mov_b32_e32 v25, s9
; SDAG-NEXT:    v_mov_b32_e32 v26, s10
; SDAG-NEXT:    v_mov_b32_e32 v27, s11
; SDAG-NEXT:    v_mov_b32_e32 v16, s12
; SDAG-NEXT:    v_mov_b32_e32 v17, s13
; SDAG-NEXT:    v_mov_b32_e32 v18, s14
; SDAG-NEXT:    v_mov_b32_e32 v19, s15
; SDAG-NEXT:    v_mov_b32_e32 v20, s0
; SDAG-NEXT:    v_mov_b32_e32 v21, s1
; SDAG-NEXT:    v_mov_b32_e32 v22, s2
; SDAG-NEXT:    v_mov_b32_e32 v23, s3
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v28, s2
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x i32>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x i32>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %a, <8 x i32> %b, <16 x i32> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel variant with plain arguments and both immediates 0 (no cbsz/abid
; modifiers expected). The selectors resolve the accumulator/return register
; overlap differently: the SelectionDAG output computes into v[12:27] and
; copies the result down to v[0:15] afterwards, while the GlobalISel output
; first moves the A/B inputs aside and the accumulator into v[0:15], then
; computes in place and returns directly.
define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x i32> %result
}

; Passes immediates 1 and 3; both selectors must print cbsz:1 abid:3 on the
; instruction. Register shuffling around the instruction differs per
; selector in the same way as in the plain non-kernel test.
define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x i32> %result
}

; Passes immediates 3 and 1 (the reverse of __flags0); both selectors must
; print cbsz:3 abid:1 on the instruction.
define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x i32> %result
}

; All arguments are marked inreg. In the expected output part of the input
; arrives in SGPRs (s0-s3, s16-s29) and the rest in v0-v10; everything is
; moved into VGPRs before the instruction. The SelectionDAG output
; accumulates in v[12:27] and copies the result to v[0:15]; the GlobalISel
; output assembles the accumulator directly in v[0:15] and returns right
; after the instruction.
define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x i32> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x i32> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <4 x float> accumulator is loaded from the element of
; %arg selected by the workitem id, A/B/index come from kernel arguments, and
; the result is stored to the start of %arg. The immediates 1 and 2 must
; appear as cbsz:1 abid:2. Each instruction selector has its own check block
; because the argument loads and copies are emitted differently.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s8
; SDAG-NEXT:    v_mov_b32_e32 v13, s9
; SDAG-NEXT:    v_mov_b32_e32 v14, s10
; SDAG-NEXT:    v_mov_b32_e32 v15, s11
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s0
; SDAG-NEXT:    v_mov_b32_e32 v5, s1
; SDAG-NEXT:    v_mov_b32_e32 v6, s2
; SDAG-NEXT:    v_mov_b32_e32 v7, s3
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v16, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: operands arrive as ordinary arguments and both trailing
; immediates of the intrinsic call are 0. The checks shared by both run lines
; expect the instruction with no cbsz/abid modifiers printed, accumulating in
; v[12:15], followed by copies of the result into v[0:3] before returning.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Non-zero immediates, first combination: the call passes 1 and 3 as its last
; two operands, and the expected instruction prints them as cbsz:1 abid:3.
; Register assignment is otherwise the same v[12:15] accumulator / v[0:3]
; result-copy pattern visible in the checks below.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Non-zero immediates, swapped combination: the call passes 3 and 1 as its last
; two operands, and the expected instruction prints them as cbsz:3 abid:1,
; showing that the two immediates are not interchanged during selection.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg and both immediates are 0. The expected output
; copies the incoming scalar registers (s0-s3, s16-s28) into vector registers
; ahead of the instruction. The two selectors are checked separately because
; their output differs: the SDAG output writes the accumulator into a[0:3] and
; reads it back into v[0:3] afterwards, whereas the GISEL output accumulates
; directly in v[0:3] and returns right after the instruction.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the accumulator is loaded from %arg at the element selected by
; the workitem id, the intrinsic is called with immediates 1 and 2 (printed as
; cbsz:1 abid:2 in the expected output), and the result is stored to the base of
; %arg rather than to %gep. %a, %b and %idx are kernel arguments; the checks
; expect them to be fetched with s_load and copied into vector registers before
; the instruction. SDAG and GISEL outputs are checked separately since their
; scheduling and register choices differ.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s8
; SDAG-NEXT:    v_mov_b32_e32 v13, s9
; SDAG-NEXT:    v_mov_b32_e32 v14, s10
; SDAG-NEXT:    v_mov_b32_e32 v15, s11
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s0
; SDAG-NEXT:    v_mov_b32_e32 v5, s1
; SDAG-NEXT:    v_mov_b32_e32 v6, s2
; SDAG-NEXT:    v_mov_b32_e32 v7, s3
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v16, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: operands arrive as ordinary arguments and both trailing
; immediates of the intrinsic call are 0. The checks shared by both run lines
; expect the instruction with no cbsz/abid modifiers printed, accumulating in
; v[12:15], followed by copies of the result into v[0:3] before returning.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Non-zero immediates, first combination: the call passes 1 and 3 as its last
; two operands, and the expected instruction prints them as cbsz:1 abid:3.
; Register assignment is otherwise the same v[12:15] accumulator / v[0:3]
; result-copy pattern visible in the checks below.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Non-zero immediates, swapped combination: the call passes 3 and 1 as its last
; two operands, and the expected instruction prints them as cbsz:3 abid:1,
; showing that the two immediates are not interchanged during selection.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg and both immediates are 0. The expected output
; copies the incoming scalar registers (s0-s3, s16-s28) into vector registers
; ahead of the instruction. The two selectors are checked separately because
; their output differs: the SDAG output writes the accumulator into a[0:3] and
; reads it back into v[0:3] afterwards, whereas the GISEL output accumulates
; directly in v[0:3] and returns right after the instruction.
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the accumulator is loaded from %arg at the element selected by
; the workitem id, the intrinsic is called with immediates 1 and 2 (printed as
; cbsz:1 abid:2 in the expected output), and the result is stored to the base of
; %arg rather than to %gep. %a, %b and %idx are kernel arguments; the checks
; expect them to be fetched with s_load and copied into vector registers before
; the instruction. SDAG and GISEL outputs are checked separately since their
; scheduling and register choices differ.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s8
; SDAG-NEXT:    v_mov_b32_e32 v13, s9
; SDAG-NEXT:    v_mov_b32_e32 v14, s10
; SDAG-NEXT:    v_mov_b32_e32 v15, s11
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s0
; SDAG-NEXT:    v_mov_b32_e32 v5, s1
; SDAG-NEXT:    v_mov_b32_e32 v6, s2
; SDAG-NEXT:    v_mov_b32_e32 v7, s3
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v16, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: operands arrive as ordinary arguments and both trailing
; immediates of the intrinsic call are 0. The checks shared by both run lines
; expect the instruction with no cbsz/abid modifiers printed, accumulating in
; v[12:15], followed by copies of the result into v[0:3] before returning.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Non-zero immediates, first combination: the call passes 1 and 3 as its last
; two operands, and the expected instruction prints them as cbsz:1 abid:3.
; Register assignment is otherwise the same v[12:15] accumulator / v[0:3]
; result-copy pattern visible in the checks below.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Non-zero immediates, swapped combination: the call passes 3 and 1 as its last
; two operands, and the expected instruction prints them as cbsz:3 abid:1,
; showing that the two immediates are not interchanged during selection.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg and both immediates are 0. The expected output
; copies the incoming scalar registers (s0-s3, s16-s28) into vector registers
; ahead of the instruction. The two selectors are checked separately because
; their output differs: the SDAG output writes the accumulator into a[0:3] and
; reads it back into v[0:3] afterwards, whereas the GISEL output accumulates
; directly in v[0:3] and returns right after the instruction.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
; --------------------------------------------------------------------

declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the accumulator is loaded from %arg at the element selected by
; the workitem id, the intrinsic is called with immediates 1 and 2 (printed as
; cbsz:1 abid:2 in the expected output), and the result is stored to the base of
; %arg rather than to %gep. %a, %b and %idx are kernel arguments; the checks
; expect them to be fetched with s_load and copied into vector registers before
; the instruction. SDAG and GISEL outputs are checked separately since their
; scheduling and register choices differ.
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    v_mov_b32_e32 v12, s8
; SDAG-NEXT:    v_mov_b32_e32 v13, s9
; SDAG-NEXT:    v_mov_b32_e32 v14, s10
; SDAG-NEXT:    v_mov_b32_e32 v15, s11
; SDAG-NEXT:    v_mov_b32_e32 v0, s12
; SDAG-NEXT:    v_mov_b32_e32 v1, s13
; SDAG-NEXT:    v_mov_b32_e32 v2, s14
; SDAG-NEXT:    v_mov_b32_e32 v3, s15
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v4, s0
; SDAG-NEXT:    v_mov_b32_e32 v5, s1
; SDAG-NEXT:    v_mov_b32_e32 v6, s2
; SDAG-NEXT:    v_mov_b32_e32 v7, s3
; SDAG-NEXT:    v_mov_b32_e32 v17, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v16, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GISEL-NEXT:    s_nop 6
; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: operands arrive as ordinary arguments and both trailing
; immediates of the intrinsic call are 0. The checks shared by both run lines
; expect the instruction with no cbsz/abid modifiers printed, accumulating in
; v[12:15], followed by copies of the result into v[0:3] before returning.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; Non-zero immediates, first combination: the call passes 1 and 3 as its last
; two operands, and the expected instruction prints them as cbsz:1 abid:3.
; Register assignment is otherwise the same v[12:15] accumulator / v[0:3]
; result-copy pattern visible in the checks below.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <4 x float> %result
}

; Non-zero immediates, swapped combination: the call passes 3 and 1 as its last
; two operands, and the expected instruction prints them as cbsz:3 abid:1,
; showing that the two immediates are not interchanged during selection.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT:    s_nop 7
; GCN-NEXT:    v_mov_b32_e32 v0, v12
; GCN-NEXT:    v_mov_b32_e32 v1, v13
; GCN-NEXT:    v_mov_b32_e32 v2, v14
; GCN-NEXT:    v_mov_b32_e32 v3, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <4 x float> %result
}

; Every argument is marked inreg and both immediates are 0. The expected output
; copies the incoming scalar registers (s0-s3, s16-s28) into vector registers
; ahead of the instruction. The two selectors are checked separately because
; their output differs: the SDAG output writes the accumulator into a[0:3] and
; reads it back into v[0:3] afterwards, whereas the GISEL output accumulates
; directly in v[0:3] and returns right after the instruction.
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v8, s0
; SDAG-NEXT:    v_mov_b32_e32 v9, s1
; SDAG-NEXT:    v_mov_b32_e32 v10, s2
; SDAG-NEXT:    v_mov_b32_e32 v11, s3
; SDAG-NEXT:    v_mov_b32_e32 v0, s16
; SDAG-NEXT:    v_mov_b32_e32 v1, s17
; SDAG-NEXT:    v_mov_b32_e32 v2, s18
; SDAG-NEXT:    v_mov_b32_e32 v3, s19
; SDAG-NEXT:    v_mov_b32_e32 v4, s20
; SDAG-NEXT:    v_mov_b32_e32 v5, s21
; SDAG-NEXT:    v_mov_b32_e32 v6, s22
; SDAG-NEXT:    v_mov_b32_e32 v7, s23
; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
; SDAG-NEXT:    v_mov_b32_e32 v12, s28
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v16, s28
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <4 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8
; --------------------------------------------------------------------

declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant for the 32x32x64 shape, whose accumulator and result are
; <16 x float>. The accumulator is loaded from %arg at the element selected by
; the workitem id (four dwordx4 loads in the expected output), the intrinsic is
; called with immediates 1 and 2 (printed as cbsz:1 abid:2), and the result is
; stored to the base of %arg rather than to %gep (four dwordx4 stores). %a, %b
; and %idx are kernel arguments that the checks expect to be fetched with s_load
; and copied into vector registers. SDAG and GISEL outputs are checked
; separately; among other things they order the loads and stores differently.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v24, s8
; SDAG-NEXT:    v_mov_b32_e32 v25, s9
; SDAG-NEXT:    v_mov_b32_e32 v26, s10
; SDAG-NEXT:    v_mov_b32_e32 v27, s11
; SDAG-NEXT:    v_mov_b32_e32 v16, s12
; SDAG-NEXT:    v_mov_b32_e32 v17, s13
; SDAG-NEXT:    v_mov_b32_e32 v18, s14
; SDAG-NEXT:    v_mov_b32_e32 v19, s15
; SDAG-NEXT:    v_mov_b32_e32 v20, s0
; SDAG-NEXT:    v_mov_b32_e32 v21, s1
; SDAG-NEXT:    v_mov_b32_e32 v22, s2
; SDAG-NEXT:    v_mov_b32_e32 v23, s3
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v28, s2
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: every operand arrives as an ordinary (non-inreg)
; argument and both trailing immediates are 0, so the checked instruction is
; expected to be printed without cbsz/abid modifiers.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Same shape as the plain register-argument test, but the two trailing
; immediates are 1 and 3; the checked instruction is expected to carry
; "cbsz:1 abid:3".
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Companion to the __flags0 test with the immediate values swapped: the call
; passes 3 and 1, and the checked instruction is expected to carry
; "cbsz:3 abid:1".
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; All four arguments are marked inreg. The expected output copies the
; operands (from s-registers, and from v0-v10 for the values that arrive in
; VGPRs) into VGPR tuples before the instruction, which is checked with
; vector-register operands only. Both immediates are 0, so no cbsz/abid
; modifiers are expected.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8
; --------------------------------------------------------------------

; Operands in order: <4 x i32>, <8 x i32>, a <16 x float> input of the same
; type as the result, an i32 (named %idx / %arg3 in the tests below), and two
; immarg i32 values that the checked assembly prints as the cbsz and abid
; modifiers respectively.
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <16 x float> input is loaded from %arg at the element
; selected by the workitem id, while %a, %b and %idx come from kernel
; arguments. The intrinsic is called with immediates 1 and 2 (checked as
; "cbsz:1 abid:2") and the result is stored to the base pointer %arg, not to
; the indexed address %gep.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v24, s8
; SDAG-NEXT:    v_mov_b32_e32 v25, s9
; SDAG-NEXT:    v_mov_b32_e32 v26, s10
; SDAG-NEXT:    v_mov_b32_e32 v27, s11
; SDAG-NEXT:    v_mov_b32_e32 v16, s12
; SDAG-NEXT:    v_mov_b32_e32 v17, s13
; SDAG-NEXT:    v_mov_b32_e32 v18, s14
; SDAG-NEXT:    v_mov_b32_e32 v19, s15
; SDAG-NEXT:    v_mov_b32_e32 v20, s0
; SDAG-NEXT:    v_mov_b32_e32 v21, s1
; SDAG-NEXT:    v_mov_b32_e32 v22, s2
; SDAG-NEXT:    v_mov_b32_e32 v23, s3
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v28, s2
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel function: every operand arrives as an ordinary (non-inreg)
; argument and both trailing immediates are 0, so the checked instruction is
; expected to be printed without cbsz/abid modifiers.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Same shape as the plain register-argument test, but the two trailing
; immediates are 1 and 3; the checked instruction is expected to carry
; "cbsz:1 abid:3".
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Companion to the __flags0 test with the immediate values swapped: the call
; passes 3 and 1, and the checked instruction is expected to carry
; "cbsz:3 abid:1".
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; All four arguments are marked inreg. The expected output copies the
; operands (from s-registers, and from v0-v10 for the values that arrive in
; VGPRs) into VGPR tuples before the instruction, which is checked with
; vector-register operands only. Both immediates are 0, so no cbsz/abid
; modifiers are expected.
define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8
; --------------------------------------------------------------------

; Operands in order: <4 x i32>, <8 x i32>, a <16 x float> input of the same
; type as the result, an i32 (named %idx / %arg3 in the tests below), and two
; immarg i32 values that the checked assembly prints as the cbsz and abid
; modifiers respectively.
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel variant: the <16 x float> input is loaded from %arg at the element
; selected by the workitem id, while %a, %b and %idx come from kernel
; arguments. The intrinsic is called with immediates 1 and 2 (checked as
; "cbsz:1 abid:2") and the result is stored to the base pointer %arg, not to
; the indexed address %gep.
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v24, s8
; SDAG-NEXT:    v_mov_b32_e32 v25, s9
; SDAG-NEXT:    v_mov_b32_e32 v26, s10
; SDAG-NEXT:    v_mov_b32_e32 v27, s11
; SDAG-NEXT:    v_mov_b32_e32 v16, s12
; SDAG-NEXT:    v_mov_b32_e32 v17, s13
; SDAG-NEXT:    v_mov_b32_e32 v18, s14
; SDAG-NEXT:    v_mov_b32_e32 v19, s15
; SDAG-NEXT:    v_mov_b32_e32 v20, s0
; SDAG-NEXT:    v_mov_b32_e32 v21, s1
; SDAG-NEXT:    v_mov_b32_e32 v22, s2
; SDAG-NEXT:    v_mov_b32_e32 v23, s3
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v28, s2
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel test: all four operands of llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8
; are ordinary (non-inreg) function arguments and both trailing immediates are
; 0, so the expected instruction carries no cbsz/abid modifiers.
; In the SelectionDAG expectations the instruction's first operand is v[12:27]
; and is followed by copies into v0-v15; in the GlobalISel expectations the
; copies come first and the instruction's first operand is v[0:15].
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing by hand.
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Forward the arguments unchanged; the two immarg operands are both 0.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Same shape as the plain fp8.bf8 non-kernel test, but the two trailing
; immediates are 1 and 3. The expected instruction for both instruction
; selectors carries the matching modifiers "cbsz:1 abid:3".
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Immediates 1 and 3 are expected to surface as cbsz:1 and abid:3.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Companion to the __flags0 variant with the immediate values swapped: the two
; trailing immediates are 3 and 1, and the expected instruction for both
; instruction selectors carries "cbsz:3 abid:1".
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Immediates 3 and 1 are expected to surface as cbsz:3 and abid:1.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; Variant where every argument is marked inreg. The expected output shows
; values arriving in s0-s3 and s16-s29 being copied into VGPRs before the
; instruction, with the rest of the accumulator arriving in v0-v9 and the
; index in v10 (used directly in the SelectionDAG expectations, copied to v21
; in the GlobalISel expectations). Every operand of the emitted instruction is
; a VGPR range. Both trailing immediates are 0, so no cbsz/abid modifiers are
; expected.
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Same call as the plain variant; only the inreg argument attributes differ.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; --------------------------------------------------------------------
; llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8
; --------------------------------------------------------------------

declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32>, <8 x i32>, <16 x float>, i32, i32 immarg, i32 immarg)

; Kernel test for llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8. The accumulator is
; loaded from %arg at the element selected by the workitem id, %a, %b and %idx
; come from kernel arguments, and the trailing immediates are 1 and 2
; (expected as "cbsz:1 abid:2"). The result is stored to %arg itself, not to
; the per-workitem %gep address. Attribute group #0 sets
; "amdgpu-flat-work-group-size"="1,256".
; The check lines are autogenerated (see the NOTE at the top of the file).
define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
; SDAG:       ; %bb.0: ; %bb
; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT:    global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v24, s8
; SDAG-NEXT:    v_mov_b32_e32 v25, s9
; SDAG-NEXT:    v_mov_b32_e32 v26, s10
; SDAG-NEXT:    v_mov_b32_e32 v27, s11
; SDAG-NEXT:    v_mov_b32_e32 v16, s12
; SDAG-NEXT:    v_mov_b32_e32 v17, s13
; SDAG-NEXT:    v_mov_b32_e32 v18, s14
; SDAG-NEXT:    v_mov_b32_e32 v19, s15
; SDAG-NEXT:    v_mov_b32_e32 v20, s0
; SDAG-NEXT:    v_mov_b32_e32 v21, s1
; SDAG-NEXT:    v_mov_b32_e32 v22, s2
; SDAG-NEXT:    v_mov_b32_e32 v23, s3
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    s_waitcnt vmcnt(0)
; SDAG-NEXT:    s_nop 0
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; SDAG-NEXT:    v_mov_b32_e32 v16, 0
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 2
; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[6:7]
; SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; SDAG-NEXT:    s_endpgm
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT:    v_lshlrev_b32_e32 v16, 6, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
; GISEL-NEXT:    v_mov_b32_e32 v28, s2
; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    s_nop 0
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
; GISEL-NEXT:    v_mov_b32_e32 v16, 0
; GISEL-NEXT:    s_nop 7
; GISEL-NEXT:    s_nop 2
; GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT:    s_endpgm
bb:
  ; Load this workitem's <16 x float> accumulator input from %arg[%id].
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
  %in.1 = load <16 x float>, ptr addrspace(1) %gep
  ; Trailing immediates 1 and 2 are expected to surface as cbsz:1 and abid:2.
  %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
  ; The store targets the base pointer %arg rather than %gep.
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Non-kernel test: all four operands of llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8
; are ordinary (non-inreg) function arguments and both trailing immediates are
; 0, so the expected instruction carries no cbsz/abid modifiers.
; In the SelectionDAG expectations the instruction's first operand is v[12:27]
; and is followed by copies into v0-v15; in the GlobalISel expectations the
; copies come first and the instruction's first operand is v[0:15].
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Forward the arguments unchanged; the two immarg operands are both 0.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

; Same shape as the plain fp8.fp8 non-kernel test, but the two trailing
; immediates are 1 and 3. The expected instruction for both instruction
; selectors carries the matching modifiers "cbsz:1 abid:3".
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Immediates 1 and 3 are expected to surface as cbsz:1 and abid:3.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
  ret <16 x float> %result
}

; Companion to the __flags0 variant with the immediate values swapped: the two
; trailing immediates are 3 and 1, and the expected instruction for both
; instruction selectors carries "cbsz:3 abid:1".
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v48, v0
; GISEL-NEXT:    v_mov_b32_e32 v49, v1
; GISEL-NEXT:    v_mov_b32_e32 v50, v2
; GISEL-NEXT:    v_mov_b32_e32 v51, v3
; GISEL-NEXT:    v_mov_b32_e32 v30, v4
; GISEL-NEXT:    v_mov_b32_e32 v31, v5
; GISEL-NEXT:    v_mov_b32_e32 v32, v6
; GISEL-NEXT:    v_mov_b32_e32 v33, v7
; GISEL-NEXT:    v_mov_b32_e32 v34, v8
; GISEL-NEXT:    v_mov_b32_e32 v35, v9
; GISEL-NEXT:    v_mov_b32_e32 v36, v10
; GISEL-NEXT:    v_mov_b32_e32 v37, v11
; GISEL-NEXT:    v_mov_b32_e32 v0, v12
; GISEL-NEXT:    v_mov_b32_e32 v1, v13
; GISEL-NEXT:    v_mov_b32_e32 v2, v14
; GISEL-NEXT:    v_mov_b32_e32 v3, v15
; GISEL-NEXT:    v_mov_b32_e32 v4, v16
; GISEL-NEXT:    v_mov_b32_e32 v5, v17
; GISEL-NEXT:    v_mov_b32_e32 v6, v18
; GISEL-NEXT:    v_mov_b32_e32 v7, v19
; GISEL-NEXT:    v_mov_b32_e32 v8, v20
; GISEL-NEXT:    v_mov_b32_e32 v9, v21
; GISEL-NEXT:    v_mov_b32_e32 v10, v22
; GISEL-NEXT:    v_mov_b32_e32 v11, v23
; GISEL-NEXT:    v_mov_b32_e32 v12, v24
; GISEL-NEXT:    v_mov_b32_e32 v13, v25
; GISEL-NEXT:    v_mov_b32_e32 v14, v26
; GISEL-NEXT:    v_mov_b32_e32 v15, v27
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Immediates 3 and 1 are expected to surface as cbsz:3 and abid:1.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
  ret <16 x float> %result
}

; Variant where every argument is marked inreg. The expected output shows
; values arriving in s0-s3 and s16-s29 being copied into VGPRs before the
; instruction, with the rest of the accumulator arriving in v0-v9 and the
; index in v10 (used directly in the SelectionDAG expectations, copied to v21
; in the GlobalISel expectations). Every operand of the emitted instruction is
; a VGPR range. Both trailing immediates are 0, so no cbsz/abid modifiers are
; expected.
; The check lines are autogenerated (see the NOTE at the top of the file).
define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) {
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG:       ; %bb.0:
; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT:    v_mov_b32_e32 v36, s0
; SDAG-NEXT:    v_mov_b32_e32 v37, s1
; SDAG-NEXT:    v_mov_b32_e32 v38, s2
; SDAG-NEXT:    v_mov_b32_e32 v39, s3
; SDAG-NEXT:    v_mov_b32_e32 v13, s25
; SDAG-NEXT:    v_mov_b32_e32 v14, s26
; SDAG-NEXT:    v_mov_b32_e32 v15, s27
; SDAG-NEXT:    v_mov_b32_e32 v16, s28
; SDAG-NEXT:    v_mov_b32_e32 v17, s29
; SDAG-NEXT:    v_mov_b32_e32 v28, s16
; SDAG-NEXT:    v_mov_b32_e32 v29, s17
; SDAG-NEXT:    v_mov_b32_e32 v30, s18
; SDAG-NEXT:    v_mov_b32_e32 v31, s19
; SDAG-NEXT:    v_mov_b32_e32 v32, s20
; SDAG-NEXT:    v_mov_b32_e32 v33, s21
; SDAG-NEXT:    v_mov_b32_e32 v34, s22
; SDAG-NEXT:    v_mov_b32_e32 v35, s23
; SDAG-NEXT:    v_mov_b32_e32 v12, s24
; SDAG-NEXT:    v_mov_b32_e32 v18, v0
; SDAG-NEXT:    v_mov_b32_e32 v19, v1
; SDAG-NEXT:    v_mov_b32_e32 v20, v2
; SDAG-NEXT:    v_mov_b32_e32 v21, v3
; SDAG-NEXT:    v_mov_b32_e32 v22, v4
; SDAG-NEXT:    v_mov_b32_e32 v23, v5
; SDAG-NEXT:    v_mov_b32_e32 v24, v6
; SDAG-NEXT:    v_mov_b32_e32 v25, v7
; SDAG-NEXT:    v_mov_b32_e32 v26, v8
; SDAG-NEXT:    v_mov_b32_e32 v27, v9
; SDAG-NEXT:    s_nop 1
; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
; SDAG-NEXT:    s_nop 7
; SDAG-NEXT:    s_nop 3
; SDAG-NEXT:    v_mov_b32_e32 v0, v12
; SDAG-NEXT:    v_mov_b32_e32 v1, v13
; SDAG-NEXT:    v_mov_b32_e32 v2, v14
; SDAG-NEXT:    v_mov_b32_e32 v3, v15
; SDAG-NEXT:    v_mov_b32_e32 v4, v16
; SDAG-NEXT:    v_mov_b32_e32 v5, v17
; SDAG-NEXT:    v_mov_b32_e32 v6, v18
; SDAG-NEXT:    v_mov_b32_e32 v7, v19
; SDAG-NEXT:    v_mov_b32_e32 v8, v20
; SDAG-NEXT:    v_mov_b32_e32 v9, v21
; SDAG-NEXT:    v_mov_b32_e32 v10, v22
; SDAG-NEXT:    v_mov_b32_e32 v11, v23
; SDAG-NEXT:    v_mov_b32_e32 v12, v24
; SDAG-NEXT:    v_mov_b32_e32 v13, v25
; SDAG-NEXT:    v_mov_b32_e32 v14, v26
; SDAG-NEXT:    v_mov_b32_e32 v15, v27
; SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v11, v0
; GISEL-NEXT:    v_mov_b32_e32 v12, v1
; GISEL-NEXT:    v_mov_b32_e32 v13, v2
; GISEL-NEXT:    v_mov_b32_e32 v14, v3
; GISEL-NEXT:    v_mov_b32_e32 v15, v4
; GISEL-NEXT:    v_mov_b32_e32 v16, v5
; GISEL-NEXT:    v_mov_b32_e32 v17, v6
; GISEL-NEXT:    v_mov_b32_e32 v18, v7
; GISEL-NEXT:    v_mov_b32_e32 v19, v8
; GISEL-NEXT:    v_mov_b32_e32 v20, v9
; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
; GISEL-NEXT:    v_mov_b32_e32 v21, v10
; GISEL-NEXT:    v_mov_b32_e32 v0, s24
; GISEL-NEXT:    v_mov_b32_e32 v1, s25
; GISEL-NEXT:    v_mov_b32_e32 v2, s26
; GISEL-NEXT:    v_mov_b32_e32 v3, s27
; GISEL-NEXT:    v_mov_b32_e32 v4, s28
; GISEL-NEXT:    v_mov_b32_e32 v5, s29
; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
; GISEL-NEXT:    v_mov_b32_e32 v6, v11
; GISEL-NEXT:    v_mov_b32_e32 v7, v12
; GISEL-NEXT:    v_mov_b32_e32 v8, v13
; GISEL-NEXT:    v_mov_b32_e32 v9, v14
; GISEL-NEXT:    v_mov_b32_e32 v10, v15
; GISEL-NEXT:    v_mov_b32_e32 v11, v16
; GISEL-NEXT:    v_mov_b32_e32 v12, v17
; GISEL-NEXT:    v_mov_b32_e32 v13, v18
; GISEL-NEXT:    v_mov_b32_e32 v14, v19
; GISEL-NEXT:    v_mov_b32_e32 v15, v20
; GISEL-NEXT:    s_nop 1
; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT:    s_setpc_b64 s[30:31]
  ; Same call as the plain variant; only the inreg argument attributes differ.
  %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
  ret <16 x float> %result
}

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
