Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE

; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64

; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32

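; The current code generation compares the input vector in v2 against a zero
; vector in v3 (vcmpequh), negates the comparison result in v2 (xxlnor), and
; masks it with a splat of 1 (xxland), so each compared element is converted as:
; 0xFFFF (element was zero) -> 0
; 0 (element was non-zero) -> 1
; This NFC patch only records the current output; an optimized sequence is to follow.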

; cols_needed: returns the number of non-zero lanes in a <4 x i16> vector.
; Each lane is compared against zero, the resulting <4 x i1> mask is
; zero-extended to <4 x i32>, and the four lanes are summed with
; llvm.vector.reduce.add.
; The assertions below are autogenerated (see the NOTE line at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define i32 @cols_needed(<4 x i16> %wide.load) {
; POWERPC_64LE-LABEL: cols_needed:
; POWERPC_64LE: # %bb.0: # %entry
; POWERPC_64LE-NEXT: xxlxor v3, v3, v3
; POWERPC_64LE-NEXT: li r3, 0
; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
; POWERPC_64LE-NEXT: vspltisw v3, 1
; POWERPC_64LE-NEXT: xxlnor v2, v2, v2
; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
; POWERPC_64LE-NEXT: xxland v2, v2, v3
; POWERPC_64LE-NEXT: xxswapd v3, v2
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
; POWERPC_64LE-NEXT: blr
;
; POWERPC_64-LABEL: cols_needed:
; POWERPC_64: # %bb.0: # %entry
; POWERPC_64-NEXT: xxlxor v3, v3, v3
; POWERPC_64-NEXT: li r3, 0
; POWERPC_64-NEXT: vcmpequh v2, v2, v3
; POWERPC_64-NEXT: vspltisw v3, 1
; POWERPC_64-NEXT: xxlnor v2, v2, v2
; POWERPC_64-NEXT: vmrghh v2, v2, v2
; POWERPC_64-NEXT: xxland v2, v2, v3
; POWERPC_64-NEXT: xxswapd v3, v2
; POWERPC_64-NEXT: vadduwm v2, v2, v3
; POWERPC_64-NEXT: xxspltw v3, v2, 1
; POWERPC_64-NEXT: vadduwm v2, v2, v3
; POWERPC_64-NEXT: vextuwlx r3, r3, v2
; POWERPC_64-NEXT: blr
;
; POWERPC_32-LABEL: cols_needed:
; POWERPC_32: # %bb.0: # %entry
; POWERPC_32-NEXT: xxlxor v3, v3, v3
; POWERPC_32-NEXT: vcmpequh v2, v2, v3
; POWERPC_32-NEXT: vspltisw v3, 1
; POWERPC_32-NEXT: xxlnor v2, v2, v2
; POWERPC_32-NEXT: vmrghh v2, v2, v2
; POWERPC_32-NEXT: xxland v2, v2, v3
; POWERPC_32-NEXT: xxswapd v3, v2
; POWERPC_32-NEXT: vadduwm v2, v2, v3
; POWERPC_32-NEXT: xxspltw v3, v2, 1
; POWERPC_32-NEXT: vadduwm v2, v2, v3
; POWERPC_32-NEXT: stxv v2, -16(r1)
; POWERPC_32-NEXT: lwz r3, -16(r1)
; POWERPC_32-NEXT: blr
entry:
; Per-lane mask: true where the i16 element is not zero.
%0 = icmp ne <4 x i16> %wide.load, zeroinitializer
; Widen each i1 lane to i32 (true -> 1, false -> 0).
%1 = zext <4 x i1> %0 to <4 x i32>
; Horizontal sum of the four lanes gives the count of non-zero elements.
%2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
ret i32 %2
}

; Reduction intrinsic called by @cols_needed: takes a <4 x i32> vector and
; returns a single i32 (the sum of its lanes).
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0

; Attribute group #0, referenced by the intrinsic declaration above.
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }