From d056c90c871a62c157a0fa0a681d974d10abdaa3 Mon Sep 17 00:00:00 2001 From: himadhith Date: Tue, 9 Dec 2025 00:19:24 -0500 Subject: [PATCH] [NFC][PowerPC] Optimize vector compares for not equal to non zero vectors --- .../PowerPC/optimize-vector-not-equal.ll | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll diff --git a/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll b/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll new file mode 100644 index 0000000000000..bfb9ab3356f48 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/optimize-vector-not-equal.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64 + +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32 + +; The current lowering compares the input vector in register v2 for equality with an all-zero vector in v3. 
The compare result in v2 is then inverted (xxlnor) and masked with a splat of 1, which converts each lane: +; 0xFFFF -> 0 +; 0 -> 1 +; An optimized lowering is to follow this NFC patch. + +define i32 @cols_needed(<4 x i16> %wide.load) { +; POWERPC_64LE-LABEL: cols_needed: +; POWERPC_64LE: # %bb.0: # %entry +; POWERPC_64LE-NEXT: xxlxor v3, v3, v3 +; POWERPC_64LE-NEXT: li r3, 0 +; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3 +; POWERPC_64LE-NEXT: vspltisw v3, 1 +; POWERPC_64LE-NEXT: xxlnor v2, v2, v2 +; POWERPC_64LE-NEXT: vmrglh v2, v2, v2 +; POWERPC_64LE-NEXT: xxland v2, v2, v3 +; POWERPC_64LE-NEXT: xxswapd v3, v2 +; POWERPC_64LE-NEXT: vadduwm v2, v2, v3 +; POWERPC_64LE-NEXT: xxspltw v3, v2, 2 +; POWERPC_64LE-NEXT: vadduwm v2, v2, v3 +; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2 +; POWERPC_64LE-NEXT: blr +; +; POWERPC_64-LABEL: cols_needed: +; POWERPC_64: # %bb.0: # %entry +; POWERPC_64-NEXT: xxlxor v3, v3, v3 +; POWERPC_64-NEXT: li r3, 0 +; POWERPC_64-NEXT: vcmpequh v2, v2, v3 +; POWERPC_64-NEXT: vspltisw v3, 1 +; POWERPC_64-NEXT: xxlnor v2, v2, v2 +; POWERPC_64-NEXT: vmrghh v2, v2, v2 +; POWERPC_64-NEXT: xxland v2, v2, v3 +; POWERPC_64-NEXT: xxswapd v3, v2 +; POWERPC_64-NEXT: vadduwm v2, v2, v3 +; POWERPC_64-NEXT: xxspltw v3, v2, 1 +; POWERPC_64-NEXT: vadduwm v2, v2, v3 +; POWERPC_64-NEXT: vextuwlx r3, r3, v2 +; POWERPC_64-NEXT: blr +; +; POWERPC_32-LABEL: cols_needed: +; POWERPC_32: # %bb.0: # %entry +; POWERPC_32-NEXT: xxlxor v3, v3, v3 +; POWERPC_32-NEXT: vcmpequh v2, v2, v3 +; POWERPC_32-NEXT: vspltisw v3, 1 +; POWERPC_32-NEXT: xxlnor v2, v2, v2 +; POWERPC_32-NEXT: vmrghh v2, v2, v2 +; POWERPC_32-NEXT: xxland v2, v2, v3 +; POWERPC_32-NEXT: xxswapd v3, v2 +; POWERPC_32-NEXT: vadduwm v2, v2, v3 +; POWERPC_32-NEXT: xxspltw v3, v2, 1 +; POWERPC_32-NEXT: vadduwm v2, v2, v3 +; POWERPC_32-NEXT: stxv v2, -16(r1) +; POWERPC_32-NEXT: lwz r3, -16(r1) +; POWERPC_32-NEXT: blr +entry: + %0 = icmp ne <4 x i16> %wide.load, zeroinitializer ; per-lane mask, true where the lane is non-zero + %1 = zext <4 x i1> %0 to <4 x i32> ; widen each i1 to an i32 holding 0 or 1 + %2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) ; add the four lanes together + ret i32 %2 +} + +; 
Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }