cmd/compile: generalize strength reduction of mulq

* This is an improved version of an earlier patch. * Verified with gcc up to 100. * Limited to two instructions based on costs from https://gmplib.org/~tege/x86-timing.pdf Change-Id: Ib7c37de6fd8e0ba554459b15c7409508cbcf6728 Reviewed-on: https://go-review.googlesource.com/21103 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Alexandru Moșoi <alexandru@mosoi.ro> TryBot-Result: Gobot Gobot <gobot@golang.org>
2024-09-21 10:28:27 +00:00 · 2016-03-24 22:46:37 +01:00 · 2016-03-24 22:46:37 +01:00 · d8f1f8d856
commit d8f1f8d856
parent 1624a9c9e7
3 changed files with 319 additions and 6 deletions
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@ -583,14 +583,35 @@
 (CMPB (MOVBconst [c]) x) -> (InvertFlags (CMPBconst x [c]))

 // strength reduction
+// Assumes that the following costs from https://gmplib.org/~tege/x86-timing.pdf:
+//    1 - addq, shlq, leaq, negq
+//    3 - imulq
+// This limits the rewrites to two instructions.
+// TODO: 27, 81
 (MULQconst [-1] x) -> (NEGQ x)
 (MULQconst [0] _) -> (MOVQconst [0])
 (MULQconst [1] x) -> x
 (MULQconst [3] x) -> (LEAQ2 x x)
 (MULQconst [5] x) -> (LEAQ4 x x)
+(MULQconst [7] x) -> (LEAQ8 (NEGQ <v.Type> x) x)
 (MULQconst [9] x) -> (LEAQ8 x x)
-(MULQconst [24] x) -> (SHLQconst [3] (LEAQ2 <v.Type> x x)) // Useful for [][]T accesses
+(MULQconst [11] x) -> (LEAQ2 x (LEAQ4 <v.Type> x x))
+(MULQconst [13] x) -> (LEAQ4 x (LEAQ2 <v.Type> x x))
+(MULQconst [21] x) -> (LEAQ4 x (LEAQ4 <v.Type> x x))
+(MULQconst [25] x) -> (LEAQ8 x (LEAQ2 <v.Type> x x))
+(MULQconst [37] x) -> (LEAQ4 x (LEAQ8 <v.Type> x x))
+(MULQconst [41] x) -> (LEAQ8 x (LEAQ4 <v.Type> x x))
+(MULQconst [73] x) -> (LEAQ8 x (LEAQ8 <v.Type> x x))
+
 (MULQconst [c] x) && isPowerOfTwo(c) -> (SHLQconst [log2(c)] x)
+(MULQconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBQ (SHLQconst <v.Type> [log2(c+1)] x) x)
+(MULQconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (LEAQ1 (SHLQconst <v.Type> [log2(c-1)] x) x)
+(MULQconst [c] x) && isPowerOfTwo(c-2) && c >= 34 -> (LEAQ2 (SHLQconst <v.Type> [log2(c-2)] x) x)
+(MULQconst [c] x) && isPowerOfTwo(c-4) && c >= 68 -> (LEAQ4 (SHLQconst <v.Type> [log2(c-4)] x) x)
+(MULQconst [c] x) && isPowerOfTwo(c-8) && c >= 136 -> (LEAQ8 (SHLQconst <v.Type> [log2(c-8)] x) x)
+(MULQconst [c] x) && c%3 == 0 && isPowerOfTwo(c/3)-> (SHLQconst [log2(c/3)] (LEAQ2 <v.Type> x x))
+(MULQconst [c] x) && c%5 == 0 && isPowerOfTwo(c/5)-> (SHLQconst [log2(c/5)] (LEAQ4 <v.Type> x x))
+(MULQconst [c] x) && c%9 == 0 && isPowerOfTwo(c/9)-> (SHLQconst [log2(c/9)] (LEAQ8 <v.Type> x x))

 // combine add/shift into LEAQ
 (ADDQ x (SHLQconst [3] y)) -> (LEAQ8 x y)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@ -9392,6 +9392,21 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool {
 		v.AddArg(x)
 		return true
 	}
+	// match: (MULQconst [7] x)
+	// cond:
+	// result: (LEAQ8 (NEGQ <v.Type> x) x)
+	for {
+		if v.AuxInt != 7 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ8)
+		v0 := b.NewValue0(v.Line, OpAMD64NEGQ, v.Type)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
 	// match: (MULQconst [9] x)
 	// cond:
 	// result: (LEAQ8 x x)
@ -9405,22 +9420,118 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool {
 		v.AddArg(x)
 		return true
 	}
-	// match: (MULQconst [24] x)
+	// match: (MULQconst [11] x)
 	// cond:
-	// result: (SHLQconst [3] (LEAQ2 <v.Type> x x))
+	// result: (LEAQ2 x (LEAQ4 <v.Type> x x))
 	for {
-		if v.AuxInt != 24 {
+		if v.AuxInt != 11 {
 			break
 		}
 		x := v.Args[0]
-		v.reset(OpAMD64SHLQconst)
-		v.AuxInt = 3
+		v.reset(OpAMD64LEAQ2)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [13] x)
+	// cond:
+	// result: (LEAQ4 x (LEAQ2 <v.Type> x x))
+	for {
+		if v.AuxInt != 13 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ4)
+		v.AddArg(x)
 		v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type)
 		v0.AddArg(x)
 		v0.AddArg(x)
 		v.AddArg(v0)
 		return true
 	}
+	// match: (MULQconst [21] x)
+	// cond:
+	// result: (LEAQ4 x (LEAQ4 <v.Type> x x))
+	for {
+		if v.AuxInt != 21 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ4)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [25] x)
+	// cond:
+	// result: (LEAQ8 x (LEAQ2 <v.Type> x x))
+	for {
+		if v.AuxInt != 25 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ8)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [37] x)
+	// cond:
+	// result: (LEAQ4 x (LEAQ8 <v.Type> x x))
+	for {
+		if v.AuxInt != 37 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ4)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [41] x)
+	// cond:
+	// result: (LEAQ8 x (LEAQ4 <v.Type> x x))
+	for {
+		if v.AuxInt != 41 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ8)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [73] x)
+	// cond:
+	// result: (LEAQ8 x (LEAQ8 <v.Type> x x))
+	for {
+		if v.AuxInt != 73 {
+			break
+		}
+		x := v.Args[0]
+		v.reset(OpAMD64LEAQ8)
+		v.AddArg(x)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
 	// match: (MULQconst [c] x)
 	// cond: isPowerOfTwo(c)
 	// result: (SHLQconst [log2(c)] x)
@ -9435,6 +9546,142 @@ func rewriteValueAMD64_OpAMD64MULQconst(v *Value, config *Config) bool {
 		v.AddArg(x)
 		return true
 	}
+	// match: (MULQconst [c] x)
+	// cond: isPowerOfTwo(c+1) && c >= 15
+	// result: (SUBQ (SHLQconst <v.Type> [log2(c+1)] x) x)
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(isPowerOfTwo(c+1) && c >= 15) {
+			break
+		}
+		v.reset(OpAMD64SUBQ)
+		v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type)
+		v0.AuxInt = log2(c + 1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: isPowerOfTwo(c-1) && c >= 17
+	// result: (LEAQ1 (SHLQconst <v.Type> [log2(c-1)] x) x)
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(isPowerOfTwo(c-1) && c >= 17) {
+			break
+		}
+		v.reset(OpAMD64LEAQ1)
+		v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type)
+		v0.AuxInt = log2(c - 1)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: isPowerOfTwo(c-2) && c >= 34
+	// result: (LEAQ2 (SHLQconst <v.Type> [log2(c-2)] x) x)
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(isPowerOfTwo(c-2) && c >= 34) {
+			break
+		}
+		v.reset(OpAMD64LEAQ2)
+		v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type)
+		v0.AuxInt = log2(c - 2)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: isPowerOfTwo(c-4) && c >= 68
+	// result: (LEAQ4 (SHLQconst <v.Type> [log2(c-4)] x) x)
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(isPowerOfTwo(c-4) && c >= 68) {
+			break
+		}
+		v.reset(OpAMD64LEAQ4)
+		v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type)
+		v0.AuxInt = log2(c - 4)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: isPowerOfTwo(c-8) && c >= 136
+	// result: (LEAQ8 (SHLQconst <v.Type> [log2(c-8)] x) x)
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(isPowerOfTwo(c-8) && c >= 136) {
+			break
+		}
+		v.reset(OpAMD64LEAQ8)
+		v0 := b.NewValue0(v.Line, OpAMD64SHLQconst, v.Type)
+		v0.AuxInt = log2(c - 8)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		v.AddArg(x)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: c%3 == 0 && isPowerOfTwo(c/3)
+	// result: (SHLQconst [log2(c/3)] (LEAQ2 <v.Type> x x))
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(c%3 == 0 && isPowerOfTwo(c/3)) {
+			break
+		}
+		v.reset(OpAMD64SHLQconst)
+		v.AuxInt = log2(c / 3)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ2, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: c%5 == 0 && isPowerOfTwo(c/5)
+	// result: (SHLQconst [log2(c/5)] (LEAQ4 <v.Type> x x))
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(c%5 == 0 && isPowerOfTwo(c/5)) {
+			break
+		}
+		v.reset(OpAMD64SHLQconst)
+		v.AuxInt = log2(c / 5)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ4, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
+	// match: (MULQconst [c] x)
+	// cond: c%9 == 0 && isPowerOfTwo(c/9)
+	// result: (SHLQconst [log2(c/9)] (LEAQ8 <v.Type> x x))
+	for {
+		c := v.AuxInt
+		x := v.Args[0]
+		if !(c%9 == 0 && isPowerOfTwo(c/9)) {
+			break
+		}
+		v.reset(OpAMD64SHLQconst)
+		v.AuxInt = log2(c / 9)
+		v0 := b.NewValue0(v.Line, OpAMD64LEAQ8, v.Type)
+		v0.AddArg(x)
+		v0.AddArg(x)
+		v.AddArg(v0)
+		return true
+	}
 	// match: (MULQconst [c] (MOVQconst [d]))
 	// cond:
 	// result: (MOVQconst [c*d])
--- a/test/strength.go
+++ b/test/strength.go
@ -0,0 +1,45 @@
+// runoutput
+
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Generate test of strength reduction for multiplications
+// with contstants. Especially useful for amd64/386.
+
+package main
+
+import "fmt"
+
+func testMul(fact, bits int) string {
+	n := fmt.Sprintf("testMul_%d_%d", fact, bits)
+	fmt.Printf("func %s(s int%d) {\n", n, bits)
+
+	want := 0
+	for i := 0; i < 200; i++ {
+		fmt.Printf(`	if want, got := int%d(%d), s*%d; want != got {
+		failed = true
+		fmt.Printf("got %d * %%d == %%d, wanted %d\n",  s, got)
+	}
+`, bits, want, i, i, want)
+		want += fact
+	}
+
+	fmt.Printf("}\n")
+	return fmt.Sprintf("%s(%d)", n, fact)
+}
+
+func main() {
+	fmt.Printf("package main\n")
+	fmt.Printf("import \"fmt\"\n")
+	fmt.Printf("var failed = false\n")
+
+	f1 := testMul(17, 32)
+	f2 := testMul(131, 64)
+
+	fmt.Printf("func main() {\n")
+	fmt.Println(f1)
+	fmt.Println(f2)
+	fmt.Printf("if failed {\n	panic(\"multiplication failed\")\n}\n")
+	fmt.Printf("}\n")
+}