cmd/gc: add division rewrite to walk pass.

This allows 5g and 8g to benefit from the rewrite as shifts or magic multiplies. The 64-bit arithmetic is not handled there, and left in 6g. Update #2230. R=golang-dev, dave, mtj, iant, rsc CC=golang-dev https://golang.org/cl/6819123
2024-09-29 14:26:50 +00:00 · 2012-11-26 23:45:22 +01:00 · 2012-11-26 23:45:22 +01:00 · 4cc9de9147
commit 4cc9de9147
parent 4a1b814668
13 changed files with 466 additions and 188 deletions
--- a/src/cmd/5g/cgen.c
+++ b/src/cmd/5g/cgen.c
@ -263,6 +263,10 @@ cgen(Node *n, Node *res)
 		a = optoas(n->op, nl->type);
 		goto abop;

+	case OHMUL:
+		cgen_hmul(nl, nr, res);
+		break;
+
 	case OLROT:
 	case OLSH:
 	case ORSH:
--- a/src/cmd/5g/gg.h
+++ b/src/cmd/5g/gg.h
@ -102,6 +102,7 @@ Prog*	gshift(int as, Node *lhs, int32 stype, int32 sval, Node *rhs);
 Prog *	gregshift(int as, Node *lhs, int32 stype, Node *reg, Node *rhs);
 void	naddr(Node*, Addr*, int);
 void	cgen_aret(Node*, Node*);
+void	cgen_hmul(Node*, Node*, Node*);
 void	cgen_shift(int, int, Node*, Node*, Node*);
 int	componentgen(Node*, Node*);

--- a/src/cmd/5g/ggen.c
+++ b/src/cmd/5g/ggen.c
@ -473,6 +473,62 @@ samereg(Node *a, Node *b)
 	return 1;
 }

+/*
+ * generate high multiply
+ *  res = (nl * nr) >> w, where w is the bit width of the operand type
+ */
+void
+cgen_hmul(Node *nl, Node *nr, Node *res)
+{
+	int w;
+	Node n1, n2, *tmp;
+	Type *t;
+	Prog *p;
+
+	if(nl->ullman < nr->ullman) {
+		tmp = nl;
+		nl = nr;
+		nr = tmp;
+	}
+	t = nl->type;
+	w = t->width * 8;
+	regalloc(&n1, t, res);
+	cgen(nl, &n1);
+	regalloc(&n2, t, N);
+	cgen(nr, &n2);
+	switch(simtype[t->etype]) {
+	case TINT8:
+	case TINT16:
+		gins(optoas(OMUL, t), &n2, &n1);
+		gshift(AMOVW, &n1, SHIFT_AR, w, &n1);
+		break;
+	case TUINT8:
+	case TUINT16:
+		gins(optoas(OMUL, t), &n2, &n1);
+		gshift(AMOVW, &n1, SHIFT_LR, w, &n1);
+		break;
+	case TINT32:
+	case TUINT32:
+		// perform a long multiplication (signed or unsigned variant).
+		if(issigned[t->etype])
+			p = gins(AMULL, &n2, N);
+		else
+			p = gins(AMULLU, &n2, N);
+		// n2 * n1 -> (n1 n2); n1 is the half stored to res below.
+		p->reg = n1.val.u.reg;
+		p->to.type = D_REGREG;
+		p->to.reg = n1.val.u.reg;
+		p->to.offset = n2.val.u.reg;
+		break;
+	default:
+		fatal("cgen_hmul %T", t);
+		break;
+	}
+	cgen(&n1, res);
+	regfree(&n1);
+	regfree(&n2);
+}
+
 /*
 * generate shift according to op, one of:
 *	res = nl << nr
--- a/src/cmd/5g/peep.c
+++ b/src/cmd/5g/peep.c
@ -1056,6 +1056,7 @@ copyu(Prog *p, Adr *v, Adr *s)
 		return 0;

 	case AMULLU:	/* read, read, write, write */
+	case AMULL:
 	case AMULA:
 	case AMVN:
 		return 2;
--- a/src/cmd/6g/cgen.c
+++ b/src/cmd/6g/cgen.c
@ -257,6 +257,10 @@ cgen(Node *n, Node *res)
 		a = optoas(n->op, nl->type);
 		goto abop;

+	case OHMUL:
+		cgen_hmul(nl, nr, res);
+		break;
+
 	case OCONV:
 		if(n->type->width > nl->type->width) {
 			// If loading from memory, do conversion during load,
@ -528,7 +532,7 @@ cgenr(Node *n, Node *a, Node *res)
 		fatal("cgenr on fat node");

 	if(n->addable) {
-		regalloc(a, types[tptr], res);
+		regalloc(a, n->type, res);
 		gmove(n, a);
 		return;
 	}
--- a/src/cmd/6g/gg.h
+++ b/src/cmd/6g/gg.h
@ -71,6 +71,7 @@ void	cgen_proc(Node*, int);
 void	cgen_callret(Node*, Node*);
 void	cgen_div(int, Node*, Node*, Node*);
 void	cgen_bmul(int, Node*, Node*, Node*);
+void	cgen_hmul(Node*, Node*, Node*);
 void	cgen_shift(int, int, Node*, Node*, Node*);
 void	cgen_dcl(Node*);
 int	needconvert(Type*, Type*);
@ -86,6 +87,7 @@ void	clearslim(Node*);
 */
 void	agen(Node*, Node*);
 void	agenr(Node*, Node*, Node*);
+void	cgenr(Node*, Node*, Node*);
 void	igen(Node*, Node*, Node*);
 vlong	fieldoffset(Type*, Node*);
 void	sgen(Node*, Node*, int64);
--- a/src/cmd/6g/ggen.c
+++ b/src/cmd/6g/ggen.c
@ -601,134 +601,21 @@ restx(Node *x, Node *oldx)
 void
 cgen_div(int op, Node *nl, Node *nr, Node *res)
 {
-	Node n1, n2, n3, savl, savr;
-	Node ax, dx, oldax, olddx;
-	int n, w, s, a;
+	Node n1, n2, n3;
+	int w, a;
 	Magic m;

-	if(nl->ullman >= UINF) {
-		tempname(&savl, nl->type);
-		cgen(nl, &savl);
-		nl = &savl;
-	}
-	if(nr->ullman >= UINF) {
-		tempname(&savr, nr->type);
-		cgen(nr, &savr);
-		nr = &savr;
-	}
-
 	if(nr->op != OLITERAL)
 		goto longdiv;
-
-	// special cases of mod/div
-	// by a constant
 	w = nl->type->width*8;
-	s = 0;
-	n = powtwo(nr);
-	if(n >= 1000) {
-		// negative power of 2
-		s = 1;
-		n -= 1000;
-	}

-	if(n+1 >= w) {
-		// just sign bit
-		goto longdiv;
-	}
-
-	if(n < 0)
-		goto divbymul;
-	switch(n) {
-	case 0:
-		// divide by 1
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(op == OMOD) {
-			gins(optoas(OXOR, nl->type), &n1, &n1);
-		} else
-		if(s)
-			gins(optoas(OMINUS, nl->type), N, &n1);
-		gmove(&n1, res);
-		regfree(&n1);
-		return;
-	case 1:
-		// divide by 2
-		if(op == OMOD) {
-			if(issigned[nl->type->etype])
-				goto longmod;
-			regalloc(&n1, nl->type, res);
-			cgen(nl, &n1);
-			nodconst(&n2, nl->type, 1);
-			gins(optoas(OAND, nl->type), &n2, &n1);
-			gmove(&n1, res);
-			regfree(&n1);
-			return;
-		}
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(!issigned[nl->type->etype])
-			break;
-
-		// develop -1 iff nl is negative
-		regalloc(&n2, nl->type, N);
-		gmove(&n1, &n2);
-		nodconst(&n3, nl->type, w-1);
-		gins(optoas(ORSH, nl->type), &n3, &n2);
-		gins(optoas(OSUB, nl->type), &n2, &n1);
-		regfree(&n2);
-		break;
-	default:
-		if(op == OMOD) {
-			if(issigned[nl->type->etype])
-				goto longmod;
-			regalloc(&n1, nl->type, res);
-			cgen(nl, &n1);
-			nodconst(&n2, nl->type, mpgetfix(nr->val.u.xval)-1);
-			if(!smallintconst(&n2)) {
-				regalloc(&n3, nl->type, N);
-				gmove(&n2, &n3);
-				gins(optoas(OAND, nl->type), &n3, &n1);
-				regfree(&n3);
-			} else
-				gins(optoas(OAND, nl->type), &n2, &n1);
-			gmove(&n1, res);
-			regfree(&n1);
-			return;
-		}
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(!issigned[nl->type->etype])
-			break;
-
-		// develop (2^k)-1 iff nl is negative
-		regalloc(&n2, nl->type, N);
-		gmove(&n1, &n2);
-		nodconst(&n3, nl->type, w-1);
-		gins(optoas(ORSH, nl->type), &n3, &n2);
-		nodconst(&n3, nl->type, w-n);
-		gins(optoas(ORSH, tounsigned(nl->type)), &n3, &n2);
-		gins(optoas(OADD, nl->type), &n2, &n1);
-		regfree(&n2);
-		break;
-	}
-	nodconst(&n2, nl->type, n);
-	gins(optoas(ORSH, nl->type), &n2, &n1);
-	if(s)
-		gins(optoas(OMINUS, nl->type), N, &n1);
-	gmove(&n1, res);
-	regfree(&n1);
-	return;
-
-divbymul:
+	// Front end handled 32-bit division. We only need to handle 64-bit.
 	// try to do division by multiply by (2^w)/d
 	// see hacker's delight chapter 10
 	switch(simtype[nl->type->etype]) {
 	default:
 		goto longdiv;

-	case TUINT8:
-	case TUINT16:
-	case TUINT32:
 	case TUINT64:
 		m.w = w;
 		m.ud = mpgetfix(nr->val.u.xval);
@ -738,47 +625,28 @@ divbymul:
 		if(op == OMOD)
 			goto longmod;

-		regalloc(&n1, nl->type, N);
-		cgen(nl, &n1);				// num -> reg(n1)
-
-		savex(D_AX, &ax, &oldax, res, nl->type);
-		savex(D_DX, &dx, &olddx, res, nl->type);
-
+		cgenr(nl, &n1, N);
 		nodconst(&n2, nl->type, m.um);
-		gmove(&n2, &ax);			// const->ax
-
-		gins(optoas(OHMUL, nl->type), &n1, N);	// imul reg
-		if(w == 8) {
-			// fix up 8-bit multiply
-			Node ah, dl;
-			nodreg(&ah, types[TUINT8], D_AH);
-			nodreg(&dl, types[TUINT8], D_DL);
-			gins(AMOVB, &ah, &dl);
-		}
+		regalloc(&n3, nl->type, res);
+		cgen_hmul(&n1, &n2, &n3);

 		if(m.ua) {
 			// need to add numerator accounting for overflow
-			gins(optoas(OADD, nl->type), &n1, &dx);
+			gins(optoas(OADD, nl->type), &n1, &n3);
 			nodconst(&n2, nl->type, 1);
-			gins(optoas(ORROTC, nl->type), &n2, &dx);
+			gins(optoas(ORROTC, nl->type), &n2, &n3);
 			nodconst(&n2, nl->type, m.s-1);
-			gins(optoas(ORSH, nl->type), &n2, &dx);
+			gins(optoas(ORSH, nl->type), &n2, &n3);
 		} else {
 			nodconst(&n2, nl->type, m.s);
-			gins(optoas(ORSH, nl->type), &n2, &dx);	// shift dx
+			gins(optoas(ORSH, nl->type), &n2, &n3);	// shift dx
 		}

-
+		gmove(&n3, res);
 		regfree(&n1);
-		gmove(&dx, res);
-
-		restx(&ax, &oldax);
-		restx(&dx, &olddx);
+		regfree(&n3);
 		return;

-	case TINT8:
-	case TINT16:
-	case TINT32:
 	case TINT64:
 		m.w = w;
 		m.sd = mpgetfix(nr->val.u.xval);
@ -788,47 +656,32 @@ divbymul:
 		if(op == OMOD)
 			goto longmod;

-		regalloc(&n1, nl->type, N);
-		cgen(nl, &n1);				// num -> reg(n1)
-
-		savex(D_AX, &ax, &oldax, res, nl->type);
-		savex(D_DX, &dx, &olddx, res, nl->type);
-
+		cgenr(nl, &n1, res);
 		nodconst(&n2, nl->type, m.sm);
-		gmove(&n2, &ax);			// const->ax
-
-		gins(optoas(OHMUL, nl->type), &n1, N);	// imul reg
-		if(w == 8) {
-			// fix up 8-bit multiply
-			Node ah, dl;
-			nodreg(&ah, types[TUINT8], D_AH);
-			nodreg(&dl, types[TUINT8], D_DL);
-			gins(AMOVB, &ah, &dl);
-		}
+		regalloc(&n3, nl->type, N);
+		cgen_hmul(&n1, &n2, &n3);

 		if(m.sm < 0) {
 			// need to add numerator
-			gins(optoas(OADD, nl->type), &n1, &dx);
+			gins(optoas(OADD, nl->type), &n1, &n3);
 		}

 		nodconst(&n2, nl->type, m.s);
-		gins(optoas(ORSH, nl->type), &n2, &dx);	// shift dx
+		gins(optoas(ORSH, nl->type), &n2, &n3);	// shift n3

 		nodconst(&n2, nl->type, w-1);
 		gins(optoas(ORSH, nl->type), &n2, &n1);	// -1 iff num is neg
-		gins(optoas(OSUB, nl->type), &n1, &dx);	// added
+		gins(optoas(OSUB, nl->type), &n1, &n3);	// added

 		if(m.sd < 0) {
 			// this could probably be removed
 			// by factoring it into the multiplier
-			gins(optoas(OMINUS, nl->type), N, &dx);
+			gins(optoas(OMINUS, nl->type), N, &n3);
 		}

+		gmove(&n3, res);
 		regfree(&n1);
-		gmove(&dx, res);
-
-		restx(&ax, &oldax);
-		restx(&dx, &olddx);
+		regfree(&n3);
 		return;
 	}
 	goto longdiv;
@ -864,6 +717,42 @@ longmod:
 	regfree(&n2);
 }

+/*
+ * generate high multiply:
+ *   res = (nl*nr) >> width; one operand is moved to AX, the result is read from DX
+ */
+void
+cgen_hmul(Node *nl, Node *nr, Node *res)
+{
+	Type *t;
+	int a;
+	Node n1, n2, ax, dx, *tmp;
+
+	t = nl->type;
+	a = optoas(OHMUL, t);
+	if(nl->ullman < nr->ullman) {
+		tmp = nl;
+		nl = nr;
+		nr = tmp;
+	}
+	cgenr(nl, &n1, res);
+	cgenr(nr, &n2, N);
+	nodreg(&ax, t, D_AX);
+	gmove(&n1, &ax);
+	gins(a, &n2, N);
+	regfree(&n2);
+	regfree(&n1);
+
+	if(t->width == 1) {
+		// byte multiply behaves differently: copy AH into DL so that DX can be read below.
+		nodreg(&ax, t, D_AH);
+		nodreg(&dx, t, D_DL);
+		gmove(&ax, &dx);
+	}
+	nodreg(&dx, t, D_DX);
+	gmove(&dx, res);
+}
+
 /*
 * generate shift according to op, one of:
 *	res = nl << nr
--- a/src/cmd/6g/peep.c
+++ b/src/cmd/6g/peep.c
@ -536,8 +536,10 @@ elimshortmov(Reg *r)
 					p->as = ASHLQ;
 					break;
 				}
-			} else {
-				// explicit zero extension
+			} else if(p->from.type >= D_NONE) {
+				// explicit zero extension, but don't
+				// do that if source is a byte register
+				// (only AH can occur and it's forbidden).
 				switch(p->as) {
 				case AMOVB:
 					p->as = AMOVBQZX;
--- a/src/cmd/8g/cgen.c
+++ b/src/cmd/8g/cgen.c
@ -250,6 +250,10 @@ cgen(Node *n, Node *res)
 		a = optoas(n->op, nl->type);
 		goto abop;

+	case OHMUL:
+		cgen_hmul(nl, nr, res);
+		break;
+
 	case OCONV:
 		if(eqtype(n->type, nl->type) || noconv(n->type, nl->type)) {
 			cgen(nl, res);
--- a/src/cmd/8g/gg.h
+++ b/src/cmd/8g/gg.h
@ -83,6 +83,7 @@ void	cgen_proc(Node*, int);
 void	cgen_callret(Node*, Node*);
 void	cgen_div(int, Node*, Node*, Node*);
 void	cgen_bmul(int, Node*, Node*, Node*);
+void	cgen_hmul(Node*, Node*, Node*);
 void	cgen_shift(int, int, Node*, Node*, Node*);
 void	cgen_dcl(Node*);
 int	needconvert(Type*, Type*);
--- a/src/cmd/8g/ggen.c
+++ b/src/cmd/8g/ggen.c
@ -776,3 +776,39 @@ cgen_bmul(int op, Node *nl, Node *nr, Node *res)
 	regfree(&n1);
 }

+/*
+ * generate high multiply:
+ *   res = (nl*nr) >> width; nr is moved to AX, the result is read from DX
+ */
+void
+cgen_hmul(Node *nl, Node *nr, Node *res)
+{
+	Type *t;
+	int a;
+	Node n1, n2, ax, dx;
+
+	t = nl->type;
+	a = optoas(OHMUL, t);
+	// gen nl in n1, a temporary obtained from tempname.
+	tempname(&n1, t);
+	cgen(nl, &n1);
+	// gen nr in n2, a register obtained from regalloc.
+	regalloc(&n2, t, res);
+	cgen(nr, &n2);
+
+	// multiply: nr goes to AX, nl (n1) is the instruction operand.
+	nodreg(&ax, t, D_AX);
+	gmove(&n2, &ax);
+	gins(a, &n1, N);
+	regfree(&n2);
+
+	if(t->width == 1) {
+		// byte multiply behaves differently: copy AH into DL so that DX can be read below.
+		nodreg(&ax, t, D_AH);
+		nodreg(&dx, t, D_DL);
+		gmove(&ax, &dx);
+	}
+	nodreg(&dx, t, D_DX);
+	gmove(&dx, res);
+}
+
--- a/src/cmd/8g/gsubr.c
+++ b/src/cmd/8g/gsubr.c
@ -611,22 +611,38 @@ optoas(int op, Type *t)
 		a = ASARL;
 		break;

+	case CASE(OHMUL, TINT8):
 	case CASE(OMUL, TINT8):
 	case CASE(OMUL, TUINT8):
 		a = AIMULB;
 		break;

+	case CASE(OHMUL, TINT16):
 	case CASE(OMUL, TINT16):
 	case CASE(OMUL, TUINT16):
 		a = AIMULW;
 		break;

+	case CASE(OHMUL, TINT32):
 	case CASE(OMUL, TINT32):
 	case CASE(OMUL, TUINT32):
 	case CASE(OMUL, TPTR32):
 		a = AIMULL;
 		break;

+	case CASE(OHMUL, TUINT8):
+		a = AMULB;
+		break;
+
+	case CASE(OHMUL, TUINT16):
+		a = AMULW;
+		break;
+
+	case CASE(OHMUL, TUINT32):
+	case CASE(OHMUL, TPTR32):
+		a = AMULL;
+		break;
+
 	case CASE(ODIV, TINT8):
 	case CASE(OMOD, TINT8):
 		a = AIDIVB;
--- a/src/cmd/gc/walk.c
+++ b/src/cmd/gc/walk.c
@ -24,6 +24,7 @@ static	Node*	append(Node*, NodeList**);
 static	Node*	sliceany(Node*, NodeList**);
 static	void	walkcompare(Node**, NodeList**);
 static	void	walkrotate(Node**);
+static	void	walkdiv(Node**, NodeList**);
 static	int	bounded(Node*, int64);
 static	Mpint	mpzero;

@ -481,6 +482,7 @@ walkexpr(Node **np, NodeList **init)
 	case OAND:
 	case OSUB:
 	case OMUL:
+	case OHMUL:
 	case OLT:
 	case OLE:
 	case OGE:
@ -893,7 +895,7 @@ walkexpr(Node **np, NodeList **init)
 		 * on 386, rewrite float ops into l = l op r.
 		 * everywhere, rewrite map ops into l = l op r.
 		 * everywhere, rewrite string += into l = l op r.
-		 * everywhere, rewrite complex /= into l = l op r.
+		 * everywhere, rewrite integer/complex /= into l = l op r.
 		 * TODO(rsc): Maybe this rewrite should be done always?
 		 */
 		et = n->left->type->etype;
@ -901,7 +903,8 @@ walkexpr(Node **np, NodeList **init)
 		   (thechar == '8' && isfloat[et]) ||
 		   l->op == OINDEXMAP ||
 		   et == TSTRING ||
-		   (iscomplex[et] && n->etype == ODIV)) {
+		   (!isfloat[et] && n->etype == ODIV) ||
+		   n->etype == OMOD) {
 			l = safeexpr(n->left, init);
 			a = l;
 			if(a->op == OINDEXMAP) {
@ -945,10 +948,20 @@ walkexpr(Node **np, NodeList **init)
 			n = conv(n, t);
 			goto ret;
 		}
+		// Nothing to do for float divisions.
+		if(isfloat[et])
+			goto ret;
+
+		// Try rewriting as shifts or magic multiplies.
+		walkdiv(&n, init);
+
 		/*
-		 * rewrite div and mod into function calls
+		 * rewrite 64-bit div and mod into function calls
 		 * on 32-bit architectures.
 		 */
+		switch(n->op) {
+		case OMOD:
+		case ODIV:
 			if(widthptr > 4 || (et != TUINT64 && et != TINT64))
 				goto ret;
 			if(et == TINT64)
@ -961,10 +974,17 @@ walkexpr(Node **np, NodeList **init)
 				strcat(namebuf, "mod");
 			n = mkcall(namebuf, n->type, init,
 				conv(n->left, types[et]), conv(n->right, types[et]));
+			break;
+		default:
+			break;
+		}
 		goto ret;

 	case OINDEX:
 		walkexpr(&n->left, init);
+		// save the original node for bounds checking elision.
+		// If it was a ODIV/OMOD walk might rewrite it.
+		r = n->right;
 		walkexpr(&n->right, init);

 		// if range of type cannot exceed static array bound,
@ -975,13 +995,13 @@ walkexpr(Node **np, NodeList **init)
 		if(t != T && isptr[t->etype])
 			t = t->type;
 		if(isfixedarray(t)) {
-			n->bounded = bounded(n->right, t->bound);
+			n->bounded = bounded(r, t->bound);
 			if(debug['m'] && n->bounded && !isconst(n->right, CTINT))
 				warn("index bounds check elided");
 			if(smallintconst(n->right) && !n->bounded)
 				yyerror("index out of bounds");
 		} else if(isconst(n->left, CTSTR)) {
-			n->bounded = bounded(n->right, n->left->val.u.sval->len);
+			n->bounded = bounded(r, n->left->val.u.sval->len);
 			if(debug['m'] && n->bounded && !isconst(n->right, CTINT))
 				warn("index bounds check elided");
 			if(smallintconst(n->right)) {
@ -2863,6 +2883,248 @@ yes:
 	return;
 }

+/*
+ * walkdiv rewrites division or modulo by a constant (*np) as less
+ * expensive operations; *np is left untouched when no rewrite applies.
+ */
+static void
+walkdiv(Node **np, NodeList **init)
+{
+	Node *n, *nl, *nr, *nc;
+	Node *n1, *n2, *n3, *n4;
+	int pow; // if >= 0, nr is 1<<pow (or -(1<<pow) when s is set)
+	int s; // 1 if nr is a negative power of two.
+	int w;
+	Type *twide;
+	Magic m;
+
+	n = *np;
+	if(n->right->op != OLITERAL)
+		return;
+	// nr is a constant.
+	nl = cheapexpr(n->left, init);
+	nr = n->right;
+
+	// special cases of mod/div
+	// by a constant
+	w = nl->type->width*8;
+	s = 0;
+	pow = powtwo(nr);
+	if(pow >= 1000) {
+		// negative power of 2
+		s = 1;
+		pow -= 1000;
+	}
+
+	if(pow+1 >= w) {
+		// divisor too large.
+		return;
+	}
+	if(pow < 0) {
+		goto divbymul;
+	}
+
+	switch(pow) {
+	case 0:
+		if(n->op == OMOD) {
+			// nl % 1 is zero.
+			nodconst(n, n->type, 0);
+		} else if(s) {
+			// divide by -1
+			n->op = OMINUS;
+			n->right = N;
+		} else {
+			// divide by 1
+			n = nl;
+		}
+		break;
+	default:
+		if(issigned[n->type->etype]) {
+			if(n->op == OMOD) {
+				// signed modulo 2^pow is like ANDing
+				// with the last pow bits, but if nl < 0 the result must be
+				// <= 0: add a bias ε before the AND and subtract it afterwards.
+				nc = nod(OXXX, N, N);
+				nodconst(nc, types[simtype[TUINT]], w-1);
+				n1 = nod(ORSH, nl, nc); // n1 = -1 iff nl < 0.
+				if(pow == 1) {
+					typecheck(&n1, Erv);
+					n1 = cheapexpr(n1, init);
+					// n = (nl+ε)&1 -ε where ε=1 iff nl<0.
+					n2 = nod(OSUB, nl, n1);
+					nc = nod(OXXX, N, N);
+					nodconst(nc, nl->type, 1);
+					n3 = nod(OAND, n2, nc);
+					n = nod(OADD, n3, n1);
+				} else {
+					// n = (nl+ε)&(nr-1) - ε where ε=2^pow-1 iff nl<0.
+					nc = nod(OXXX, N, N);
+					nodconst(nc, nl->type, (1LL<<pow)-1);
+					n2 = nod(OAND, n1, nc); // n2 = 2^pow-1 iff nl<0.
+					typecheck(&n2, Erv);
+					n2 = cheapexpr(n2, init);
+
+					n3 = nod(OADD, nl, n2);
+					n4 = nod(OAND, n3, nc);
+					n = nod(OSUB, n4, n2);
+				}
+				break;
+			} else {
+				// arithmetic right shift does not give the correct rounding.
+				// if nl >= 0, nl >> pow == nl / nr
+				// if nl < 0, we want to add 2^pow-1 first.
+				nc = nod(OXXX, N, N);
+				nodconst(nc, types[simtype[TUINT]], w-1);
+				n1 = nod(ORSH, nl, nc); // n1 = -1 iff nl < 0.
+				if(pow == 1) {
+					// nl+1 is nl-(-1)
+					n->left = nod(OSUB, nl, n1);
+				} else {
+					// Do a logical right shift on -1 to keep pow bits.
+					nc = nod(OXXX, N, N);
+					nodconst(nc, types[simtype[TUINT]], w-pow);
+					n2 = nod(ORSH, conv(n1, tounsigned(nl->type)), nc);
+					n->left = nod(OADD, nl, conv(n2, nl->type));
+				}
+				// n = (nl + 2^pow-1) >> pow
+				n->op = ORSH;
+				nc = nod(OXXX, N, N);
+				nodconst(nc, types[simtype[TUINT]], pow);
+				n->right = nc;
+				n->typecheck = 0;
+			}
+			if(s)
+				n = nod(OMINUS, n, N);
+			break;
+		}
+		nc = nod(OXXX, N, N);
+		if(n->op == OMOD) {
+			// n = nl & (nr-1)
+			n->op = OAND;
+			nodconst(nc, nl->type, mpgetfix(nr->val.u.xval)-1);
+		} else {
+			// n = nl >> pow
+			n->op = ORSH;
+			nodconst(nc, types[simtype[TUINT]], pow);
+		}
+		n->typecheck = 0;
+		n->right = nc;
+		break;
+	}
+	goto ret;
+
+divbymul:
+	// try to do division by multiply by (2^w)/d
+	// see hacker's delight chapter 10
+	// TODO: support 64-bit magic multiply here.
+	m.w = w;
+	if(issigned[nl->type->etype]) {
+		m.sd = mpgetfix(nr->val.u.xval);
+		smagic(&m);
+	} else {
+		m.ud = mpgetfix(nr->val.u.xval);
+		umagic(&m);
+	}
+	if(m.bad)
+		return;
+
+	// We have a quick division method so use it
+	// for modulo too.
+	if(n->op == OMOD)
+		goto longmod;
+
+	switch(simtype[nl->type->etype]) {
+	default:
+		return;
+
+	case TUINT8:
+	case TUINT16:
+	case TUINT32:
+		// n1 = nl * magic >> w (HMUL)
+		nc = nod(OXXX, N, N);
+		nodconst(nc, nl->type, m.um);
+		n1 = nod(OMUL, nl, nc);
+		typecheck(&n1, Erv);
+		n1->op = OHMUL;
+		if(m.ua) {
+			// Select a Go type with (at least) twice the width.
+			switch(simtype[nl->type->etype]) {
+			default:
+				return;
+			case TUINT8:
+			case TUINT16:
+				twide = types[TUINT32];
+				break;
+			case TUINT32:
+				twide = types[TUINT64];
+				break;
+			case TINT8:
+			case TINT16:
+				twide = types[TINT32];
+				break;
+			case TINT32:
+				twide = types[TINT64];
+				break;
+			}
+
+			// add numerator (might overflow).
+			// n2 = (n1 + nl)
+			n2 = nod(OADD, conv(n1, twide), conv(nl, twide));
+
+			// shift by m.s
+			nc = nod(OXXX, N, N);
+			nodconst(nc, types[TUINT], m.s);
+			n = conv(nod(ORSH, n2, nc), nl->type);
+		} else {
+			// n = n1 >> m.s
+			nc = nod(OXXX, N, N);
+			nodconst(nc, types[TUINT], m.s);
+			n = nod(ORSH, n1, nc);
+		}
+		break;
+
+	case TINT8:
+	case TINT16:
+	case TINT32:
+		// n1 = nl * magic >> w
+		nc = nod(OXXX, N, N);
+		nodconst(nc, nl->type, m.sm);
+		n1 = nod(OMUL, nl, nc);
+		typecheck(&n1, Erv);
+		n1->op = OHMUL;
+		if(m.sm < 0) {
+			// add the numerator.
+			n1 = nod(OADD, n1, nl);
+		}
+		// shift by m.s
+		nc = nod(OXXX, N, N);
+		nodconst(nc, types[TUINT], m.s);
+		n2 = conv(nod(ORSH, n1, nc), nl->type);
+		// add 1 iff nl is negative.
+		nc = nod(OXXX, N, N);
+		nodconst(nc, types[TUINT], w-1);
+		n3 = nod(ORSH, nl, nc); // n3 = -1 iff nl is negative.
+		n = nod(OSUB, n2, n3);
+		// apply sign.
+		if(m.sd < 0)
+			n = nod(OMINUS, n, N);
+		break;
+	}
+	goto ret;
+
+longmod:
+	// rewrite as A%B = A - (A/B*B).
+	n1 = nod(ODIV, nl, nr);
+	n2 = nod(OMUL, n1, nr);
+	n = nod(OSUB, nl, n2);
+	goto ret;
+
+ret:
+	typecheck(&n, Erv);
+	walkexpr(&n, init);
+	*np = n;
+}
+
 // return 1 if integer n must be in range [0, max), 0 otherwise
 static int
 bounded(Node *n, int64 max)