diff --git a/src/math/big/float.go b/src/math/big/float.go index eca85d4bb0..f19f21f068 100644 --- a/src/math/big/float.go +++ b/src/math/big/float.go @@ -874,21 +874,43 @@ func (x *Float) Float32() (float32, Accuracy) { emax = bias // 127 largest unbiased exponent (normal) ) - // Float mantissa m is 0.5 <= m < 1.0; compute exponent for float32 mantissa. - e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0 - p := mbits + 1 // precision of normal float + // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. + e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 - // If the exponent is too small, we may have a denormal number - // in which case we have fewer mantissa bits available: recompute - // precision. + // Compute precision p for float32 mantissa. + // If the exponent is too small, we have a denormal number before + // rounding and fewer than p mantissa bits of precision available + // (the exponent remains fixed but the mantissa gets shifted right). + p := mbits + 1 // precision of normal float if e < emin { + // recompute precision p = mbits + 1 - emin + int(e) - // Make sure we have at least 1 bit so that we don't - // lose numbers rounded up to the smallest denormal. - if p < 1 { - p = 1 + // If p == 0, the mantissa of x is shifted so much to the right + // that its msb falls immediately to the right of the float32 + // mantissa space. In other words, if the smallest denormal is + // considered "1.0", for p == 0, the mantissa value m is >= 0.5. + // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. + // If m == 0.5, it is rounded down to even, i.e., 0.0. + // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. + if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { + // underflow to ±0 + if x.neg { + var z float32 + return -z, Above + } + return 0.0, Below + } + // otherwise, round up + // We handle p == 0 explicitly because it's easy and because + // Float.round doesn't support rounding to 0 bits of precision. + if p == 0 { + if x.neg { + return -math.SmallestNonzeroFloat32, Below + } + return math.SmallestNonzeroFloat32, Above } } + // p > 0 // round var r Float @@ -898,12 +920,8 @@ func (x *Float) Float32() (float32, Accuracy) { // Rounding may have caused r to overflow to ±Inf // (rounding never causes underflows to 0). - if r.form == inf { - e = emax + 1 // cause overflow below - } - - // If the exponent is too large, overflow to ±Inf. - if e > emax { + // If the exponent is too large, also overflow to ±Inf. + if r.form == inf || e > emax { // overflow if x.neg { return float32(math.Inf(-1)), Below @@ -921,17 +939,10 @@ func (x *Float) Float32() (float32, Accuracy) { // Rounding may have caused a denormal number to // become normal. Check again. if e < emin { - // denormal number - if e < dmin { - // underflow to ±0 - if x.neg { - var z float32 - return -z, Above - } - return 0.0, Below - } - // bexp = 0 - // recompute precision + // denormal number: recompute precision + // Since rounding may have at best increased precision + // and we have eliminated p <= 0 early, we know p > 0. + // bexp == 0 for denormals p = mbits + 1 - emin + int(e) mant = msb32(r.mant) >> uint(fbits-p) } else { @@ -983,21 +994,43 @@ func (x *Float) Float64() (float64, Accuracy) { emax = bias // 1023 largest unbiased exponent (normal) ) - // Float mantissa m is 0.5 <= m < 1.0; compute exponent for float64 mantissa. - e := x.exp - 1 // exponent for mantissa m with 1.0 <= m < 2.0 - p := mbits + 1 // precision of normal float + // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. + e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 - // If the exponent is too small, we may have a denormal number - // in which case we have fewer mantissa bits available: recompute - // precision. + // Compute precision p for float64 mantissa. + // If the exponent is too small, we have a denormal number before + // rounding and fewer than p mantissa bits of precision available + // (the exponent remains fixed but the mantissa gets shifted right). + p := mbits + 1 // precision of normal float if e < emin { + // recompute precision p = mbits + 1 - emin + int(e) - // Make sure we have at least 1 bit so that we don't - // lose numbers rounded up to the smallest denormal. - if p < 1 { - p = 1 + // If p == 0, the mantissa of x is shifted so much to the right + // that its msb falls immediately to the right of the float64 + // mantissa space. In other words, if the smallest denormal is + // considered "1.0", for p == 0, the mantissa value m is >= 0.5. + // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. + // If m == 0.5, it is rounded down to even, i.e., 0.0. + // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. + if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { + // underflow to ±0 + if x.neg { + var z float64 + return -z, Above + } + return 0.0, Below + } + // otherwise, round up + // We handle p == 0 explicitly because it's easy and because + // Float.round doesn't support rounding to 0 bits of precision. + if p == 0 { + if x.neg { + return -math.SmallestNonzeroFloat64, Below + } + return math.SmallestNonzeroFloat64, Above } } + // p > 0 // round var r Float @@ -1007,17 +1040,13 @@ func (x *Float) Float64() (float64, Accuracy) { // Rounding may have caused r to overflow to ±Inf // (rounding never causes underflows to 0). - if r.form == inf { - e = emax + 1 // cause overflow below - } - - // If the exponent is too large, overflow to ±Inf. - if e > emax { + // If the exponent is too large, also overflow to ±Inf. + if r.form == inf || e > emax { // overflow if x.neg { - return math.Inf(-1), Below + return float64(math.Inf(-1)), Below } - return math.Inf(+1), Above + return float64(math.Inf(+1)), Above } // e <= emax @@ -1030,17 +1059,10 @@ func (x *Float) Float64() (float64, Accuracy) { // Rounding may have caused a denormal number to // become normal. Check again. if e < emin { - // denormal number - if e < dmin { - // underflow to ±0 - if x.neg { - var z float64 - return -z, Above - } - return 0.0, Below - } - // bexp = 0 - // recompute precision + // denormal number: recompute precision + // Since rounding may have at best increased precision + // and we have eliminated p <= 0 early, we know p > 0. + // bexp == 0 for denormals p = mbits + 1 - emin + int(e) mant = msb64(r.mant) >> uint(fbits-p) } else { diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go index 6fb44026de..464619b338 100644 --- a/src/math/big/float_test.go +++ b/src/math/big/float_test.go @@ -829,7 +829,7 @@ func TestFloatFloat32(t *testing.T) { }{ {"0", 0, Exact}, - // underflow + // underflow to zero {"1e-1000", 0, Below}, {"0x0.000002p-127", 0, Below}, {"0x.0000010p-126", 0, Below}, @@ -843,25 +843,39 @@ func TestFloatFloat32(t *testing.T) { {"1p-149", math.SmallestNonzeroFloat32, Exact}, {"0x.fffffep-126", math.Float32frombits(0x7fffff), Exact}, // largest denormal - // special cases (see issue 14553) - {"0x0.bp-149", math.Float32frombits(0x000000000), Below}, // ToNearestEven rounds down (to even) - {"0x0.cp-149", math.Float32frombits(0x000000001), Above}, + // special denormal cases (see issues 14553, 14651) + {"0x0.0000001p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero + {"0x0.0000008p-126", math.Float32frombits(0x00000000), Below}, // underflow to zero + {"0x0.0000010p-126", math.Float32frombits(0x00000000), Below}, // rounded down to even + {"0x0.0000011p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal + {"0x0.0000018p-126", math.Float32frombits(0x00000001), Above}, // rounded up to smallest denormal - {"0x1.0p-149", math.Float32frombits(0x000000001), Exact}, + {"0x1.0000000p-149", math.Float32frombits(0x00000001), Exact}, // smallest denormal + {"0x0.0000020p-126", math.Float32frombits(0x00000001), Exact}, // smallest denormal + {"0x0.fffffe0p-126", math.Float32frombits(0x007fffff), Exact}, // largest denormal + {"0x1.0000000p-126", math.Float32frombits(0x00800000), Exact}, // smallest normal + + {"0x0.8p-149", math.Float32frombits(0x000000000), Below}, // rounded down to even + {"0x0.9p-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal + {"0x0.ap-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal + {"0x0.bp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal + {"0x0.cp-149", math.Float32frombits(0x000000001), Above}, // rounded up to smallest denormal + + {"0x1.0p-149", math.Float32frombits(0x000000001), Exact}, // smallest denormal {"0x1.7p-149", math.Float32frombits(0x000000001), Below}, {"0x1.8p-149", math.Float32frombits(0x000000002), Above}, {"0x1.9p-149", math.Float32frombits(0x000000002), Above}, {"0x2.0p-149", math.Float32frombits(0x000000002), Exact}, - {"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // ToNearestEven rounds down (to even) + {"0x2.8p-149", math.Float32frombits(0x000000002), Below}, // rounded down to even {"0x2.9p-149", math.Float32frombits(0x000000003), Above}, {"0x3.0p-149", math.Float32frombits(0x000000003), Exact}, {"0x3.7p-149", math.Float32frombits(0x000000003), Below}, - {"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // ToNearestEven rounds up (to even) + {"0x3.8p-149", math.Float32frombits(0x000000004), Above}, // rounded up to even {"0x4.0p-149", math.Float32frombits(0x000000004), Exact}, - {"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // ToNearestEven rounds down (to even) + {"0x4.8p-149", math.Float32frombits(0x000000004), Below}, // rounded down to even {"0x4.9p-149", math.Float32frombits(0x000000005), Above}, // specific case from issue 14553 @@ -907,7 +921,7 @@ func TestFloatFloat32(t *testing.T) { x := makeFloat(tx) out, acc := x.Float32() if !alike32(out, tout) || acc != tacc { - t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc) + t.Errorf("%s: got %g (%#08x, %s); want %g (%#08x, %s)", tx, out, math.Float32bits(out), acc, test.out, math.Float32bits(test.out), tacc) } // test that x.SetFloat64(float64(f)).Float32() == f @@ -929,21 +943,30 @@ func TestFloatFloat64(t *testing.T) { }{ {"0", 0, Exact}, - // underflow + // underflow to zero {"1e-1000", 0, Below}, {"0x0.0000000000001p-1023", 0, Below}, {"0x0.00000000000008p-1022", 0, Below}, // denormals {"0x0.0000000000000cp-1022", math.SmallestNonzeroFloat64, Above}, // rounded up to smallest denormal - {"0x0.0000000000001p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal + {"0x0.00000000000010p-1022", math.SmallestNonzeroFloat64, Exact}, // smallest denormal {"0x.8p-1073", math.SmallestNonzeroFloat64, Exact}, {"1p-1074", math.SmallestNonzeroFloat64, Exact}, {"0x.fffffffffffffp-1022", math.Float64frombits(0x000fffffffffffff), Exact}, // largest denormal - // special cases (see issue 14553) - {"0x0.bp-1074", math.Float64frombits(0x00000000000000000), Below}, // ToNearestEven rounds down (to even) - {"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above}, + // special denormal cases (see issues 14553, 14651) + {"0x0.00000000000001p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero + {"0x0.00000000000004p-1022", math.Float64frombits(0x00000000000000000), Below}, // underflow to zero + {"0x0.00000000000008p-1022", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even + {"0x0.00000000000009p-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal + {"0x0.0000000000000ap-1022", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal + + {"0x0.8p-1074", math.Float64frombits(0x00000000000000000), Below}, // rounded down to even + {"0x0.9p-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal + {"0x0.ap-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal + {"0x0.bp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal + {"0x0.cp-1074", math.Float64frombits(0x00000000000000001), Above}, // rounded up to smallest denormal {"0x1.0p-1074", math.Float64frombits(0x00000000000000001), Exact}, {"0x1.7p-1074", math.Float64frombits(0x00000000000000001), Below}, @@ -951,15 +974,15 @@ func TestFloatFloat64(t *testing.T) { {"0x1.9p-1074", math.Float64frombits(0x00000000000000002), Above}, {"0x2.0p-1074", math.Float64frombits(0x00000000000000002), Exact}, - {"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // ToNearestEven rounds down (to even) + {"0x2.8p-1074", math.Float64frombits(0x00000000000000002), Below}, // rounded down to even {"0x2.9p-1074", math.Float64frombits(0x00000000000000003), Above}, {"0x3.0p-1074", math.Float64frombits(0x00000000000000003), Exact}, {"0x3.7p-1074", math.Float64frombits(0x00000000000000003), Below}, - {"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // ToNearestEven rounds up (to even) + {"0x3.8p-1074", math.Float64frombits(0x00000000000000004), Above}, // rounded up to even {"0x4.0p-1074", math.Float64frombits(0x00000000000000004), Exact}, - {"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // ToNearestEven rounds down (to even) + {"0x4.8p-1074", math.Float64frombits(0x00000000000000004), Below}, // rounded down to even {"0x4.9p-1074", math.Float64frombits(0x00000000000000005), Above}, // normals @@ -1005,7 +1028,7 @@ func TestFloatFloat64(t *testing.T) { x := makeFloat(tx) out, acc := x.Float64() if !alike64(out, tout) || acc != tacc { - t.Errorf("%s: got %g (%#x, %s); want %g (%#x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc) + t.Errorf("%s: got %g (%#016x, %s); want %g (%#016x, %s)", tx, out, math.Float64bits(out), acc, test.out, math.Float64bits(test.out), tacc) } // test that x.SetFloat64(f).Float64() == f