diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 5c03e54d78..ac15ab9b69 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -137,6 +137,7 @@ func LastIndexByte(s []byte, c byte) int { // If r is [utf8.RuneError], it returns the first instance of any // invalid UTF-8 byte sequence. func IndexRune(s []byte, r rune) int { + const haveFastIndex = bytealg.MaxBruteForce > 0 switch { case 0 <= r && r < utf8.RuneSelf: return IndexByte(s, byte(r)) @@ -152,9 +153,64 @@ func IndexRune(s []byte, r rune) int { case !utf8.ValidRune(r): return -1 default: + // Search for rune r using the last byte of its UTF-8 encoded form. + // The distribution of the last byte is more uniform compared to the + // first byte which has a 78% chance of being [240, 243, 244]. var b [utf8.UTFMax]byte n := utf8.EncodeRune(b[:], r) - return Index(s, b[:n]) + last := n - 1 + i := last + fails := 0 + for i < len(s) { + if s[i] != b[last] { + o := IndexByte(s[i+1:], b[last]) + if o < 0 { + return -1 + } + i += o + 1 + } + // Step backwards comparing bytes. + for j := 1; j < n; j++ { + if s[i-j] != b[last-j] { + goto next + } + } + return i - last + next: + fails++ + i++ + if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) || + (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) { + goto fallback + } + } + return -1 + + fallback: + // Switch to bytealg.Index, if available, or a brute force search when + // IndexByte returns too many false positives. + if haveFastIndex { + if j := bytealg.Index(s[i-last:], b[:n]); j >= 0 { + return i + j - last + } + } else { + // If bytealg.Index is not available a brute force search is + // ~1.5-3x faster than Rabin-Karp since n is small. 
+ c0 := b[last] + c1 := b[last-1] // There are at least 2 chars to match + loop: + for ; i < len(s); i++ { + if s[i] == c0 && s[i-1] == c1 { + for k := 2; k < n; k++ { + if s[i-k] != b[last-k] { + continue loop + } + } + return i - last + } + } + } + return -1 } } diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 637880a4f7..da16882e82 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -197,6 +197,11 @@ var indexTests = []BinOpTest{ {"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1}, // test fallback to Rabin-Karp. {"000000000000000000000000000000000000000000000000000000000000000000000001", "0000000000000000000000000000000000000000000000000000000000000000001", 5}, + // test fallback to IndexRune + {"oxoxoxoxoxoxoxoxoxoxox☺", "☺", 22}, + // invalid UTF-8 byte sequence (must be longer than bytealg.MaxBruteForce to + // test that we don't use IndexRune) + {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx\xed\x9f\xc0", "\xed\x9f\xc0", 105}, } var lastIndexTests = []BinOpTest{ @@ -445,6 +450,31 @@ func TestIndexRune(t *testing.T) { {"some_text=some_value", '=', 9}, {"☺a", 'a', 3}, {"a☻☺b", '☺', 4}, + {"𠀳𠀗𠀾𠁄𠀧𠁆𠁂𠀫𠀖𠀪𠀲𠀴𠁀𠀨𠀿", '𠀿', 56}, + + // 2 bytes + {"ӆ", 'ӆ', 0}, + {"a", 'ӆ', -1}, + {" ӆ", 'ӆ', 2}, + {" a", 'ӆ', -1}, + {strings.Repeat("ц", 64) + "ӆ", 'ӆ', 128}, // test cutover + {strings.Repeat("ц", 64), 'ӆ', -1}, + + // 3 bytes + {"Ꚁ", 'Ꚁ', 0}, + {"a", 'Ꚁ', -1}, + {" Ꚁ", 'Ꚁ', 2}, + {" a", 'Ꚁ', -1}, + {strings.Repeat("Ꙁ", 64) + "Ꚁ", 'Ꚁ', 192}, // test cutover + {strings.Repeat("Ꙁ", 64) + "Ꚁ", '䚀', -1}, // 'Ꚁ' and '䚀' share the same last two bytes + + // 4 bytes + {"𡌀", '𡌀', 0}, + {"a", '𡌀', -1}, + {" 𡌀", '𡌀', 2}, + {" a", '𡌀', -1}, + {strings.Repeat("𡋀", 64) + "𡌀", '𡌀', 256}, // test cutover + {strings.Repeat("𡋀", 64) + "𡌀", '𣌀', -1}, // '𡌀' and '𣌀' share the same last two bytes // RuneError should match any invalid UTF-8 byte sequence. 
{"�", '�', 0}, @@ -458,6 +488,13 @@ func TestIndexRune(t *testing.T) { {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1}, {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1}, + + // Test the cutover to bytealg.Index when it is triggered in + // the middle of a rune that contains consecutive runs of equal bytes. + {"aaaaaKKKK\U000bc104", '\U000bc104', 17}, // cutover: (n + 16) / 8 + {"aaaaaKKKK鄄", '鄄', 17}, + {"aaKKKKKa\U000bc104", '\U000bc104', 18}, // cutover: 4 + n>>4 + {"aaKKKKKa鄄", '鄄', 18}, } for _, tt := range tests { if got := IndexRune([]byte(tt.in), tt.rune); got != tt.want { @@ -605,6 +642,21 @@ func BenchmarkIndexRuneASCII(b *testing.B) { benchBytes(b, indexSizes, bmIndexRuneASCII(IndexRune)) } +func BenchmarkIndexRuneUnicode(b *testing.B) { + b.Run("Latin", func(b *testing.B) { + // Latin is mostly 1, 2, 3 byte runes. + benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Latin, 'é')) + }) + b.Run("Cyrillic", func(b *testing.B) { + // Cyrillic is mostly 2 and 3 byte runes. + benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Cyrillic, 'Ꙁ')) + }) + b.Run("Han", func(b *testing.B) { + // Han consists only of 3 and 4 byte runes. 
+ benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Han, '𠀿')) + }) +} + func bmIndexRuneASCII(index func([]byte, rune) int) func(b *testing.B, n int) { return func(b *testing.B, n int) { buf := bmbuf[0:n] @@ -635,6 +687,61 @@ func bmIndexRune(index func([]byte, rune) int) func(b *testing.B, n int) { } } +func bmIndexRuneUnicode(rt *unicode.RangeTable, needle rune) func(b *testing.B, n int) { + var rs []rune + for _, r16 := range rt.R16 { + for r := rune(r16.Lo); r <= rune(r16.Hi); r += rune(r16.Stride) { + if r != needle { + rs = append(rs, rune(r)) + } + } + } + for _, r32 := range rt.R32 { + for r := rune(r32.Lo); r <= rune(r32.Hi); r += rune(r32.Stride) { + if r != needle { + rs = append(rs, rune(r)) + } + } + } + // Shuffle the runes so that they are not in ascending order. + // The shuffle is deterministic since this is used for benchmarks, + // which need to be repeatable. + rr := rand.New(rand.NewSource(1)) + rr.Shuffle(len(rs), func(i, j int) { + rs[i], rs[j] = rs[j], rs[i] + }) + uchars := string(rs) + + return func(b *testing.B, n int) { + buf := bmbuf[0:n] + o := copy(buf, uchars) + for o < len(buf) { + o += copy(buf[o:], uchars) + } + + // Make space for the needle rune at the end of buf. 
+ m := utf8.RuneLen(needle) + for o := m; o > 0; { + _, sz := utf8.DecodeLastRune(buf) + copy(buf[len(buf)-sz:], "\x00\x00\x00\x00") + buf = buf[:len(buf)-sz] + o -= sz + } + buf = utf8.AppendRune(buf[:n-m], needle) + + n -= m // adjust for rune len + for i := 0; i < b.N; i++ { + j := IndexRune(buf, needle) + if j != n { + b.Fatal("bad index", j) + } + } + for i := range buf { + buf[i] = '\x00' + } + } +} + func BenchmarkEqual(b *testing.B) { b.Run("0", func(b *testing.B) { var buf [4]byte @@ -2077,6 +2184,11 @@ func makeBenchInputHard() []byte { var benchInputHard = makeBenchInputHard() func benchmarkIndexHard(b *testing.B, sep []byte) { + n := Index(benchInputHard, sep) + if n < 0 { + n = len(benchInputHard) + } + b.SetBytes(int64(n)) for i := 0; i < b.N; i++ { Index(benchInputHard, sep) } diff --git a/src/strings/strings.go b/src/strings/strings.go index 0729c4ad42..7eb2de635c 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -125,6 +125,7 @@ func IndexByte(s string, c byte) int { // If r is [utf8.RuneError], it returns the first instance of any // invalid UTF-8 byte sequence. func IndexRune(s string, r rune) int { + const haveFastIndex = bytealg.MaxBruteForce > 0 switch { case 0 <= r && r < utf8.RuneSelf: return IndexByte(s, byte(r)) @@ -138,7 +139,60 @@ func IndexRune(s string, r rune) int { case !utf8.ValidRune(r): return -1 default: - return Index(s, string(r)) + // Search for rune r using the last byte of its UTF-8 encoded form. + // The distribution of the last byte is more uniform compared to the + // first byte which has a 78% chance of being [240, 243, 244]. + rs := string(r) + last := len(rs) - 1 + i := last + fails := 0 + for i < len(s) { + if s[i] != rs[last] { + o := IndexByte(s[i+1:], rs[last]) + if o < 0 { + return -1 + } + i += o + 1 + } + // Step backwards comparing bytes. 
+ for j := 1; j < len(rs); j++ { + if s[i-j] != rs[last-j] { + goto next + } + } + return i - last + next: + fails++ + i++ + if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) || + (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) { + goto fallback + } + } + return -1 + + fallback: + // see comment in ../bytes/bytes.go + if haveFastIndex { + if j := bytealg.IndexString(s[i-last:], string(r)); j >= 0 { + return i + j - last + } + } else { + c0 := rs[last] + c1 := rs[last-1] + loop: + for ; i < len(s); i++ { + if s[i] == c0 && s[i-1] == c1 { + for k := 2; k < len(rs); k++ { + if s[i-k] != rs[last-k] { + continue loop + } + } + return i - last + } + } + } + return -1 } } diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go index acbf3ede7b..39f5f4e355 100644 --- a/src/strings/strings_test.go +++ b/src/strings/strings_test.go @@ -155,6 +155,11 @@ var indexTests = []IndexTest{ // test fallback to Rabin-Karp. {"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22}, {"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1}, + // test fallback to IndexRune + {"oxoxoxoxoxoxoxoxoxoxox☺", "☺", 22}, + // invalid UTF-8 byte sequence (must be longer than bytealg.MaxBruteForce to + // test that we don't use IndexRune) + {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx\xed\x9f\xc0", "\xed\x9f\xc0", 105}, } var lastIndexTests = []IndexTest{ @@ -326,6 +331,37 @@ func TestIndexRune(t *testing.T) { {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1}, {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair {"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1}, + + // 2 bytes + {"ӆ", 'ӆ', 0}, + {"a", 'ӆ', -1}, + {" ӆ", 'ӆ', 2}, + {" a", 'ӆ', -1}, + {Repeat("ц", 64) + "ӆ", 'ӆ', 128}, // test cutover + {Repeat("Ꙁ", 64) + "Ꚁ", '䚀', -1}, // 'Ꚁ' and '䚀' share the same last two bytes + + // 3 bytes + {"Ꚁ", 'Ꚁ', 0}, + {"a", 'Ꚁ', -1}, + {" Ꚁ", 'Ꚁ', 2}, + {" a", 'Ꚁ', -1}, + {Repeat("Ꙁ", 64) + "Ꚁ", 'Ꚁ', 192}, // test cutover + 
{Repeat("𡋀", 64) + "𡌀", '𣌀', -1}, // '𡌀' and '𣌀' share the same last two bytes + + // 4 bytes + {"𡌀", '𡌀', 0}, + {"a", '𡌀', -1}, + {" 𡌀", '𡌀', 2}, + {" a", '𡌀', -1}, + {Repeat("𡋀", 64) + "𡌀", '𡌀', 256}, // test cutover + {Repeat("𡋀", 64), '𡌀', -1}, + + // Test the cutover to bytealg.IndexString when it is triggered in + // the middle of a rune that contains consecutive runs of equal bytes. + {"aaaaaKKKK\U000bc104", '\U000bc104', 17}, // cutover: (n + 16) / 8 + {"aaaaaKKKK鄄", '鄄', 17}, + {"aaKKKKKa\U000bc104", '\U000bc104', 18}, // cutover: 4 + n>>4 + {"aaKKKKKa鄄", '鄄', 18}, } for _, tt := range tests { if got := IndexRune(tt.in, tt.rune); got != tt.want { @@ -333,13 +369,14 @@ func TestIndexRune(t *testing.T) { } } - haystack := "test世界" + // Make sure we trigger the cutover and string(rune) conversion. + haystack := "test" + Repeat("𡋀", 32) + "𡌀" allocs := testing.AllocsPerRun(1000, func() { if i := IndexRune(haystack, 's'); i != 2 { t.Fatalf("'s' at %d; want 2", i) } - if i := IndexRune(haystack, '世'); i != 4 { - t.Fatalf("'世' at %d; want 4", i) + if i := IndexRune(haystack, '𡌀'); i != 132 { + t.Fatalf("'𡌀' at %d; want 4", i) } }) if allocs != 0 && testing.CoverMode() == "" {