From 5889f1fa2818f34025ca6e2feecb26928c6e6341 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sat, 16 Jul 2022 14:53:11 -0600 Subject: Re-jigger the KMP search to be more independent of the underlying data types --- lib/diskio/kmp.go | 79 ++++++++++++++-------- lib/diskio/kmp_test.go | 29 ++++---- lib/diskio/seq.go | 68 +++++++++++++++++++ ...6820e3f98383aebadd2af3a22aa248546a228b08451c30d | 3 - ...14e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 | 3 - ...293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 | 3 - ...3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 | 3 - ...7d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d | 3 - ...6820e3f98383aebadd2af3a22aa248546a228b08451c30d | 3 + ...14e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 | 3 + ...293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 | 3 + ...3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 | 3 + ...7d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d | 3 + 13 files changed, 152 insertions(+), 54 deletions(-) create mode 100644 lib/diskio/seq.go delete mode 100644 lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d delete mode 100644 lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 delete mode 100644 lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 delete mode 100644 lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 delete mode 100644 lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d create mode 100644 lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d create mode 100644 lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 create mode 100644 lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 create mode 100644 lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 create mode 100644 lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d (limited to 'lib/diskio') diff --git a/lib/diskio/kmp.go b/lib/diskio/kmp.go index 69a3a51..da19e81 100644 --- a/lib/diskio/kmp.go +++ b/lib/diskio/kmp.go @@ -9,12 +9,31 @@ import ( "io" ) +func mustGet[K ~int64, V any](seq Sequence[K, V], i K) V { + val, err := seq.Get(i) + if err != nil { + panic(err) + } + return val +} + // buildKMPTable takes the string 'substr', and returns a table such // that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and // 'substr[:val] == substr[matchLen-val:matchLen]'. -func buildKMPTable(substr []byte) []int { - table := make([]int, len(substr)) - for j := range table { +func buildKMPTable[K ~int64, V comparable](substr Sequence[K, V]) ([]K, error) { + var substrLen K + for { + if _, err := substr.Get(substrLen); err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + substrLen++ + } + + table := make([]K, substrLen) + for j := K(0); j < substrLen; j++ { if j == 0 { // First entry must always be 0 (in order to // satisfy 'val < matchLen'). @@ -22,62 +41,68 @@ func buildKMPTable(substr []byte) []int { } val := table[j-1] // not a match; go back - for val > 0 && substr[j] != substr[val] { + for val > 0 && mustGet(substr, j) != mustGet(substr, val) { val = table[val-1] } // is a match; go forward - if substr[val] == substr[j] { + if mustGet(substr, val) == mustGet(substr, j) { val++ } table[j] = val } - return table + return table, nil } -// FindAll returns the starting-position of all possibly-overlapping -// occurances of 'substr' in the 'r' stream. +// IndexAll returns the starting-position of all possibly-overlapping +// occurances of 'substr' in the 'str' sequence. // -// Will panic if len(substr)==0. +// Will hop around in 'substr', but will only get the natural sequence +// [0...) in order from 'str'. When hopping around in 'substr' it +// assumes that once it has gotten a given index without error, it can +// continue to do so without error; errors appearing later will cause +// panics. +// +// Will panic if the length of 'substr' is 0. // // Uses the Knuth-Morris-Pratt algorithm. -func FindAll[A ~int64](r io.ByteReader, substr []byte) ([]A, error) { - if len(substr) == 0 { - panic(errors.New("diskio.FindAll: empty substring")) +func IndexAll[K ~int64, V comparable](str, substr Sequence[K, V]) ([]K, error) { + table, err := buildKMPTable(substr) + if err != nil { + return nil, err + } + substrLen := K(len(table)) + if substrLen == 0 { + panic(errors.New("diskio.IndexAll: empty substring")) } - table := buildKMPTable(substr) - var matches []A - var curMatchBeg A - var curMatchLen int + var matches []K + var curMatchBeg K + var curMatchLen K - pos := A(-1) // if 'r' were a slice; define 'pos' such that 'chr=r[pos]' - for { - // I/O - var chr byte - chr, err := r.ReadByte() + for pos := K(0); ; pos++ { + chr, err := str.Get(pos) if err != nil { if errors.Is(err, io.EOF) { err = nil } return matches, err } - pos++ // Consider 'chr' - for curMatchLen > 0 && chr != substr[curMatchLen] { // shorten the match + for curMatchLen > 0 && chr != mustGet(substr, curMatchLen) { // shorten the match overlap := table[curMatchLen-1] - curMatchBeg += A(curMatchLen - overlap) + curMatchBeg += curMatchLen - overlap curMatchLen = overlap } - if chr == substr[curMatchLen] { // lengthen the match + if chr == mustGet(substr, curMatchLen) { // lengthen the match if curMatchLen == 0 { curMatchBeg = pos } curMatchLen++ - if curMatchLen == len(substr) { + if curMatchLen == substrLen { matches = append(matches, curMatchBeg) overlap := table[curMatchLen-1] - curMatchBeg += A(curMatchLen - overlap) + curMatchBeg += curMatchLen - overlap curMatchLen = overlap } } diff --git a/lib/diskio/kmp_test.go b/lib/diskio/kmp_test.go index 836605a..51c7b5e 100644 --- a/lib/diskio/kmp_test.go +++ b/lib/diskio/kmp_test.go @@ -9,17 +9,19 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestBuildKMPTable(t *testing.T) { - substr := []byte("ababaa") - table := buildKMPTable(substr) - assert.Equal(t, - []int{0, 0, 1, 2, 3, 1}, + substr := SliceSequence[int64, byte]([]byte("ababaa")) + table, err := buildKMPTable[int64, byte](substr) + require.NoError(t, err) + require.Equal(t, + []int64{0, 0, 1, 2, 3, 1}, table) for j, val := range table { matchLen := j + 1 - assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen], + assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], "for table[%d]=%d", j, val) } } @@ -27,17 +29,18 @@ func TestBuildKMPTable(t *testing.T) { func FuzzBuildKMPTable(f *testing.F) { f.Add([]byte("ababaa")) f.Fuzz(func(t *testing.T, substr []byte) { - table := buildKMPTable(substr) - assert.Equal(t, len(substr), len(table), "length") + table, err := buildKMPTable[int64, byte](SliceSequence[int64, byte](substr)) + require.NoError(t, err) + require.Equal(t, len(substr), len(table), "length") for j, val := range table { matchLen := j + 1 - assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen], + assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen], "for table[%d]=%d", j, val) } }) } -func NaiveFindAll(str, substr []byte) []int64 { +func NaiveIndexAll(str, substr []byte) []int64 { var matches []int64 for i := range str { if bytes.HasPrefix(str[i:], substr) { @@ -47,15 +50,17 @@ func NaiveFindAll(str, substr []byte) []int64 { return matches } -func FuzzFindAll(f *testing.F) { +func FuzzIndexAll(f *testing.F) { f.Fuzz(func(t *testing.T, str, substr []byte) { if len(substr) == 0 { t.Skip() } t.Logf("str =%q", str) t.Logf("substr=%q", substr) - exp := NaiveFindAll(str, substr) - act, err := FindAll[int64](bytes.NewReader(str), substr) + exp := NaiveIndexAll(str, substr) + act, err := IndexAll[int64, byte]( + &ByteReaderSequence[int64]{R: bytes.NewReader(str)}, + SliceSequence[int64, byte](substr)) assert.NoError(t, err) assert.Equal(t, exp, act) }) diff --git a/lib/diskio/seq.go b/lib/diskio/seq.go new file mode 100644 index 0000000..3c5f4ae --- /dev/null +++ b/lib/diskio/seq.go @@ -0,0 +1,68 @@ +// Copyright (C) 2022 Luke Shumaker +// +// SPDX-License-Identifier: GPL-2.0-or-later + +package diskio + +import ( + "fmt" + "io" +) + +// interface ///////////////////////////////////////////////////////// + +type Sequence[K ~int64, V any] interface { + // Get the value at 'pos' in the sequence. Positions start at + // 0 and increment naturally. Return an error that is io.EOF + // if 'pos' is past the end of the sequence'. + Get(pos K) (V, error) +} + +// implementation: slice ///////////////////////////////////////////// + +type SliceSequence[K ~int64, V any] []V + +var _ Sequence[assertAddr, byte] = SliceSequence[assertAddr, byte]([]byte(nil)) + +func (s SliceSequence[K, V]) Get(i K) (V, error) { + if i >= K(len(s)) { + var v V + return v, io.EOF + } + return s[int(i)], nil +} + +// implementation: string //////////////////////////////////////////// + +type StringSequence[K ~int64] string + +var _ Sequence[assertAddr, byte] = StringSequence[assertAddr]("") + +func (s StringSequence[K]) Get(i K) (byte, error) { + if i >= K(len(s)) { + return 0, io.EOF + } + return s[int(i)], nil +} + +// implementation: io.ByteReader ///////////////////////////////////// + +type ByteReaderSequence[K ~int64] struct { + R io.ByteReader + pos K +} + +var _ Sequence[assertAddr, byte] = &ByteReaderSequence[assertAddr]{R: nil} + +func (s *ByteReaderSequence[K]) Get(i K) (byte, error) { + if i != s.pos { + return 0, fmt.Errorf("%T.Get(%v): can only call .Get(%v)", + s, i, s.pos) + } + chr, err := s.R.ReadByte() + if err != nil { + return chr, err + } + s.pos++ + return chr, nil +} diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d deleted file mode 100644 index 9d14adf..0000000 --- a/lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("1") diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 deleted file mode 100644 index 269f061..0000000 --- a/lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("0") diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 deleted file mode 100644 index b8f1562..0000000 --- a/lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("0") -[]byte("") diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 deleted file mode 100644 index be67506..0000000 --- a/lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("\xde\xdb!") -[]byte("\xde\xdb") diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d deleted file mode 100644 index c3bfa37..0000000 --- a/lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d +++ /dev/null @@ -1,3 +0,0 @@ -go test fuzz v1 -[]byte("\x10\x10\x15") -[]byte("\x10\x15") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d new file mode 100644 index 0000000..9d14adf --- /dev/null +++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("1") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 new file mode 100644 index 0000000..269f061 --- /dev/null +++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("0") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 new file mode 100644 index 0000000..b8f1562 --- /dev/null +++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("0") +[]byte("") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 new file mode 100644 index 0000000..be67506 --- /dev/null +++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\xde\xdb!") +[]byte("\xde\xdb") diff --git a/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d new file mode 100644 index 0000000..c3bfa37 --- /dev/null +++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\x10\x10\x15") +[]byte("\x10\x15") -- cgit v1.2.3-54-g00ecf