summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2022-07-16 14:53:11 -0600
committerLuke Shumaker <lukeshu@lukeshu.com>2022-07-16 16:11:23 -0600
commit5889f1fa2818f34025ca6e2feecb26928c6e6341 (patch)
treed1f11679288826bd1d02093f61c30fd918ee0bf3 /lib
parent259aecbdc2e22836a6e75011549503795a22536f (diff)
Re-jigger the KMP search to be more independent of the underlying data types
Diffstat (limited to 'lib')
-rw-r--r--lib/diskio/kmp.go79
-rw-r--r--lib/diskio/kmp_test.go29
-rw-r--r--lib/diskio/seq.go68
-rw-r--r--lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d (renamed from lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d)0
-rw-r--r--lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 (renamed from lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600)0
-rw-r--r--lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 (renamed from lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60)0
-rw-r--r--lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 (renamed from lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738)0
-rw-r--r--lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d (renamed from lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d)0
8 files changed, 137 insertions, 39 deletions
diff --git a/lib/diskio/kmp.go b/lib/diskio/kmp.go
index 69a3a51..da19e81 100644
--- a/lib/diskio/kmp.go
+++ b/lib/diskio/kmp.go
@@ -9,12 +9,31 @@ import (
"io"
)
+func mustGet[K ~int64, V any](seq Sequence[K, V], i K) V {
+ val, err := seq.Get(i)
+ if err != nil {
+ panic(err)
+ }
+ return val
+}
+
// buildKMPTable takes the string 'substr', and returns a table such
// that 'table[matchLen-1]' is the largest value 'val' for which 'val < matchLen' and
// 'substr[:val] == substr[matchLen-val:matchLen]'.
-func buildKMPTable(substr []byte) []int {
- table := make([]int, len(substr))
- for j := range table {
+func buildKMPTable[K ~int64, V comparable](substr Sequence[K, V]) ([]K, error) {
+ var substrLen K
+ for {
+ if _, err := substr.Get(substrLen); err != nil {
+ if errors.Is(err, io.EOF) {
+ break
+ }
+ return nil, err
+ }
+ substrLen++
+ }
+
+ table := make([]K, substrLen)
+ for j := K(0); j < substrLen; j++ {
if j == 0 {
// First entry must always be 0 (in order to
// satisfy 'val < matchLen').
@@ -22,62 +41,68 @@ func buildKMPTable(substr []byte) []int {
}
val := table[j-1]
// not a match; go back
- for val > 0 && substr[j] != substr[val] {
+ for val > 0 && mustGet(substr, j) != mustGet(substr, val) {
val = table[val-1]
}
// is a match; go forward
- if substr[val] == substr[j] {
+ if mustGet(substr, val) == mustGet(substr, j) {
val++
}
table[j] = val
}
- return table
+ return table, nil
}
-// FindAll returns the starting-position of all possibly-overlapping
-// occurances of 'substr' in the 'r' stream.
+// IndexAll returns the starting-position of all possibly-overlapping
+// occurances of 'substr' in the 'str' sequence.
//
-// Will panic if len(substr)==0.
+// Will hop around in 'substr', but will only get the natural sequence
+// [0...) in order from 'str'. When hopping around in 'substr' it
+// assumes that once it has gotten a given index without error, it can
+// continue to do so without error; errors appearing later will cause
+// panics.
+//
+// Will panic if the length of 'substr' is 0.
//
// Uses the Knuth-Morris-Pratt algorithm.
-func FindAll[A ~int64](r io.ByteReader, substr []byte) ([]A, error) {
- if len(substr) == 0 {
- panic(errors.New("diskio.FindAll: empty substring"))
+func IndexAll[K ~int64, V comparable](str, substr Sequence[K, V]) ([]K, error) {
+ table, err := buildKMPTable(substr)
+ if err != nil {
+ return nil, err
+ }
+ substrLen := K(len(table))
+ if substrLen == 0 {
+ panic(errors.New("diskio.IndexAll: empty substring"))
}
- table := buildKMPTable(substr)
- var matches []A
- var curMatchBeg A
- var curMatchLen int
+ var matches []K
+ var curMatchBeg K
+ var curMatchLen K
- pos := A(-1) // if 'r' were a slice; define 'pos' such that 'chr=r[pos]'
- for {
- // I/O
- var chr byte
- chr, err := r.ReadByte()
+ for pos := K(0); ; pos++ {
+ chr, err := str.Get(pos)
if err != nil {
if errors.Is(err, io.EOF) {
err = nil
}
return matches, err
}
- pos++
// Consider 'chr'
- for curMatchLen > 0 && chr != substr[curMatchLen] { // shorten the match
+ for curMatchLen > 0 && chr != mustGet(substr, curMatchLen) { // shorten the match
overlap := table[curMatchLen-1]
- curMatchBeg += A(curMatchLen - overlap)
+ curMatchBeg += curMatchLen - overlap
curMatchLen = overlap
}
- if chr == substr[curMatchLen] { // lengthen the match
+ if chr == mustGet(substr, curMatchLen) { // lengthen the match
if curMatchLen == 0 {
curMatchBeg = pos
}
curMatchLen++
- if curMatchLen == len(substr) {
+ if curMatchLen == substrLen {
matches = append(matches, curMatchBeg)
overlap := table[curMatchLen-1]
- curMatchBeg += A(curMatchLen - overlap)
+ curMatchBeg += curMatchLen - overlap
curMatchLen = overlap
}
}
diff --git a/lib/diskio/kmp_test.go b/lib/diskio/kmp_test.go
index 836605a..51c7b5e 100644
--- a/lib/diskio/kmp_test.go
+++ b/lib/diskio/kmp_test.go
@@ -9,17 +9,19 @@ import (
"testing"
"github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
)
func TestBuildKMPTable(t *testing.T) {
- substr := []byte("ababaa")
- table := buildKMPTable(substr)
- assert.Equal(t,
- []int{0, 0, 1, 2, 3, 1},
+ substr := SliceSequence[int64, byte]([]byte("ababaa"))
+ table, err := buildKMPTable[int64, byte](substr)
+ require.NoError(t, err)
+ require.Equal(t,
+ []int64{0, 0, 1, 2, 3, 1},
table)
for j, val := range table {
matchLen := j + 1
- assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen],
+ assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen],
"for table[%d]=%d", j, val)
}
}
@@ -27,17 +29,18 @@ func TestBuildKMPTable(t *testing.T) {
func FuzzBuildKMPTable(f *testing.F) {
f.Add([]byte("ababaa"))
f.Fuzz(func(t *testing.T, substr []byte) {
- table := buildKMPTable(substr)
- assert.Equal(t, len(substr), len(table), "length")
+ table, err := buildKMPTable[int64, byte](SliceSequence[int64, byte](substr))
+ require.NoError(t, err)
+ require.Equal(t, len(substr), len(table), "length")
for j, val := range table {
matchLen := j + 1
- assert.Equalf(t, substr[:val], substr[matchLen-val:matchLen],
+ assert.Equalf(t, substr[:val], substr[matchLen-int(val):matchLen],
"for table[%d]=%d", j, val)
}
})
}
-func NaiveFindAll(str, substr []byte) []int64 {
+func NaiveIndexAll(str, substr []byte) []int64 {
var matches []int64
for i := range str {
if bytes.HasPrefix(str[i:], substr) {
@@ -47,15 +50,17 @@ func NaiveFindAll(str, substr []byte) []int64 {
return matches
}
-func FuzzFindAll(f *testing.F) {
+func FuzzIndexAll(f *testing.F) {
f.Fuzz(func(t *testing.T, str, substr []byte) {
if len(substr) == 0 {
t.Skip()
}
t.Logf("str =%q", str)
t.Logf("substr=%q", substr)
- exp := NaiveFindAll(str, substr)
- act, err := FindAll[int64](bytes.NewReader(str), substr)
+ exp := NaiveIndexAll(str, substr)
+ act, err := IndexAll[int64, byte](
+ &ByteReaderSequence[int64]{R: bytes.NewReader(str)},
+ SliceSequence[int64, byte](substr))
assert.NoError(t, err)
assert.Equal(t, exp, act)
})
diff --git a/lib/diskio/seq.go b/lib/diskio/seq.go
new file mode 100644
index 0000000..3c5f4ae
--- /dev/null
+++ b/lib/diskio/seq.go
@@ -0,0 +1,68 @@
+// Copyright (C) 2022 Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package diskio
+
+import (
+ "fmt"
+ "io"
+)
+
+// interface /////////////////////////////////////////////////////////
+
+type Sequence[K ~int64, V any] interface {
+ // Get the value at 'pos' in the sequence. Positions start at
+ // 0 and increment naturally. Return an error that is io.EOF
+ // if 'pos' is past the end of the sequence'.
+ Get(pos K) (V, error)
+}
+
+// implementation: slice /////////////////////////////////////////////
+
+type SliceSequence[K ~int64, V any] []V
+
+var _ Sequence[assertAddr, byte] = SliceSequence[assertAddr, byte]([]byte(nil))
+
+func (s SliceSequence[K, V]) Get(i K) (V, error) {
+ if i >= K(len(s)) {
+ var v V
+ return v, io.EOF
+ }
+ return s[int(i)], nil
+}
+
+// implementation: string ////////////////////////////////////////////
+
+type StringSequence[K ~int64] string
+
+var _ Sequence[assertAddr, byte] = StringSequence[assertAddr]("")
+
+func (s StringSequence[K]) Get(i K) (byte, error) {
+ if i >= K(len(s)) {
+ return 0, io.EOF
+ }
+ return s[int(i)], nil
+}
+
+// implementation: io.ByteReader /////////////////////////////////////
+
+type ByteReaderSequence[K ~int64] struct {
+ R io.ByteReader
+ pos K
+}
+
+var _ Sequence[assertAddr, byte] = &ByteReaderSequence[assertAddr]{R: nil}
+
+func (s *ByteReaderSequence[K]) Get(i K) (byte, error) {
+ if i != s.pos {
+ return 0, fmt.Errorf("%T.Get(%v): can only call .Get(%v)",
+ s, i, s.pos)
+ }
+ chr, err := s.R.ReadByte()
+ if err != nil {
+ return chr, err
+ }
+ s.pos++
+ return chr, nil
+}
diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d b/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d
index 9d14adf..9d14adf 100644
--- a/lib/diskio/testdata/fuzz/FuzzFindAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d
+++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/593a5dd328ee86bac6820e3f98383aebadd2af3a22aa248546a228b08451c30d
diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600 b/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600
index 269f061..269f061 100644
--- a/lib/diskio/testdata/fuzz/FuzzFindAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600
+++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/6bd9babbebf7eb78814e1f3425bfa6ccd6bfd42824a26e8cbd63b5117934e600
diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60 b/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60
index b8f1562..b8f1562 100644
--- a/lib/diskio/testdata/fuzz/FuzzFindAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60
+++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/84ed65595ad05a58e293dbf423c1a816b697e2763a29d7c37aa476d6eef6fd60
diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738 b/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738
index be67506..be67506 100644
--- a/lib/diskio/testdata/fuzz/FuzzFindAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738
+++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/9be40f71bc49b1b5c3b8e08d58d0f69cc22aefb12fd9b5931e49ff0e41953738
diff --git a/lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d b/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d
index c3bfa37..c3bfa37 100644
--- a/lib/diskio/testdata/fuzz/FuzzFindAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d
+++ b/lib/diskio/testdata/fuzz/FuzzIndexAll/e43317ec61b0da9627d7b2fc0237d369082a6bf5dfe39e05c113b42ff6218d5d