summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@beefcake.parabola.nu> 2018-05-20 00:21:30 -0400
committer: Luke Shumaker <lukeshu@beefcake.parabola.nu> 2018-05-20 00:21:30 -0400
commit: 2c0d4c741d6da293446ce5d0f19a94e6ffe5a37a (patch)
tree: 3015456ecac97aa5c72a9891ed8f378f372fcec9
parent: c48f4a15241e013e77670bdaec024d16b10f9874 (diff)
cow-dedupe: optimize?
-rw-r--r-- go/src/cow-dedupe/dedupe.go 134
1 files changed, 77 insertions, 57 deletions
diff --git a/go/src/cow-dedupe/dedupe.go b/go/src/cow-dedupe/dedupe.go
index b70aeb2..0268ed3 100644
--- a/go/src/cow-dedupe/dedupe.go
+++ b/go/src/cow-dedupe/dedupe.go
@@ -80,12 +80,12 @@ func getFiemaps(paths []string) map[string][]string {
return ret
}
-func getChecksums(paths []string) map[string][]string {
+func getChecksums(cmd []string, paths []string) map[string][]string {
ret := map[string][]string{}
sl := myStatusLine()
cnt := 0
- sl.Put(fmt.Sprintf("Generating checksums for files... %d/%d", cnt, len(paths)))
+ sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths)))
pathsTodo := paths
for len(pathsTodo) > 0 {
@@ -100,7 +100,7 @@ func getChecksums(paths []string) map[string][]string {
}
pathsTodo = pathsTodo[len(pathsDoing):]
- cmd := exec.Command("sha256sum", append([]string{"--"}, pathsDoing...)...)
+ cmd := exec.Command(cmd[0], append(cmd[1:], pathsDoing...)...)
stdout, err := cmd.StdoutPipe()
errhandle(err)
cmd.Stderr = os.Stderr
@@ -121,16 +121,43 @@ func getChecksums(paths []string) map[string][]string {
ret[checksum] = append(ret[checksum], filename)
cnt++
- sl.Put(fmt.Sprintf("Generating checksums for files... %d/%d", cnt, len(paths)))
+ sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths)))
}
errhandle(cmd.Wait())
}
- sl.Put(fmt.Sprintf("Generating checksums for files... done; summed %d files", cnt))
+ sl.Put(fmt.Sprintf("Generating checksums (%v) for files... done; summed %d files", cmd, cnt))
sl.End(true)
return ret
}
+func pruneSingles(key2vals map[string][]string) {
+ for key, vals := range key2vals {
+ if len(vals) < 2 {
+ delete(key2vals, key)
+ }
+ }
+}
+
+func dedupe(srcFile string, dupFiles []string) error {
+ stat, err := os.Stat(srcFile)
+ errhandle(err)
+ args := []string{
+ "-r", "--", strconv.FormatInt(stat.Size(), 10),
+ srcFile, "0",
+ }
+ for _, dupFile := range dupFiles {
+ args = append(args, dupFile, "0")
+ }
+
+ cmd := exec.Command("./cow-dedupe-range", args...)
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ //fmt.Println("#", checksum)
+ //fmt.Println(cmd.Args)
+ return cmd.Run()
+}
+
func main() {
// we have low parallelism, don't let syscalls fan-out weird
// on many-core systems
@@ -161,63 +188,56 @@ func main() {
sl.Put(fmt.Sprintf("Building list of spanning files... done; %d files", len(spanningFiles)))
sl.End(true)
- checksum2filenames := getChecksums(spanningFiles)
-
- checksum2fiemaps := map[string][]string{}
- for checksum, filenames := range checksum2filenames {
- for _, filename := range filenames {
- checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], filename2fiemap[filename])
- }
- }
- for checksum, fiemaps := range checksum2fiemaps {
- if len(fiemaps) < 2 {
- delete(checksum2fiemaps, checksum)
- }
- }
+ size2filenames := getChecksums([]string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles)
+ fmt.Fprintf(" -> %d sets", len(size2filenames))
+ pruneSingles(size2filenames)
+ fmt.Fprintf(" -> %d non-trivial sets\n", len(size2filenames))
sl = myStatusLine()
- cnt := 0
- sl.Put(fmt.Sprintf("Deduplicating sets of files... %d/%d", cnt, len(checksum2fiemaps)))
- for checksum, fiemaps := range checksum2fiemaps {
- // choose the fiemap with the fewest extents
- minFiemap := fiemaps[0]
- minFiemapLen := strings.Count(minFiemap, "\n")
- for _, fiemap := range fiemaps {
- fiemapLen := strings.Count(fiemap, "\n")
- if fiemapLen < minFiemapLen {
- minFiemap = fiemap
- minFiemapLen = fiemapLen
+ sizeCnt := 0
+ for size, filenames := range size2filenames {
+ // The list of specific files in size2filenames isn't
+ // significant; they're just proxies for fiemaps.
+ sizeStatus := fmt.Sprintf("Working on size-set %d/%d of %d fiemaps",
+ sizeCnt, len(size2filenames), len(filenames))
+ sl.Put(sizeStatus)
+ // Now do strict hashing, instead of the incredibly
+ // sloppy (but fast) size-bucketing.
+ checksum2filenames := getChecksums([]string{"sha256sum", "--"}, filenames)
+ pruneSingles(checksum2filenames)
+ // And loop over the smaller, precise buckets
+ sumCnt := 0
+ for checksum, filenames := range checksum2fiemaps {
+ sl.Put(fmt.Sprintf("%s :: sha256-set %d/%d of %d fiemaps", sizeStatus, sumCnt, len(checksum2filenames), len(filenames)))
+ var fiemaps []string
+ for _, filename := range filenames {
+ fiemaps = append(fiemaps, filename2fiemap[filename])
}
- }
- srcFile := fiemap2filenames[minFiemap][0]
- var dupFiles []string
- for _, fiemap := range fiemaps {
- if fiemap == minFiemap {
- continue
+ // Now we choose the fiemap with the fewest extents
+ minFiemap := fiemaps[0]
+ minFiemapLen := strings.Count(minFiemap, "\n")
+ for _, fiemap := range fiemaps {
+ fiemapLen := strings.Count(fiemap, "\n")
+ if fiemapLen < minFiemapLen {
+ minFiemap = fiemap
+ minFiemapLen = fiemapLen
+ }
}
- dupFiles = append(dupFiles, fiemap2filenames[fiemap]...)
- }
-
- stat, err := os.Stat(srcFile)
- errhandle(err)
- args := []string{
- "-r", "--", strconv.FormatInt(stat.Size(), 10),
- srcFile, "0",
- }
- for _, dupFile := range dupFiles {
- args = append(args, dupFile, "0")
+ // Set srcFile and dupFiles based on that
+ srcFile := fiemap2filenames[minFiemap][0]
+ var dupFiles []string
+ for _, fiemap := range fiemaps {
+ if fiemap == minFiemap {
+ continue
+ }
+ dupFiles = append(dupFiles, fiemap2filenames[fiemap]...)
+ }
+ // And actually dedupe those
+ dedupe(srcFile, dupFiles) // XXX ignore error?
+ sumCnt++
}
-
- cmd := exec.Command("./cow-dedupe-range", args...)
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- fmt.Println("#", checksum)
- fmt.Println(cmd.Args)
- errhandle(cmd.Run())
-
- cnt++
- sl.Put(fmt.Sprintf("Deduplicating sets of files... %d/%d", cnt, len(checksum2fiemaps)))
+ sizeCnt++
}
- sl.Put(fmt.Sprintf("Deduplicating sets of files... done; deduplicated %d sets", cnt))
+ sl.Put(fmt.Sprintf("Working on size-set %d/%d... done", sizeCnt, len(size2filenames)))
sl.End(true)
}