From 2c0d4c741d6da293446ce5d0f19a94e6ffe5a37a Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sun, 20 May 2018 00:21:30 -0400 Subject: cow-dedupe: optimize? --- go/src/cow-dedupe/dedupe.go | 134 +++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 57 deletions(-) diff --git a/go/src/cow-dedupe/dedupe.go b/go/src/cow-dedupe/dedupe.go index b70aeb2..0268ed3 100644 --- a/go/src/cow-dedupe/dedupe.go +++ b/go/src/cow-dedupe/dedupe.go @@ -80,12 +80,12 @@ func getFiemaps(paths []string) map[string][]string { return ret } -func getChecksums(paths []string) map[string][]string { +func getChecksums(cmd []string, paths []string) map[string][]string { ret := map[string][]string{} sl := myStatusLine() cnt := 0 - sl.Put(fmt.Sprintf("Generating checksums for files... %d/%d", cnt, len(paths))) + sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths))) pathsTodo := paths for len(pathsTodo) > 0 { @@ -100,7 +100,7 @@ func getChecksums(paths []string) map[string][]string { } pathsTodo = pathsTodo[len(pathsDoing):] - cmd := exec.Command("sha256sum", append([]string{"--"}, pathsDoing...)...) + cmd := exec.Command(cmd[0], append(cmd[1:], pathsDoing...)...) stdout, err := cmd.StdoutPipe() errhandle(err) cmd.Stderr = os.Stderr @@ -121,16 +121,43 @@ func getChecksums(paths []string) map[string][]string { ret[checksum] = append(ret[checksum], filename) cnt++ - sl.Put(fmt.Sprintf("Generating checksums for files... %d/%d", cnt, len(paths))) + sl.Put(fmt.Sprintf("Generating checksums (%v) for files... %d/%d", cmd, cnt, len(paths))) } errhandle(cmd.Wait()) } - sl.Put(fmt.Sprintf("Generating checksums for files... done; summed %d files", cnt)) + sl.Put(fmt.Sprintf("Generating checksums (%v) for files... 
done; summed %d files", cmd, cnt)) sl.End(true) return ret } +func pruneSingles(key2vals map[string][]string) { + for key, vals := range key2vals { + if len(vals) < 2 { + delete(key2vals, key) + } + } +} + +func dedupe(srcFile string, dupFiles []string) error { + stat, err := os.Stat(srcFile) + errhandle(err) + args := []string{ + "-r", "--", strconv.FormatInt(stat.Size(), 10), + srcFile, "0", + } + for _, dupFile := range dupFiles { + args = append(args, dupFile, "0") + } + + cmd := exec.Command("./cow-dedupe-range", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + //fmt.Println("#", checksum) + //fmt.Println(cmd.Args) + return cmd.Run() +} + func main() { // we have low parallelism, don't let syscalls fan-out weird // on many-core systems @@ -161,63 +188,56 @@ func main() { sl.Put(fmt.Sprintf("Building list of spanning files... done; %d files", len(spanningFiles))) sl.End(true) - checksum2filenames := getChecksums(spanningFiles) - - checksum2fiemaps := map[string][]string{} - for checksum, filenames := range checksum2filenames { - for _, filename := range filenames { - checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], filename2fiemap[filename]) - } - } - for checksum, fiemaps := range checksum2fiemaps { - if len(fiemaps) < 2 { - delete(checksum2fiemaps, checksum) - } - } + size2filenames := getChecksums([]string{"stat", "--printf=%s %n\\n", "--"}, spanningFiles) + fmt.Fprintf(" -> %d sets", len(size2filenames)) + pruneSingles(size2filenames) + fmt.Fprintf(" -> %d non-trivial sets\n", len(size2filenames)) sl = myStatusLine() - cnt := 0 - sl.Put(fmt.Sprintf("Deduplicating sets of files... 
%d/%d", cnt, len(checksum2fiemaps))) - for checksum, fiemaps := range checksum2fiemaps { - // choose the fiemap with the fewest extents - minFiemap := fiemaps[0] - minFiemapLen := strings.Count(minFiemap, "\n") - for _, fiemap := range fiemaps { - fiemapLen := strings.Count(fiemap, "\n") - if fiemapLen < minFiemapLen { - minFiemap = fiemap - minFiemapLen = fiemapLen + sizeCnt := 0 + for size, filenames := range size2filenames { + // The list of specific files in size2filenames isn't + // significant; they're just proxies for fiemaps. + sizeStatus := fmt.Sprintf("Working on size-set %d/%d of %d fiemaps", + sizeCnt, len(size2filenames), len(filenames)) + sl.Put(sizeStatus) + // Now do strict hashing, instead of the incredibly + // sloppy (but fast) size-bucketing. + checksum2filenames := getChecksums([]string{"sha256sum", "--"}, filenames) + pruneSingles(checksum2filenames) + // And loop over the smaller, precise buckets + sumCnt := 0 + for checksum, filenames := range checksum2fiemaps { + sl.Put(fmt.Sprintf("%s :: sha256-set %d/%d of %d fiemaps", sizeStatus, sumCnt, len(checksum2filenames), len(filenames))) + var fiemaps []string + for _, filename := range filenames { + fiemaps = append(fiemaps, filename2fiemap[filename]) } - } - srcFile := fiemap2filenames[minFiemap][0] - var dupFiles []string - for _, fiemap := range fiemaps { - if fiemap == minFiemap { - continue + // Now we choose the fiemap with the fewest extents + minFiemap := fiemaps[0] + minFiemapLen := strings.Count(minFiemap, "\n") + for _, fiemap := range fiemaps { + fiemapLen := strings.Count(fiemap, "\n") + if fiemapLen < minFiemapLen { + minFiemap = fiemap + minFiemapLen = fiemapLen + } } - dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) 
- } - - stat, err := os.Stat(srcFile) - errhandle(err) - args := []string{ - "-r", "--", strconv.FormatInt(stat.Size(), 10), - srcFile, "0", - } - for _, dupFile := range dupFiles { - args = append(args, dupFile, "0") + // Set srcFile and dupFiles based on that + srcFile := fiemap2filenames[minFiemap][0] + var dupFiles []string + for _, fiemap := range fiemaps { + if fiemap == minFiemap { + continue + } + dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) + } + // And actually dedupe those + dedupe(srcFile, dupFiles) // XXX ignore error? + sumCnt++ } - - cmd := exec.Command("./cow-dedupe-range", args...) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - fmt.Println("#", checksum) - fmt.Println(cmd.Args) - errhandle(cmd.Run()) - - cnt++ - sl.Put(fmt.Sprintf("Deduplicating sets of files... %d/%d", cnt, len(checksum2fiemaps))) + sizeCnt++ } - sl.Put(fmt.Sprintf("Deduplicating sets of files... done; deduplicated %d sets", cnt)) + sl.Put(fmt.Sprintf("Working on size-set %d/%d... done", sizeCnt, len(size2filenames))) sl.End(true) } -- cgit v1.2.3