// Command-line tool that finds files with identical content under the given
// paths and asks the helper program ./cow-dedupe-range to deduplicate them.
//
// The pipeline, as implemented in main:
//  1. Run ./cow-extent-map (via find) to group files by their extent map.
//  2. Pick one file per distinct extent map and bucket those files with
//     ./fastsum.
//  3. Re-bucket each non-trivial bucket with sha256sum.
//  4. For every bucket of identical files, pick a source extent map and
//     dedupe every file of the other extent maps against it.
package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"time"

	"lib/statusline"
)

// The cgo preamble must stay directly attached to the import "C" line.
// <unistd.h> declares sysconf, _SC_ARG_MAX, _SC_OPEN_MAX and isatty.

//#include <unistd.h>
import "C"

// arg_max and open_max are the system limits reported by sysconf for
// _SC_ARG_MAX and _SC_OPEN_MAX. They bound how many file names are handed to
// a single helper invocation.
var (
	arg_max  = int(C.sysconf(C._SC_ARG_MAX))
	open_max = int(C.sysconf(C._SC_OPEN_MAX))
)

// noCountLimit is the largest int; pass it to batchLen when only the byte
// budget should limit a batch.
const noCountLimit = int(^uint(0) >> 1)

// errhandle panics if err is non-nil.
func errhandle(err error) {
	if err != nil {
		panic(err)
	}
}

// myStatusLine is a factory for the status line appropriate to the
// environment, decided once at package initialization:
//   - stderr is a TTY: a rate-limited TTY status line with a stopwatch;
//   - NOTIFY_SOCKET is set: a rate-limited daemon status plus a log line
//     rate-limited to once a minute;
//   - neither: a plain log on stderr.
//
// When several outputs apply they are combined with statusline.Tee.
var myStatusLine = func() func() statusline.StatusLine {
	var outs []func() statusline.StatusLine

	if C.isatty(2) != 0 {
		fmt.Println("Output: TTY")
		outs = append(outs, func() statusline.StatusLine {
			return statusline.StopWatch(statusline.RateLimit(statusline.TTY(os.Stderr), time.Second/2), time.Second)
		})
	}
	if os.Getenv("NOTIFY_SOCKET") != "" {
		fmt.Println("Output: SD")
		outs = append(outs, func() statusline.StatusLine {
			return statusline.RateLimit(statusline.DaemonStatus(0), time.Second/2)
		})
		outs = append(outs, func() statusline.StatusLine {
			return statusline.RateLimit(statusline.Log(os.Stderr), time.Minute)
		})
	}
	if len(outs) == 0 {
		fmt.Println("Output: ERR")
		outs = append(outs, func() statusline.StatusLine {
			return statusline.Log(os.Stderr)
		})
	}

	if len(outs) == 1 {
		return outs[0]
	}
	return func() statusline.StatusLine {
		var sls []statusline.StatusLine
		for _, fn := range outs {
			sls = append(sls, fn())
		}
		return statusline.Tee(sls...)
	}
}()

// batchLen reports how many leading elements of items should go into one
// command invocation.
//
// Each element is charged len(item)+overhead bytes. The batch ends just
// before the first element that would push the running total above maxBytes,
// or once it already holds maxCount elements. The result is always at least 1
// for a non-empty items, so callers that slice off the batch are guaranteed
// to make progress even when a single element exceeds the budget; it is 0
// only when items is empty.
func batchLen(items []string, overhead, maxBytes, maxCount int) int {
	total := 0
	for i, item := range items {
		total += len(item) + overhead
		if total > maxBytes || i >= maxCount {
			if i == 0 {
				// A lone oversized element: send it by itself rather
				// than producing an empty batch.
				return 1
			}
			return i
		}
	}
	return len(items)
}

// getFiemaps runs
//
//	find PATHS... -type f -exec ./cow-extent-map -m -- {} +
//
// and parses its output, which is a sequence of NUL-terminated
// (filename, extent map) pairs. It returns a map from extent-map string to
// the absolute names of the files that reported that extent map.
//
// Every entry of paths is replaced in place by its absolute form. Any I/O or
// format error panics; the exit status of find itself is ignored.
func getFiemaps(paths []string) map[string][]string {
	var err error
	for i := range paths {
		paths[i], err = filepath.Abs(paths[i])
		errhandle(err)
	}

	ret := map[string][]string{}

	sl := myStatusLine()
	cnt := 0
	sl.Put("Mapping extents...")

	cmd := exec.Command("find", append(paths, "-type", "f", "-exec", "./cow-extent-map", "-m", "--", "{}", "+")...)
	stdout, err := cmd.StdoutPipe()
	errhandle(err)
	cmd.Stderr = os.Stderr
	errhandle(cmd.Start())

	rd := bufio.NewReader(stdout)
	for {
		// read filename
		filename, err := rd.ReadString('\x00')
		if filename == "" && err == io.EOF {
			break
		}
		errhandle(err)
		filename = strings.TrimSuffix(filename, "\x00")
		if !strings.HasPrefix(filename, "/") {
			panic("ugly filename")
		}

		// read extents
		fiemap, err := rd.ReadString('\x00')
		errhandle(err)
		fiemap = strings.TrimSuffix(fiemap, "\x00")
		if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") {
			panic("ugly fiemap")
		}

		// do stuff
		ret[fiemap] = append(ret[fiemap], filename)
		cnt++
		sl.Put(fmt.Sprintf("Mapping extents... %d", cnt))
	}
	// The exit status is deliberately not treated as fatal (the original
	// had the errhandle call commented out).
	_ = cmd.Wait()

	sl.Put(fmt.Sprintf("Mapping extents... done; mapped %d files", cnt))
	sl.End(true)
	return ret
}

// getChecksums runs basecmd over paths and groups the file names by the
// checksum the command prints for them.
//
// basecmd[0] is the program and basecmd[1:] its leading arguments; the paths
// are appended, split over as many invocations as needed to keep each
// argument list within half of arg_max. Each output line must be
// "<checksum> <filename>"; one extra leading space before the file name is
// stripped. A line without a space panics.
//
// slfmt is a fmt format string with two %v verbs; it is rendered to sl with
// (files done, total files) while running and with ("done", files done) at
// the end. The exit status of the command is ignored.
func getChecksums(sl statusline.StatusLine, slfmt string, basecmd []string, paths []string) map[string][]string {
	ret := map[string][]string{}

	cnt := 0
	sl.Put(fmt.Sprintf(slfmt, cnt, len(paths)))

	pathsTodo := paths
	for len(pathsTodo) > 0 {
		// +1 accounts for the NUL terminator of each argument.
		n := batchLen(pathsTodo, 1, arg_max/2, noCountLimit)
		pathsDoing := pathsTodo[:n]
		pathsTodo = pathsTodo[n:]

		// Build the argument list in a fresh slice so the caller's
		// basecmd backing array is never written to.
		args := make([]string, 0, len(basecmd)-1+len(pathsDoing))
		args = append(args, basecmd[1:]...)
		args = append(args, pathsDoing...)

		cmd := exec.Command(basecmd[0], args...)
		stdout, err := cmd.StdoutPipe()
		errhandle(err)
		cmd.Stderr = os.Stderr
		errhandle(cmd.Start())

		rd := bufio.NewReader(stdout)
		for {
			line, err := rd.ReadString('\n')
			if line == "" && err == io.EOF {
				break
			}
			errhandle(err)
			parts := strings.SplitN(strings.TrimSuffix(line, "\n"), " ", 2)
			if len(parts) != 2 {
				panic("wut")
			}
			checksum := parts[0]
			filename := strings.TrimPrefix(parts[1], " ")

			ret[checksum] = append(ret[checksum], filename)
			cnt++
			sl.Put(fmt.Sprintf(slfmt, cnt, len(paths)))
		}
		// The exit status is deliberately not treated as fatal (the
		// original had the errhandle call commented out).
		_ = cmd.Wait()
	}

	sl.Put(fmt.Sprintf(slfmt, "done", cnt))
	return ret
}

// pruneSingles deletes every key of key2vals that has fewer than two values
// and returns the number of keys removed.
func pruneSingles(key2vals map[string][]string) int {
	n := 0
	for key, vals := range key2vals {
		if len(vals) < 2 {
			delete(key2vals, key)
			n++
		}
	}
	return n
}

// dedupe runs
//
//	./cow-dedupe-range -r -- SIZE srcFile 0 dupFile 0 [dupFile 0 ...]
//
// where SIZE is the size of srcFile, splitting dupFiles over several
// invocations so that each stays within half of arg_max and within the
// open-file limit.
//
// It returns the os.Stat error for srcFile, or otherwise the error of the
// last invocation that failed (nil if all succeeded). A failing batch does
// not stop later batches from running.
func dedupe(srcFile string, dupFiles []string) error {
	stat, err := os.Stat(srcFile)
	if err != nil {
		return err
	}

	baseArgs := []string{
		"-r", "--",
		strconv.FormatInt(stat.Size(), 10),
		srcFile, "0",
	}

	dupFilesTodo := dupFiles
	for len(dupFilesTodo) > 0 {
		// +3 per file: its NUL terminator plus the "0" offset argument
		// and that argument's terminator. The count limit leaves a
		// headroom of 15 descriptors below open_max (presumably for the
		// source file, stdio and slack).
		n := batchLen(dupFilesTodo, 3, arg_max/2, open_max-15)
		dupFilesDoing := dupFilesTodo[:n]
		dupFilesTodo = dupFilesTodo[n:]

		args := make([]string, 0, len(baseArgs)+2*len(dupFilesDoing))
		args = append(args, baseArgs...)
		for _, dupFile := range dupFilesDoing {
			args = append(args, dupFile, "0")
		}

		cmd := exec.Command("./cow-dedupe-range", args...)
		cmd.Stdout = os.Stdout
		cmd.Stderr = os.Stderr
		if runErr := cmd.Run(); runErr != nil {
			err = runErr
		}
	}
	return err
}

// fancyStatus holds the progress counters shown on the status line during
// the hashing/deduplication phase.
//
//	[  0s ] set[n/(d+e)]->c :: (summed=n deduped=n) :: verb[m/n]
type fancyStatus struct {
	// set describes progress over buckets: n processed, d size buckets,
	// p extra buckets introduced by strict hashing, m buckets pruned as
	// singletons, and size, the number of files in the current bucket.
	set struct{ n, d, p, m, size int }
	// summed is the number of files passed to sha256sum so far; errs is
	// the number of buckets whose dedupe reported an error.
	summed, errs int
	// deduped counts successful buckets (srcs) and the extent maps and
	// files deduplicated against them.
	deduped struct{ srcs, dstMaps, dstFiles int }
	// verb names the current step; it may contain fmt verbs when the
	// rendered status is itself used as a format string.
	verb string
}

// String renders the counters in the one-line status format.
func (s fancyStatus) String() string {
	return fmt.Sprintf("set[%d/(%d+%d-%d)]->%d :: summed=%d deduped={%dmaps:%dfiles->%d} errs=%d :: %s",
		s.set.n, s.set.d, s.set.p, s.set.m, s.set.size,
		s.summed,
		s.deduped.dstMaps+s.deduped.srcs, s.deduped.dstFiles+s.deduped.srcs, s.deduped.srcs,
		s.errs,
		s.verb)
}

func main() {
	// we have low parallelism, don't let syscalls fan-out weird
	// on many-core systems
	runtime.GOMAXPROCS(2)

	fiemap2filenames := getFiemaps(os.Args[1:])

	var sl statusline.StatusLine
	if C.isatty(2) != 0 {
		sl = statusline.StopWatch(statusline.TTY(os.Stderr), time.Second)
	} else {
		sl = statusline.Log(os.Stderr)
	}
	sl.Put("Building list of spanning files...")

	// Reverse index: file name -> extent map. A file reported twice is a
	// broken invariant.
	filename2fiemap := map[string]string{}
	for fiemap, filenames := range fiemap2filenames {
		for _, filename := range filenames {
			if _, ok := filename2fiemap[filename]; ok {
				panic("not ok")
			}
			filename2fiemap[filename] = fiemap
		}
	}

	// One representative file per distinct extent map; only these need to
	// be checksummed.
	spanningFiles := make([]string, len(fiemap2filenames))
	i := 0
	for _, filenames := range fiemap2filenames {
		spanningFiles[i] = filenames[0]
		i++
	}
	sl.Put(fmt.Sprintf("Building list of spanning files... done; %d files", len(spanningFiles)))
	sl.End(true)

	sl = myStatusLine()
	size2filenames := getChecksums(sl, "Getting sizes for files... %v/%v", []string{"./fastsum"}, spanningFiles)
	sl.End(true)
	fmt.Fprintf(os.Stderr, " -> %d sets", len(size2filenames))
	pruneSingles(size2filenames)
	fmt.Fprintf(os.Stderr, " -> %d non-trivial sets\n", len(size2filenames))

	sl = myStatusLine()
	var status fancyStatus
	status.set.d = len(size2filenames)
	for _, filenames := range size2filenames {
		status.set.size = len(filenames)

		// Now do strict hashing, instead of the incredibly
		// sloppy (but fast) size-bucketing.
		// The rendered status doubles as getChecksums' format string,
		// so the verb carries the two %v placeholders.
		status.verb = "sha256sum[%v/%v]"
		checksum2filenames := getChecksums(sl, status.String(), []string{"sha256sum", "--"}, filenames)
		status.summed += len(filenames)

		status.verb = "pruneSingles"
		sl.Put(status.String())
		status.set.p += len(checksum2filenames) - 1
		status.set.m += pruneSingles(checksum2filenames)

		// And loop over the smaller, precise buckets
		for _, filenames := range checksum2filenames {
			status.set.size = len(filenames)
			status.verb = "prep"
			sl.Put(status.String())

			var fiemaps []string
			for _, filename := range filenames {
				fiemaps = append(fiemaps, filename2fiemap[filename])
			}

			// Now we choose the fiemap with the fewest
			// extents and the most files
			minFiemap := fiemaps[0]
			minFiemapLen := strings.Count(minFiemap, "\n")
			minFiemapCnt := len(fiemap2filenames[minFiemap])
			for _, fiemap := range fiemaps {
				fiemapLen := strings.Count(fiemap, "\n")
				fiemapCnt := len(fiemap2filenames[fiemap])
				if fiemapLen < minFiemapLen || (fiemapLen == minFiemapLen && fiemapCnt > minFiemapCnt) {
					minFiemap = fiemap
					minFiemapLen = fiemapLen
					minFiemapCnt = fiemapCnt
				}
			}

			// Set srcFile and dupFiles based on that
			srcFile := fiemap2filenames[minFiemap][0]
			var dupFiles []string
			for _, fiemap := range fiemaps {
				if fiemap == minFiemap {
					continue
				}
				dupFiles = append(dupFiles, fiemap2filenames[fiemap]...)
			}

			// And actually dedupe those
			// The display name is truncated by bytes, not runes.
			name := filepath.Base(srcFile)
			if len(name) > 21 {
				name = name[:20] + "…"
			}
			status.verb = fmt.Sprintf("dedupe %q<-[%d]file{…}", name, len(dupFiles))
			sl.Put(status.String())
			err := dedupe(srcFile, dupFiles)
			if err != nil {
				status.errs++
			} else {
				status.deduped.srcs++
				status.deduped.dstMaps += len(fiemaps) - 1
				status.deduped.dstFiles += len(dupFiles)
			}
			status.set.n++
		}
	}
	status.verb = "done"
	sl.Put(status.String())
	sl.End(true)
}