package main import ( "bufio" "fmt" "io" "os" "os/exec" "path/filepath" "runtime" "strconv" "strings" ) //#include import "C" var arg_max = int(C.sysconf(C._SC_ARG_MAX)) func errhandle(err error) { if err != nil { panic(err) } } func findLikelyDups(paths []string) map[string][]string { ret := map[string][]string{} var err error for i := range paths { paths[i], err = filepath.Abs(paths[i]) errhandle(err) } cmd := exec.Command("find", append(paths, "-type", "f", "-printf", "%s %p\\0")...) stdout, err := cmd.StdoutPipe() errhandle(err) cmd.Stderr = os.Stderr errhandle(cmd.Start()) rd := bufio.NewReader(stdout) for { line, err := rd.ReadString('\x00') if line == "" && err == io.EOF { break } errhandle(err) parts := strings.SplitN(strings.TrimSuffix(line, "\x00"), " ", 2) if len(parts) != 2 { panic("wut") } size := parts[0] filename := parts[1] basename := filepath.Base(filename) key := size + " " + basename ret[key] = append(ret[key], filename) } errhandle(cmd.Wait()) for key := range ret { if len(ret[key]) < 2 { delete(ret, key) } } return ret } func getFiemaps(paths []string) map[string][]string { ret := map[string][]string{} fmt.Fprintf(os.Stderr, "Getting fiemaps for %d files...\n", len(paths)) for len(paths) > 0 { _paths := paths arg_len := 0 for i := range _paths { arg_len += len(_paths[i])+1 if arg_len > arg_max/2 { _paths = _paths[:i-1] break } } paths = paths[len(_paths):] fmt.Fprintf(os.Stderr, " -> %d\n", len(_paths)) cmd := exec.Command("./cow-extent-map", append([]string{"-m", "--"}, _paths...)...) stdout, err := cmd.StdoutPipe() errhandle(err) cmd.Stderr = os.Stderr errhandle(cmd.Start()) rd := bufio.NewReader(stdout) for { filename, err := rd.ReadString('\x00') if filename == "" && err == io.EOF { break } filename = strings.TrimSuffix(filename, "\x00") if !strings.HasPrefix(filename, "/") { panic("ugly filename") } errhandle(err) fiemap, err := rd.ReadString('\x00') fiemap = strings.TrimSuffix(fiemap, "\x00") if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") { panic("ugly fiemap") } errhandle(err) ret[fiemap] = append(ret[fiemap], filename) } errhandle(cmd.Wait()) } fmt.Fprintf(os.Stderr, "...done\n") return ret } func getChecksums(paths []string) map[string][]string { ret := map[string][]string{} fmt.Fprintf(os.Stderr, "Generating checksums for %d files...\n", len(paths)) for len(paths) > 0 { _paths := paths arg_len := 0 for i := range _paths { arg_len += len(_paths[i])+1 if arg_len > arg_max/2 { _paths = _paths[:i-1] break } } paths = paths[len(_paths):] fmt.Fprintf(os.Stderr, " -> %d\n", len(_paths)) cmd := exec.Command("sha256sum", append([]string{"--"}, _paths...)...) stdout, err := cmd.StdoutPipe() errhandle(err) cmd.Stderr = os.Stderr errhandle(cmd.Start()) rd := bufio.NewReader(stdout) for { line, err := rd.ReadString('\n') if line == "" && err == io.EOF { break } errhandle(err) parts := strings.SplitN(strings.TrimSuffix(line, "\n"), " ", 2) if len(parts) != 2 { panic("wut") } checksum := parts[0] filename := strings.TrimPrefix(parts[1], " ") ret[checksum] = append(ret[checksum], filename) } errhandle(cmd.Wait()) } fmt.Fprintf(os.Stderr, "...done\n") return ret } func main() { // we have no parallelism, don't let syscalls fan-out weird on // many-core systems runtime.GOMAXPROCS(1) likely := findLikelyDups(os.Args[1:]) var flatLikely []string for _, filenames := range likely { flatLikely = append(flatLikely, filenames...) } fiemap2filenames := getFiemaps(flatLikely) filename2fiemap := map[string]string{} for fiemap, filenames := range fiemap2filenames { for _, filename := range filenames { if _, ok := filename2fiemap[filename]; ok { panic("not ok") } filename2fiemap[filename] = fiemap } } spanningFiles := make([]string, len(fiemap2filenames)) i := 0 for _, filenames := range fiemap2filenames { spanningFiles[i] = filenames[0] i++ } checksum2filenames := getChecksums(spanningFiles) checksum2fiemaps := map[string][]string{} for checksum, filenames := range checksum2filenames { for _, filename := range filenames { checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], filename2fiemap[filename]) } } for checksum, fiemaps := range checksum2fiemaps { if len(fiemaps) < 2 { delete(checksum2fiemaps, checksum) } } fmt.Fprintf(os.Stderr, "Deduplicating %d sets of files...\n", len(checksum2fiemaps)) for checksum, fiemaps := range checksum2fiemaps { // choose the fiemap with the fewest extents minFiemap := fiemaps[0] minFiemapLen := strings.Count(minFiemap, "\n") for _, fiemap := range fiemaps { fiemapLen := strings.Count(fiemap, "\n") if fiemapLen < minFiemapLen { minFiemap = fiemap minFiemapLen = fiemapLen } } srcFile := fiemap2filenames[minFiemap][0] var dupFiles []string for _, fiemap := range fiemaps { if fiemap == minFiemap { continue } dupFiles = append(dupFiles, fiemap2filenames[fiemap]...) } stat, err := os.Stat(srcFile) errhandle(err) args := []string{ "-r", "--", strconv.FormatInt(stat.Size(), 10), srcFile, "0", } for _, dupFile := range dupFiles { args = append(args, dupFile, "0") } cmd := exec.Command("./cow-dedupe-range", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr fmt.Println("#", checksum) fmt.Println(cmd.Args) errhandle(cmd.Run()) } }