summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- dedupe.go 241
1 files changed, 241 insertions, 0 deletions
diff --git a/dedupe.go b/dedupe.go
new file mode 100644
index 0000000..aaeae2e
--- /dev/null
+++ b/dedupe.go
@@ -0,0 +1,241 @@
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "runtime"
+ "strconv"
+ "strings"
+)
+
+//#include <unistd.h>
+import "C"
+
+var arg_max = int(C.sysconf(C._SC_ARG_MAX)) // sysconf(3) value for _SC_ARG_MAX; getFiemaps and getChecksums compare accumulated path lengths against half of it when batching command-line arguments
+
+// errhandle panics with err when err is non-nil and does nothing otherwise.
+// It is the single error-handling policy of this program: any failure aborts.
+func errhandle(err error) {
+	if err == nil {
+		return
+	}
+	panic(err)
+}
+
+func findLikelyDups(paths []string) map[string][]string {
+ ret := map[string][]string{}
+ var err error
+ for i := range paths {
+ paths[i], err = filepath.Abs(paths[i])
+ errhandle(err)
+ }
+ cmd := exec.Command("find", append(paths, "-type", "f", "-printf", "%s %p\\0")...)
+ stdout, err := cmd.StdoutPipe()
+ errhandle(err)
+ cmd.Stderr = os.Stderr
+ errhandle(cmd.Start())
+ rd := bufio.NewReader(stdout)
+ for {
+ line, err := rd.ReadString('\x00')
+ if line == "" && err == io.EOF {
+ break
+ }
+ errhandle(err)
+ parts := strings.SplitN(strings.TrimSuffix(line, "\x00"), " ", 2)
+ if len(parts) != 2 {
+ panic("wut")
+ }
+ size := parts[0]
+ filename := parts[1]
+ basename := filepath.Base(filename)
+ key := size + " " + basename
+ ret[key] = append(ret[key], filename)
+ }
+ errhandle(cmd.Wait())
+ for key := range ret {
+ if len(ret[key]) < 2 {
+ delete(ret, key)
+ }
+ }
+ return ret
+}
+
+// getFiemaps runs ./cow-extent-map over paths and groups the files by the
+// fiemap string the tool reports for each of them.
+//
+// The tool is invoked as "./cow-extent-map -m -- FILE...", in as many batches
+// as needed to keep the file arguments of one invocation within half of
+// arg_max. Its output is read as a sequence of NUL-terminated
+// (filename, fiemap) pairs; a filename must start with "/" and a fiemap must
+// be empty or start with "logical=".
+//
+// The result maps each distinct fiemap to the filenames that reported it.
+// Progress is logged to stderr. Any error or malformed record panics.
+func getFiemaps(paths []string) map[string][]string {
+	ret := map[string][]string{}
+	fmt.Fprintf(os.Stderr, "Getting fiemaps for %d files...\n", len(paths))
+
+	for len(paths) > 0 {
+		// Take the longest prefix of paths whose total length (plus one
+		// byte per path for its terminator) stays within half of arg_max.
+		// At least one path is always taken, so a single over-long path or
+		// an unusable arg_max can neither stall this loop nor produce a
+		// negative slice bound.
+		n, argLen := 0, 0
+		for n < len(paths) {
+			argLen += len(paths[n]) + 1
+			if n > 0 && argLen > arg_max/2 {
+				break
+			}
+			n++
+		}
+		batch := paths[:n]
+		paths = paths[n:]
+		fmt.Fprintf(os.Stderr, " -> %d\n", len(batch))
+
+		cmd := exec.Command("./cow-extent-map", append([]string{"-m", "--"}, batch...)...)
+		stdout, err := cmd.StdoutPipe()
+		errhandle(err)
+		cmd.Stderr = os.Stderr
+		errhandle(cmd.Start())
+
+		rd := bufio.NewReader(stdout)
+		for {
+			// First field of the pair: the file name.
+			filename, err := rd.ReadString('\x00')
+			if filename == "" && err == io.EOF {
+				break
+			}
+			filename = strings.TrimSuffix(filename, "\x00")
+			if !strings.HasPrefix(filename, "/") {
+				panic("ugly filename")
+			}
+			errhandle(err)
+
+			// Second field: the fiemap text for that file (may be empty).
+			fiemap, err := rd.ReadString('\x00')
+			fiemap = strings.TrimSuffix(fiemap, "\x00")
+			if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") {
+				panic("ugly fiemap")
+			}
+			errhandle(err)
+
+			ret[fiemap] = append(ret[fiemap], filename)
+		}
+		// The pipe must be drained before Wait, which closes it.
+		errhandle(cmd.Wait())
+	}
+
+	fmt.Fprintf(os.Stderr, "...done\n")
+	return ret
+}
+
+// getChecksums runs sha256sum over paths and groups the files by digest.
+//
+// sha256sum is invoked as "sha256sum -- FILE...", in as many batches as
+// needed to keep the file arguments of one invocation within half of
+// arg_max. Each output line is split at the first space into the digest and
+// the file name (one further leading space is stripped from the name).
+//
+// When a file name contains a backslash, newline or carriage return,
+// sha256sum starts the line with a backslash and escapes those characters in
+// the name. Such lines are decoded here, so the returned names are always
+// the real file names and the digest never carries the marker.
+//
+// The result maps each digest to the files that have it. Progress is logged
+// to stderr. Any error or malformed line panics.
+func getChecksums(paths []string) map[string][]string {
+	ret := map[string][]string{}
+	fmt.Fprintf(os.Stderr, "Generating checksums for %d files...\n", len(paths))
+
+	// Reverses sha256sum's file-name escaping. The replacer scans left to
+	// right, so an escaped backslash followed by 'n' is decoded correctly.
+	unescape := strings.NewReplacer(`\\`, `\`, `\n`, "\n", `\r`, "\r")
+
+	for len(paths) > 0 {
+		// Take the longest prefix of paths whose total length (plus one
+		// byte per path for its terminator) stays within half of arg_max.
+		// At least one path is always taken, so a single over-long path or
+		// an unusable arg_max can neither stall this loop nor produce a
+		// negative slice bound.
+		n, argLen := 0, 0
+		for n < len(paths) {
+			argLen += len(paths[n]) + 1
+			if n > 0 && argLen > arg_max/2 {
+				break
+			}
+			n++
+		}
+		batch := paths[:n]
+		paths = paths[n:]
+		fmt.Fprintf(os.Stderr, " -> %d\n", len(batch))
+
+		cmd := exec.Command("sha256sum", append([]string{"--"}, batch...)...)
+		stdout, err := cmd.StdoutPipe()
+		errhandle(err)
+		cmd.Stderr = os.Stderr
+		errhandle(cmd.Start())
+
+		rd := bufio.NewReader(stdout)
+		for {
+			line, err := rd.ReadString('\n')
+			if line == "" && err == io.EOF {
+				break
+			}
+			errhandle(err)
+			line = strings.TrimSuffix(line, "\n")
+
+			// A leading backslash marks a line whose file name is escaped.
+			escaped := strings.HasPrefix(line, `\`)
+			if escaped {
+				line = line[1:]
+			}
+
+			parts := strings.SplitN(line, " ", 2)
+			if len(parts) != 2 {
+				panic("wut")
+			}
+			checksum := parts[0]
+			filename := strings.TrimPrefix(parts[1], " ")
+			if escaped {
+				filename = unescape.Replace(filename)
+			}
+
+			ret[checksum] = append(ret[checksum], filename)
+		}
+		// The pipe must be drained before Wait, which closes it.
+		errhandle(cmd.Wait())
+	}
+
+	fmt.Fprintf(os.Stderr, "...done\n")
+	return ret
+}
+
+// main looks for files with identical content below the paths given on the
+// command line and passes each group of them to ./cow-dedupe-range.
+//
+// Steps:
+//  1. findLikelyDups collects candidate files.
+//  2. getFiemaps groups the candidates by fiemap; one file per distinct
+//     fiemap is kept as its representative.
+//  3. getChecksums hashes the representatives.
+//  4. For every checksum shared by at least two fiemaps, ./cow-dedupe-range
+//     is run as "-r -- SIZE SRC 0 DUP 0 ...", where SRC is the first file of
+//     the fiemap with the fewest lines and the DUPs are all files of the
+//     other fiemaps.
+//
+// Any failure panics.
+func main() {
+	// we have no parallelism, don't let syscalls fan-out weird on
+	// many-core systems
+	runtime.GOMAXPROCS(1)
+
+	var candidates []string
+	for _, group := range findLikelyDups(os.Args[1:]) {
+		candidates = append(candidates, group...)
+	}
+
+	fiemap2filenames := getFiemaps(candidates)
+
+	// Build the reverse index and, in the same pass, pick the first file of
+	// every fiemap as the one that gets checksummed.
+	filename2fiemap := map[string]string{}
+	representatives := make([]string, 0, len(fiemap2filenames))
+	for fiemap, filenames := range fiemap2filenames {
+		representatives = append(representatives, filenames[0])
+		for _, filename := range filenames {
+			if _, seen := filename2fiemap[filename]; seen {
+				// a file must not be reported under two fiemaps
+				panic("not ok")
+			}
+			filename2fiemap[filename] = fiemap
+		}
+	}
+
+	// Keep only checksums reached by two or more fiemaps; a checksum with a
+	// single fiemap has nothing left to deduplicate.
+	checksum2fiemaps := map[string][]string{}
+	for checksum, filenames := range getChecksums(representatives) {
+		if len(filenames) < 2 {
+			continue
+		}
+		fiemaps := make([]string, 0, len(filenames))
+		for _, filename := range filenames {
+			fiemaps = append(fiemaps, filename2fiemap[filename])
+		}
+		checksum2fiemaps[checksum] = fiemaps
+	}
+
+	fmt.Fprintf(os.Stderr, "Deduplicating %d sets of files...\n", len(checksum2fiemaps))
+	for checksum, fiemaps := range checksum2fiemaps {
+		// choose the fiemap with the fewest extents (counted as newlines);
+		// on a tie the earliest one wins
+		srcFiemap := fiemaps[0]
+		fewest := strings.Count(srcFiemap, "\n")
+		for _, candidate := range fiemaps[1:] {
+			if n := strings.Count(candidate, "\n"); n < fewest {
+				srcFiemap, fewest = candidate, n
+			}
+		}
+		srcFile := fiemap2filenames[srcFiemap][0]
+
+		info, err := os.Stat(srcFile)
+		errhandle(err)
+
+		// -r -- <length> <source> <offset> then <file> <offset> per target
+		args := []string{"-r", "--", strconv.FormatInt(info.Size(), 10), srcFile, "0"}
+		for _, fiemap := range fiemaps {
+			if fiemap == srcFiemap {
+				continue
+			}
+			for _, dupFile := range fiemap2filenames[fiemap] {
+				args = append(args, dupFile, "0")
+			}
+		}
+
+		cmd := exec.Command("./cow-dedupe-range", args...)
+		cmd.Stdout = os.Stdout
+		cmd.Stderr = os.Stderr
+		fmt.Println("#", checksum)
+		fmt.Println(cmd.Args)
+		errhandle(cmd.Run())
+	}
+}