summary | refs | log | tree | commit | diff
path: root/dedupe.go
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@beefcake.parabola.nu> 2018-05-18 23:06:08 -0400
committer Luke Shumaker <lukeshu@beefcake.parabola.nu> 2018-05-18 23:06:08 -0400
commit 1fd8e694c26a05069da7c660f1c4b95395bfea59 (patch)
tree d30d61b2899f75bda04daa06e6ed441fd217eaf1 /dedupe.go
parent 9018cb140b386582a1c0f19e7dbd8026960b3803 (diff)
use a go workspace
Diffstat (limited to 'dedupe.go')
-rw-r--r-- dedupe.go 245
1 files changed, 0 insertions, 245 deletions
diff --git a/dedupe.go b/dedupe.go
deleted file mode 100644
index f28d2f4..0000000
--- a/dedupe.go
+++ /dev/null
@@ -1,245 +0,0 @@
-package main
-
-import (
- "bufio"
- "fmt"
- "io"
- "os"
- "os/exec"
- "path/filepath"
- "runtime"
- "strconv"
- "strings"
-)
-
-//#include <unistd.h>
-import "C"
-
-// arg_max is the value sysconf(3) reports for _SC_ARG_MAX, queried once at
-// program start through cgo.  The batching loops in getFiemaps and
-// getChecksums keep the summed argument length of each command they spawn
-// under arg_max/2.
-var arg_max = int(C.sysconf(C._SC_ARG_MAX))
-
-// errhandle is the program's only error policy: a nil err is ignored, and
-// any other err is turned into a panic carrying that error value.
-func errhandle(err error) {
-	if err == nil {
-		return
-	}
-	panic(err)
-}
-
-// findLikelyDups runs find(1) over paths (each first made absolute) and
-// groups every regular file it reports by the pair (size, base name).
-//
-// The returned map is keyed by "<size> <basename>" and holds the file names
-// exactly as find printed them.  Keys that collected fewer than two files
-// are removed, so every remaining entry is a set of candidate duplicates.
-//
-// The caller's slice is left untouched.  Any failure (a path that cannot be
-// made absolute, find failing to start or exiting non-zero, a malformed
-// output record) panics.
-func findLikelyDups(paths []string) map[string][]string {
-	// Build find's argument list in a fresh slice: rewriting paths in place
-	// and appending to it would modify (and could append into the backing
-	// array of) the slice the caller handed us.
-	args := make([]string, 0, len(paths)+4)
-	for _, path := range paths {
-		abs, err := filepath.Abs(path)
-		errhandle(err)
-		args = append(args, abs)
-	}
-	// One record per file: size, a space, the path, then a NUL terminator
-	// (find itself expands the "\0" escape) so that names containing
-	// newlines stay unambiguous.
-	args = append(args, "-type", "f", "-printf", "%s %p\\0")
-
-	cmd := exec.Command("find", args...)
-	stdout, err := cmd.StdoutPipe()
-	errhandle(err)
-	cmd.Stderr = os.Stderr
-	errhandle(cmd.Start())
-
-	ret := map[string][]string{}
-	rd := bufio.NewReader(stdout)
-	for {
-		line, err := rd.ReadString('\x00')
-		if line == "" && err == io.EOF {
-			// clean end of output
-			break
-		}
-		errhandle(err)
-		// Split on the first space only; the path may contain spaces.
-		parts := strings.SplitN(strings.TrimSuffix(line, "\x00"), " ", 2)
-		if len(parts) != 2 {
-			panic("wut")
-		}
-		size := parts[0]
-		filename := parts[1]
-		basename := filepath.Base(filename)
-		key := size + " " + basename
-		ret[key] = append(ret[key], filename)
-	}
-	errhandle(cmd.Wait())
-
-	// A key with a single file has nothing to be a duplicate of.
-	for key := range ret {
-		if len(ret[key]) < 2 {
-			delete(ret, key)
-		}
-	}
-	return ret
-}
-
-// getFiemaps runs the ./cow-extent-map helper over paths and groups the
-// files by the extent-map string the helper prints for each of them.
-//
-// The helper's output is read as a sequence of NUL-terminated pairs: a file
-// name (which must start with "/") followed by its fiemap (which must be
-// empty or start with "logical=").  The returned map is keyed by that fiemap
-// string and holds every file name that produced it.
-//
-// paths is processed in batches whose summed argument length stays within
-// arg_max/2.  A running count is written to stderr as a progress indicator.
-// Any failure (helper failing to start or exiting non-zero, truncated or
-// malformed output) panics.
-func getFiemaps(paths []string) map[string][]string {
-	ret := map[string][]string{}
-	fmt.Fprintf(os.Stderr, "Getting fiemaps for %d files...\n", len(paths))
-
-	cnt := 0
-	for len(paths) > 0 {
-		// Take the longest prefix of paths that fits the budget, counting
-		// one extra byte per argument.  When path i is the one that
-		// overflows, the batch is paths[:i].  The i > 0 guard makes sure a
-		// batch always holds at least one path: an empty batch would never
-		// shrink paths (looping forever), and slicing at a negative index
-		// would panic.
-		batch := paths
-		argLen := 0
-		for i := range paths {
-			argLen += len(paths[i]) + 1
-			if i > 0 && argLen > arg_max/2 {
-				batch = paths[:i]
-				break
-			}
-		}
-		paths = paths[len(batch):]
-
-		cmd := exec.Command("./cow-extent-map", append([]string{"-m", "--"}, batch...)...)
-		stdout, err := cmd.StdoutPipe()
-		errhandle(err)
-		cmd.Stderr = os.Stderr
-		errhandle(cmd.Start())
-		rd := bufio.NewReader(stdout)
-		for {
-			filename, err := rd.ReadString('\x00')
-			if filename == "" && err == io.EOF {
-				// clean end of this batch's output
-				break
-			}
-			// Check the read error before judging the content, so that a
-			// truncated record is reported as the I/O error it is.
-			errhandle(err)
-			filename = strings.TrimSuffix(filename, "\x00")
-			if !strings.HasPrefix(filename, "/") {
-				panic("ugly filename")
-			}
-
-			fiemap, err := rd.ReadString('\x00')
-			errhandle(err)
-			fiemap = strings.TrimSuffix(fiemap, "\x00")
-			if !(strings.HasPrefix(fiemap, "logical=") || fiemap == "") {
-				panic("ugly fiemap")
-			}
-
-			ret[fiemap] = append(ret[fiemap], filename)
-			cnt++
-			fmt.Fprintf(os.Stderr, "\r%d ", cnt)
-		}
-		errhandle(cmd.Wait())
-	}
-
-	fmt.Fprintf(os.Stderr, "\r...done \n")
-	return ret
-}
-
-// getChecksums runs sha256sum(1) over paths and groups the files by digest.
-//
-// The returned map is keyed by the hex digest printed by sha256sum and holds
-// every file name that hashed to it.
-//
-// paths is processed in batches whose summed argument length stays within
-// arg_max/2.  A running count is written to stderr as a progress indicator.
-// Any failure (sha256sum failing to start or exiting non-zero, malformed
-// output) panics.
-func getChecksums(paths []string) map[string][]string {
-	ret := map[string][]string{}
-	fmt.Fprintf(os.Stderr, "Generating checksums for %d files...\n", len(paths))
-
-	// GNU sha256sum escapes backslash, newline and carriage return in file
-	// names (and flags such lines with a leading backslash); this undoes it.
-	unescape := strings.NewReplacer("\\\\", "\\", "\\n", "\n", "\\r", "\r")
-
-	cnt := 0
-	for len(paths) > 0 {
-		// Take the longest prefix of paths that fits the budget, counting
-		// one extra byte per argument.  When path i is the one that
-		// overflows, the batch is paths[:i].  The i > 0 guard makes sure a
-		// batch always holds at least one path: an empty batch would never
-		// shrink paths (looping forever, with sha256sum reading stdin), and
-		// slicing at a negative index would panic.
-		batch := paths
-		argLen := 0
-		for i := range paths {
-			argLen += len(paths[i]) + 1
-			if i > 0 && argLen > arg_max/2 {
-				batch = paths[:i]
-				break
-			}
-		}
-		paths = paths[len(batch):]
-
-		cmd := exec.Command("sha256sum", append([]string{"--"}, batch...)...)
-		stdout, err := cmd.StdoutPipe()
-		errhandle(err)
-		cmd.Stderr = os.Stderr
-		errhandle(cmd.Start())
-		rd := bufio.NewReader(stdout)
-		for {
-			line, err := rd.ReadString('\n')
-			if line == "" && err == io.EOF {
-				// clean end of this batch's output
-				break
-			}
-			errhandle(err)
-			line = strings.TrimSuffix(line, "\n")
-			// A leading backslash is not part of the digest; it marks a line
-			// whose file name has been escaped.
-			escaped := strings.HasPrefix(line, "\\")
-			line = strings.TrimPrefix(line, "\\")
-
-			// "<digest> <mode-char><name>": split on the first space, then
-			// drop the one-character text-mode marker (a space).
-			parts := strings.SplitN(line, " ", 2)
-			if len(parts) != 2 {
-				panic("wut")
-			}
-			checksum := parts[0]
-			filename := strings.TrimPrefix(parts[1], " ")
-			if escaped {
-				filename = unescape.Replace(filename)
-			}
-
-			ret[checksum] = append(ret[checksum], filename)
-			cnt++
-			fmt.Fprintf(os.Stderr, "\r%d ", cnt)
-		}
-		errhandle(cmd.Wait())
-	}
-
-	fmt.Fprintf(os.Stderr, "\r...done \n")
-	return ret
-}
-
-// main deduplicates files found under the paths given on the command line.
-//
-// Pipeline:
-//  1. findLikelyDups: keep only files that share size and base name with at
-//     least one other file.
-//  2. getFiemaps: group those files by the extent map ./cow-extent-map
-//     reports for them.
-//  3. getChecksums: hash one representative file per distinct extent map.
-//  4. For every checksum shared by two or more distinct extent maps, invoke
-//     ./cow-dedupe-range with the representative of the map that has the
-//     fewest lines as the source and all files of the other maps as targets.
-//
-// Each ./cow-dedupe-range command line is echoed to stdout, preceded by a
-// "# <checksum>" line.  Any failure panics.
-func main() {
-	// we have no parallelism, don't let syscalls fan-out weird on
-	// many-core systems
-	runtime.GOMAXPROCS(1)
-
-	likely := findLikelyDups(os.Args[1:])
-
-	var flatLikely []string
-	for _, filenames := range likely {
-		flatLikely = append(flatLikely, filenames...)
-	}
-
-	fiemap2filenames := getFiemaps(flatLikely)
-
-	// Invert the grouping; a file must appear under exactly one fiemap.
-	filename2fiemap := map[string]string{}
-	for fiemap, filenames := range fiemap2filenames {
-		for _, filename := range filenames {
-			if _, ok := filename2fiemap[filename]; ok {
-				panic("not ok")
-			}
-			filename2fiemap[filename] = fiemap
-		}
-	}
-
-	// Files with identical fiemap strings are grouped together, so hashing
-	// one of them stands in for the whole group.
-	spanningFiles := make([]string, 0, len(fiemap2filenames))
-	for _, filenames := range fiemap2filenames {
-		spanningFiles = append(spanningFiles, filenames[0])
-	}
-
-	checksum2filenames := getChecksums(spanningFiles)
-
-	checksum2fiemaps := map[string][]string{}
-	for checksum, filenames := range checksum2filenames {
-		for _, filename := range filenames {
-			fiemap, ok := filename2fiemap[filename]
-			if !ok {
-				// An unchecked lookup would silently yield the empty fiemap
-				// and drag unrelated files into this dedupe set (or index
-				// an empty slice further down).
-				panic(fmt.Sprintf("no fiemap recorded for checksummed file %q", filename))
-			}
-			checksum2fiemaps[checksum] = append(checksum2fiemaps[checksum], fiemap)
-		}
-	}
-	// A checksum seen under a single fiemap has nothing left to dedupe.
-	for checksum, fiemaps := range checksum2fiemaps {
-		if len(fiemaps) < 2 {
-			delete(checksum2fiemaps, checksum)
-		}
-	}
-
-	fmt.Fprintf(os.Stderr, "Deduplicating %d sets of files...\n", len(checksum2fiemaps))
-	for checksum, fiemaps := range checksum2fiemaps {
-		// choose the fiemap with the fewest extents
-		// (measured as the number of newlines in the fiemap string)
-		minFiemap := fiemaps[0]
-		minFiemapLen := strings.Count(minFiemap, "\n")
-		for _, fiemap := range fiemaps[1:] {
-			fiemapLen := strings.Count(fiemap, "\n")
-			if fiemapLen < minFiemapLen {
-				minFiemap = fiemap
-				minFiemapLen = fiemapLen
-			}
-		}
-		srcFile := fiemap2filenames[minFiemap][0]
-		var dupFiles []string
-		for _, fiemap := range fiemaps {
-			if fiemap == minFiemap {
-				continue
-			}
-			dupFiles = append(dupFiles, fiemap2filenames[fiemap]...)
-		}
-
-		// Arguments: flags, the source file's size, then (file, "0") pairs
-		// starting with the source.  NOTE(review): the "0" values look like
-		// byte offsets -- confirm against cow-dedupe-range.
-		stat, err := os.Stat(srcFile)
-		errhandle(err)
-		args := []string{
-			"-r", "--", strconv.FormatInt(stat.Size(), 10),
-			srcFile, "0",
-		}
-		for _, dupFile := range dupFiles {
-			args = append(args, dupFile, "0")
-		}
-
-		cmd := exec.Command("./cow-dedupe-range", args...)
-		cmd.Stdout = os.Stdout
-		cmd.Stderr = os.Stderr
-		fmt.Println("#", checksum)
-		fmt.Println(cmd.Args)
-		errhandle(cmd.Run())
-	}
-}