Diffstat (limited to 'lib/btrfsprogs/btrfsinspect/rebuildnodes/rebuild.go')
-rw-r--r-- | lib/btrfsprogs/btrfsinspect/rebuildnodes/rebuild.go | 619 |
1 file changed, 619 insertions, 0 deletions
diff --git a/lib/btrfsprogs/btrfsinspect/rebuildnodes/rebuild.go b/lib/btrfsprogs/btrfsinspect/rebuildnodes/rebuild.go
new file mode 100644
index 0000000..ef50653
--- /dev/null
+++ b/lib/btrfsprogs/btrfsinspect/rebuildnodes/rebuild.go
@@ -0,0 +1,619 @@
+// Copyright (C) 2022  Luke Shumaker <lukeshu@lukeshu.com>
+//
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+package rebuildnodes
+
+import (
+	"context"
+	"fmt"
+	"sort"
+
+	"github.com/datawire/dlib/dlog"
+
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfsitem"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfsprim"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfssum"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfstree"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfs/btrfsvol"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfsprogs/btrfsinspect"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfsprogs/btrfsinspect/rebuildmappings"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfsprogs/btrfsinspect/rebuildnodes/graph"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfsprogs/btrfsinspect/rebuildnodes/uuidmap"
+	"git.lukeshu.com/btrfs-progs-ng/lib/btrfsprogs/btrfsutil"
+	"git.lukeshu.com/btrfs-progs-ng/lib/containers"
+	"git.lukeshu.com/btrfs-progs-ng/lib/maps"
+)
+
+type Rebuilder struct {
+	raw   *btrfs.FS
+	inner interface {
+		btrfstree.TreeOperator
+		Augment(treeID btrfsprim.ObjID, nodeAddr btrfsvol.LogicalAddr) ([]btrfsprim.Key, error)
+	}
+	sb btrfstree.Superblock
+
+	graph        graph.Graph
+	uuidMap      uuidmap.UUIDMap
+	csums        containers.RBTree[containers.NativeOrdered[btrfsvol.LogicalAddr], btrfsinspect.SysExtentCSum]
+	orphans      containers.Set[btrfsvol.LogicalAddr]
+	leaf2orphans map[btrfsvol.LogicalAddr]containers.Set[btrfsvol.LogicalAddr]
+	key2leaf     containers.SortedMap[keyAndTree, btrfsvol.LogicalAddr]
+
+	augments map[btrfsprim.ObjID]containers.Set[btrfsvol.LogicalAddr]
+
+	pendingAugments map[btrfsprim.ObjID][]map[btrfsvol.LogicalAddr]int
+}
+
+func RebuildNodes(ctx context.Context, fs *btrfs.FS, nodeScanResults btrfsinspect.ScanDevicesResult) (map[btrfsprim.ObjID]containers.Set[btrfsvol.LogicalAddr], error) {
+	scanData, err := ScanDevices(ctx, fs, nodeScanResults) // ScanDevices does its own logging
+	if err != nil {
+		return nil, err
+	}
+
+	dlog.Info(ctx, "Reading superblock...")
+	sb, err := fs.Superblock()
+	if err != nil {
+		return nil, err
+	}
+
+	dlog.Info(ctx, "Indexing checksums...")
+	csums, _ := rebuildmappings.ExtractLogicalSums(ctx, nodeScanResults)
+	if csums == nil {
+		csums = new(containers.RBTree[containers.NativeOrdered[btrfsvol.LogicalAddr], btrfsinspect.SysExtentCSum])
+	}
+
+	dlog.Info(ctx, "Indexing orphans...")
+	orphans, leaf2orphans, key2leaf, err := indexOrphans(ctx, fs, *sb, *scanData.nodeGraph)
+	if err != nil {
+		return nil, err
+	}
+
+	dlog.Info(ctx, "Rebuilding node tree...")
+	o := &Rebuilder{
+		raw:   fs,
+		inner: btrfsutil.NewBrokenTrees(ctx, fs),
+		sb:    *sb,
+
+		graph:        *scanData.nodeGraph,
+		uuidMap:      *scanData.uuidMap,
+		csums:        *csums,
+		orphans:      orphans,
+		leaf2orphans: leaf2orphans,
+		key2leaf:     *key2leaf,
+
+		augments: make(map[btrfsprim.ObjID]containers.Set[btrfsvol.LogicalAddr]),
+	}
+	if err := o.rebuild(ctx); err != nil {
+		return nil, err
+	}
+
+	return o.augments, nil
+}
+
+func (o *Rebuilder) ioErr(ctx context.Context, err error) {
+	err = fmt.Errorf("should not happen: i/o error: %w", err)
+	dlog.Error(ctx, err)
+	panic(err)
+}
+
+func (o *Rebuilder) rebuild(ctx context.Context) error {
+	passNum := 0
+	dlog.Infof(ctx, "... pass %d: scanning for implied items", passNum)
+	o.pendingAugments = make(map[btrfsprim.ObjID][]map[btrfsvol.LogicalAddr]int)
+	btrfsutil.WalkAllTrees(ctx, o.inner, btrfsutil.WalkAllTreesHandler{
+		Err: func(*btrfsutil.WalkError) {},
+		TreeWalkHandler: btrfstree.TreeWalkHandler{
+			Item: func(path btrfstree.TreePath, item btrfstree.Item) error {
+				handleItem(o, ctx, path[0].FromTree, item)
+				return nil
+			},
+		},
+	})
+	for len(o.pendingAugments) > 0 {
+		// Apply the augments, keeping track of what keys are added to what tree.
+		dlog.Infof(ctx, "... pass %d: augmenting trees to add implied items", passNum)
+		newKeys := make(map[btrfsprim.ObjID][]btrfsprim.Key)
+		for _, treeID := range maps.SortedKeys(o.pendingAugments) {
+			dlog.Infof(ctx, "... ... augmenting tree %v:", treeID)
+			treeAugments := o.resolveTreeAugments(ctx, o.pendingAugments[treeID])
+			for _, nodeAddr := range maps.SortedKeys(treeAugments) {
+				added, err := o.inner.Augment(treeID, nodeAddr)
+				if err != nil {
+					dlog.Errorf(ctx, "error augmenting: %v", err)
+					continue
+				}
+				newKeys[treeID] = append(newKeys[treeID], added...)
+
+				set := o.augments[treeID]
+				if set == nil {
+					set = make(containers.Set[btrfsvol.LogicalAddr])
+					o.augments[treeID] = set
+				}
+				set.Insert(nodeAddr)
+			}
+		}
+		// Clear the list of pending augments.
+		o.pendingAugments = make(map[btrfsprim.ObjID][]map[btrfsvol.LogicalAddr]int)
+		passNum++
+		// Call handleItem() for each of the added keys.
+		dlog.Infof(ctx, "... pass %d: scanning for implied items", passNum)
+		for _, treeID := range maps.SortedKeys(newKeys) {
+			for _, key := range newKeys[treeID] {
+				item, err := o.inner.TreeLookup(treeID, key)
+				if err != nil {
+					o.ioErr(ctx, fmt.Errorf("error looking up already-inserted item: tree=%v key=%v: %w",
+						treeID, key, err))
+				}
+				handleItem(o, ctx, treeID, item)
+			}
+		}
+	}
+	return nil
+}
+
+func (o *Rebuilder) resolveTreeAugments(ctx context.Context, listsWithDistances []map[btrfsvol.LogicalAddr]int) containers.Set[btrfsvol.LogicalAddr] {
+	distances := make(map[btrfsvol.LogicalAddr]int)
+	generations := make(map[btrfsvol.LogicalAddr]btrfsprim.Generation)
+	lists := make([]containers.Set[btrfsvol.LogicalAddr], len(listsWithDistances))
+	for i, listWithDistances := range listsWithDistances {
+		lists[i] = make(containers.Set[btrfsvol.LogicalAddr], len(listWithDistances))
+		for item, dist := range listWithDistances {
+			lists[i].Insert(item)
+			distances[item] = dist
+			generations[item] = o.graph.Nodes[item].Generation
+		}
+	}
+
+	// Define an algorithm that takes several lists of items, and returns a
+	// set of those items such that each input list contains zero or one of
+	// the items from the return set.  The same item may appear in multiple
+	// of the input lists.
+	//
+	// > Example 1: Given the input lists
+	// >
+	// >     1: [A, B]
+	// >     2: [A, C]
+	// >
+	// > legal solutions would be `[]`, `[A]`, `[B]`, `[C]`, or `[B, C]`.  It
+	// > would not be legal to return `[A, B]` or `[A, C]`.
+	//
+	// > Example 2: Given the input lists
+	// >
+	// >     1: [A, B]
+	// >     2: [A]
+	// >     3: [B]
+	// >
+	// > legal solutions would be `[]`, `[A]`, or `[B]`.  It would not be
+	// > legal to return `[A, B]`.
+	//
+	// The algorithm should optimize for the following goals:
+	//
+	//  - We prefer that each input list have an item in the return set.
+	//
+	//    > In Example 1, while `[]`, `[B]`, and `[C]` are permissible
+	//    > solutions, they are not optimal, because one or both of the input
+	//    > lists are not represented.
+	//    >
+	//    > It may be the case that it is not possible to represent all lists
+	//    > in the result; in Example 2, either list 2 or list 3 must be
+	//    > unrepresented.
+	//
+	//  - Each item has a non-negative scalar "distance" score; we prefer
+	//    lower distances.  Distance scores are comparable; 0 is preferred,
+	//    and a distance of 4 is twice as bad as a distance of 2.
+	//
+	//  - Each item has a "generation" score; we prefer higher generations.
+	//    Generation scores should not be treated as a linear scale; the
+	//    magnitude of deltas is meaningless; only the sign of a delta is
+	//    meaningful.
+	//
+	//    > So it would be wrong to say something like
+	//    >
+	//    >     desirability = (-a*distance) + (b*generation)  // for some constants `a` and `b`
+	//    >
+	//    > because `generation` can't be used that way.
+	//
+	//  - We prefer items that appear in more lists over items that appear in
+	//    fewer lists.
+	//
+	// The relative priority of these 4 goals is undefined; preferably the
+	// algorithm should be defined in a way that makes it easy to adjust the
+	// relative priorities.
+
+	ret := make(containers.Set[btrfsvol.LogicalAddr])
+	illegal := make(containers.Set[btrfsvol.LogicalAddr]) // cannot-be-accepted and already-accepted
+	accept := func(item btrfsvol.LogicalAddr) {
+		ret.Insert(item)
+		for _, list := range lists {
+			if list.Has(item) {
+				illegal.InsertFrom(list)
+			}
+		}
+	}
+
+	counts := make(map[btrfsvol.LogicalAddr]int)
+	for _, list := range lists {
+		for item := range list {
+			counts[item] = counts[item] + 1
+		}
+	}
+
+	sortedItems := maps.Keys(distances)
+	sort.Slice(sortedItems, func(i, j int) bool {
+		iItem, jItem := sortedItems[i], sortedItems[j]
+		if counts[iItem] != counts[jItem] {
+			return counts[iItem] > counts[jItem] // reverse this check; higher counts should sort lower
+		}
+		if distances[iItem] != distances[jItem] {
+			return distances[iItem] < distances[jItem]
+		}
+		if generations[iItem] != generations[jItem] {
+			return generations[iItem] > generations[jItem] // reverse this check; higher generations should sort lower
+		}
+		return iItem < jItem // laddr is as good a tiebreaker as anything
+	})
+	for _, item := range sortedItems {
+		if !illegal.Has(item) {
+			accept(item)
+		}
+	}
+
+	for i, list := range lists {
+		dlog.Infof(ctx, "... ... ... %d: %v: %v", i, list.Intersection(ret).TakeOne(), maps.SortedKeys(list))
+	}
+
+	return ret
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// _NodeFile is a subset of btrfstree.NodeFile.
+type _NodeFile interface {
+	ParentTree(btrfsprim.ObjID) (btrfsprim.ObjID, bool)
+}
+
+func treeDistance(fs _NodeFile, tree, leaf btrfsprim.ObjID) (int, bool) {
+	dist := 0
+	for {
+		if tree == leaf {
+			return dist, true
+		}
+
+		parentTree, ok := fs.ParentTree(tree)
+		if !ok {
+			// Failed to look up parent info.
+			return 0, false
+		}
+		if parentTree == 0 {
+			// End of the line.
+			return 0, false
+		}
+
+		tree = parentTree
+		dist++
+	}
+}
+
+func (o *Rebuilder) wantAugment(ctx context.Context, treeID btrfsprim.ObjID, choices containers.Set[btrfsvol.LogicalAddr]) {
+	choicesWithDist := make(map[btrfsvol.LogicalAddr]int)
+	for choice := range choices {
+		if dist, ok := treeDistance(o.uuidMap, treeID, o.graph.Nodes[choice].Owner); ok {
+			choicesWithDist[choice] = dist
+		}
+	}
+	if len(choicesWithDist) == 0 {
+		dlog.Errorf(ctx, "augment(tree=%v): could not find wanted item", treeID)
+		return
+	}
+	dlog.Infof(ctx, "augment(tree=%v): %v", treeID, maps.SortedKeys(choicesWithDist))
+	o.pendingAugments[treeID] = append(o.pendingAugments[treeID], choicesWithDist)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// fsErr implements rebuildCallbacks.
+func (o *Rebuilder) fsErr(ctx context.Context, e error) {
+	dlog.Errorf(ctx, "filesystem error: %v", e)
+}
+
+// want implements rebuildCallbacks.
+func (o *Rebuilder) want(ctx context.Context, treeID btrfsprim.ObjID, objID btrfsprim.ObjID, typ btrfsprim.ItemType) {
+	// check if we already have it
+
+	tgt := btrfsprim.Key{
+		ObjectID: objID,
+		ItemType: typ,
+	}
+	if _, err := o.inner.TreeSearch(treeID, func(key btrfsprim.Key, _ uint32) int {
+		key.Offset = 0
+		return tgt.Cmp(key)
+	}); err == nil {
+		return
+	}
+
+	// OK, we need to insert it
+
+	ctx = dlog.WithField(ctx, "want_key", fmt.Sprintf("tree=%v key={%v %v ?}", treeID, objID, typ))
+	wants := make(containers.Set[btrfsvol.LogicalAddr])
+	o.key2leaf.Subrange(
+		func(k keyAndTree, _ btrfsvol.LogicalAddr) int { k.Key.Offset = 0; return tgt.Cmp(k.Key) },
+		func(_ keyAndTree, v btrfsvol.LogicalAddr) bool {
+			wants.InsertFrom(o.leaf2orphans[v])
+			return true
+		})
+	o.wantAugment(ctx, treeID, wants)
+}
+
+// wantOff implements rebuildCallbacks.
+func (o *Rebuilder) wantOff(ctx context.Context, treeID btrfsprim.ObjID, objID btrfsprim.ObjID, typ btrfsprim.ItemType, off uint64) {
+	// check if we already have it
+
+	tgt := btrfsprim.Key{
+		ObjectID: objID,
+		ItemType: typ,
+		Offset:   off,
+	}
+	if _, err := o.inner.TreeLookup(treeID, tgt); err == nil {
+		return
+	}
+
+	// OK, we need to insert it
+
+	ctx = dlog.WithField(ctx, "want_key", fmt.Sprintf("tree=%v key=%v", treeID, tgt))
+	wants := make(containers.Set[btrfsvol.LogicalAddr])
+	o.key2leaf.Subrange(
+		func(k keyAndTree, _ btrfsvol.LogicalAddr) int { return tgt.Cmp(k.Key) },
+		func(_ keyAndTree, v btrfsvol.LogicalAddr) bool {
+			wants.InsertFrom(o.leaf2orphans[v])
+			return true
+		})
+	o.wantAugment(ctx, treeID, wants)
+}
+
+// wantFunc implements rebuildCallbacks.
+func (o *Rebuilder) wantFunc(ctx context.Context, treeID btrfsprim.ObjID, objID btrfsprim.ObjID, typ btrfsprim.ItemType, fn func(btrfsitem.Item) bool) {
+	// check if we already have it
+
+	tgt := btrfsprim.Key{
+		ObjectID: objID,
+		ItemType: typ,
+	}
+	items, _ := o.inner.TreeSearchAll(treeID, func(key btrfsprim.Key, _ uint32) int {
+		key.Offset = 0
+		return tgt.Cmp(key)
+	})
+	for _, item := range items {
+		if fn(item.Body) {
+			return
+		}
+	}
+
+	// OK, we need to insert it
+
+	ctx = dlog.WithField(ctx, "want_key", fmt.Sprintf("tree=%v key=%v +func", treeID, tgt))
+	wants := make(containers.Set[btrfsvol.LogicalAddr])
+	o.key2leaf.Subrange(
+		func(k keyAndTree, _ btrfsvol.LogicalAddr) int { k.Key.Offset = 0; return tgt.Cmp(k.Key) },
+		func(k keyAndTree, v btrfsvol.LogicalAddr) bool {
+			nodeRef, err := btrfstree.ReadNode[btrfsvol.LogicalAddr](o.raw, o.sb, v, btrfstree.NodeExpectations{
+				LAddr:      containers.Optional[btrfsvol.LogicalAddr]{OK: true, Val: v},
+				Generation: containers.Optional[btrfsprim.Generation]{OK: true, Val: o.graph.Nodes[v].Generation},
+			})
+			if err != nil {
+				o.ioErr(ctx, err)
+			}
+			for _, item := range nodeRef.Data.BodyLeaf {
+				if k.Key == item.Key && fn(item.Body) {
+					wants.InsertFrom(o.leaf2orphans[v])
+				}
+			}
+			return true
+		})
+	o.wantAugment(ctx, treeID, wants)
+}
+
+// wantCSum implements rebuildCallbacks.
+//
+// interval is [beg, end)
+func (o *Rebuilder) wantCSum(ctx context.Context, beg, end btrfsvol.LogicalAddr) {
+	for beg < end {
+		// check if we already have it
+		if run, err := btrfs.LookupCSum(o.inner, o.sb.ChecksumType, beg); err == nil {
+			// we already have it
+			beg = run.Addr.Add(run.Size())
+		} else {
+			// we need to insert it
+			ctx := dlog.WithField(ctx, "want_key", fmt.Sprintf("csum for laddr=%v", beg))
+			rbNode := o.csums.Search(func(item btrfsinspect.SysExtentCSum) int {
+				switch {
+				case item.Sums.Addr > beg:
+					return -1
+				case item.Sums.Addr.Add(item.Sums.Size()) <= beg:
+					return 1
+				default:
+					return 0
+				}
+			})
+			if rbNode == nil {
+				o.wantAugment(ctx, btrfsprim.CSUM_TREE_OBJECTID, nil) // log an error
+				beg += btrfssum.BlockSize
+				continue
+			}
+			run := rbNode.Value.Sums
+			key := keyAndTree{
+				Key:    rbNode.Value.Key,
+				TreeID: btrfsprim.CSUM_TREE_OBJECTID,
+			}
+			leaf, ok := o.key2leaf.Load(key)
+			if !ok {
+				// This is a panic because if we found it in `o.csums` then it has
+				// to be in some Node, and if we didn't find it from
+				// btrfs.LookupCSum(), then that Node must be an orphan.
+				panic(fmt.Errorf("should not happen: no orphan contains %v", key.Key))
+			}
+			o.wantAugment(ctx, key.TreeID, o.leaf2orphans[leaf])
+
+			beg = run.Addr.Add(run.Size())
+		}
+	}
+}
+
+// wantFileExt implements rebuildCallbacks.
+func (o *Rebuilder) wantFileExt(ctx context.Context, treeID btrfsprim.ObjID, ino btrfsprim.ObjID, size int64) {
+	min := btrfsprim.Key{
+		ObjectID: ino,
+		ItemType: btrfsitem.EXTENT_DATA_KEY,
+		Offset:   0,
+	}
+	max := btrfsprim.Key{
+		ObjectID: ino,
+		ItemType: btrfsitem.EXTENT_DATA_KEY,
+		Offset:   uint64(size - 1),
+	}
+	exts, _ := o.inner.TreeSearchAll(treeID, func(key btrfsprim.Key, _ uint32) int {
+		switch {
+		case min.Cmp(key) < 0:
+			return 1
+		case max.Cmp(key) > 0:
+			return -1
+		default:
+			return 0
+		}
+	})
+
+	type gap struct {
+		// range is [Beg,End)
+		Beg, End int64
+	}
+	gaps := &containers.RBTree[containers.NativeOrdered[int64], gap]{
+		KeyFn: func(gap gap) containers.NativeOrdered[int64] {
+			return containers.NativeOrdered[int64]{Val: gap.Beg}
+		},
+	}
+	gaps.Insert(gap{
+		Beg: 0,
+		End: size,
+	})
+	for _, ext := range exts {
+		switch extBody := ext.Body.(type) {
+		case btrfsitem.FileExtent:
+			extBeg := int64(ext.Key.Offset)
+			extSize, err := extBody.Size()
+			if err != nil {
+				o.fsErr(ctx, fmt.Errorf("FileExtent: tree=%v key=%v: %w", treeID, ext.Key, err))
+				continue
+			}
+			extEnd := extBeg + extSize
+			overlappingGaps := gaps.SearchRange(func(gap gap) int {
+				switch {
+				case gap.End <= extBeg:
+					return 1
+				case extEnd <= gap.Beg:
+					return -1
+				default:
+					return 0
+				}
+			})
+			if len(overlappingGaps) == 0 {
+				continue
+			}
+			beg := overlappingGaps[0].Beg
+			end := overlappingGaps[len(overlappingGaps)-1].End
+			for _, gap := range overlappingGaps {
+				gaps.Delete(containers.NativeOrdered[int64]{Val: gap.Beg})
+			}
+			if beg < extBeg {
+				gaps.Insert(gap{
+					Beg: beg,
+					End: extBeg,
+				})
+			}
+			if end > extEnd {
+				gaps.Insert(gap{
+					Beg: extEnd,
+					End: end,
+				})
+			}
+		case btrfsitem.Error:
+			o.fsErr(ctx, fmt.Errorf("error decoding item: tree=%v key=%v: %w", treeID, ext.Key, extBody.Err))
+		default:
+			// This is a panic because the item decoder should not emit EXTENT_DATA
+			// items as anything but btrfsitem.FileExtent or btrfsitem.Error without
+			// this code also being updated.
+			panic(fmt.Errorf("should not happen: EXTENT_DATA item has unexpected type: %T", extBody))
+		}
+	}
+	_ = gaps.Walk(func(rbNode *containers.RBNode[gap]) error {
+		gap := rbNode.Value
+		min := btrfsprim.Key{
+			ObjectID: ino,
+			ItemType: btrfsitem.EXTENT_DATA_KEY,
+			Offset:   0,
+		}
+		max := btrfsprim.Key{
+			ObjectID: ino,
+			ItemType: btrfsitem.EXTENT_DATA_KEY,
+			Offset:   uint64(gap.End - 1),
+		}
+		ctx := dlog.WithField(ctx, "want_key", fmt.Sprintf("file extent for tree=%v inode=%v bytes [%v, %v)", treeID, ino, gap.Beg, gap.End))
+		wants := make(containers.Set[btrfsvol.LogicalAddr])
+		o.key2leaf.Subrange(
+			func(k keyAndTree, _ btrfsvol.LogicalAddr) int {
+				switch {
+				case min.Cmp(k.Key) < 0:
+					return 1
+				case max.Cmp(k.Key) > 0:
+					return -1
+				default:
+					return 0
+				}
+			},
+			func(k keyAndTree, v btrfsvol.LogicalAddr) bool {
+				nodeRef, err := btrfstree.ReadNode[btrfsvol.LogicalAddr](o.raw, o.sb, v, btrfstree.NodeExpectations{
+					LAddr:      containers.Optional[btrfsvol.LogicalAddr]{OK: true, Val: v},
+					Generation: containers.Optional[btrfsprim.Generation]{OK: true, Val: o.graph.Nodes[v].Generation},
+				})
+				if err != nil {
+					o.ioErr(ctx, fmt.Errorf("error reading previously read node@%v: %w", v, err))
+				}
+				for _, item := range nodeRef.Data.BodyLeaf {
+					if k.Key != item.Key {
+						continue
+					}
+					switch itemBody := item.Body.(type) {
+					case btrfsitem.FileExtent:
+						itemBeg := int64(item.Key.Offset)
+						itemSize, err := itemBody.Size()
+						if err != nil {
+							o.fsErr(ctx, fmt.Errorf("FileExtent: tree=%v key=%v: %w", treeID, item.Key, err))
+							continue
+						}
+						itemEnd := itemBeg + itemSize
+						// We're being greedy and "wanting" any extent that has any overlap
+						// with the gap.  But maybe instead we should only want extents that
+						// are *entirely* within the gap.  I'll have to run it on real
+						// filesystems to see what works better.
+						//
+						// TODO(lukeshu): Re-evaluate whether being greedy here is the right
+						// thing.
+						if itemEnd > gap.Beg && itemBeg < gap.End {
+							wants.InsertFrom(o.leaf2orphans[v])
+						}
+					case btrfsitem.Error:
+						o.fsErr(ctx, fmt.Errorf("error decoding item: tree=%v key=%v: %w", treeID, item.Key, itemBody.Err))
+					default:
+						// This is a panic because the item decoder should not emit EXTENT_DATA
+						// items as anything but btrfsitem.FileExtent or btrfsitem.Error without
+						// this code also being updated.
+						panic(fmt.Errorf("should not happen: EXTENT_DATA item has unexpected type: %T", itemBody))
+					}
+				}
+				return true
+			})
+		o.wantAugment(ctx, treeID, wants)
+		return nil
+	})
+}
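
The list-selection problem described in the resolveTreeAugments comment can be seen in miniature with a stdlib-only sketch. Everything below is invented for illustration (string items instead of node addresses, and only two of the four goals: "appears in more lists" first, then "lower distance"); it is not the commit's code, but it demonstrates the same greedy accept/illegal mechanism.

// greedy_pick_sketch.go — illustration only, simplified from resolveTreeAugments.
package main

import (
	"fmt"
	"sort"
)

// pick returns a set containing at most one item from each input list,
// preferring items that appear in more lists, then items with lower distance.
func pick(lists [][]string, distance map[string]int) map[string]bool {
	// Count how many lists each item appears in.
	counts := make(map[string]int)
	for _, list := range lists {
		for _, item := range list {
			counts[item]++
		}
	}

	// Sort candidates: higher count first, then lower distance, then name.
	items := make([]string, 0, len(counts))
	for item := range counts {
		items = append(items, item)
	}
	sort.Slice(items, func(i, j int) bool {
		if counts[items[i]] != counts[items[j]] {
			return counts[items[i]] > counts[items[j]]
		}
		if distance[items[i]] != distance[items[j]] {
			return distance[items[i]] < distance[items[j]]
		}
		return items[i] < items[j]
	})

	ret := make(map[string]bool)
	illegal := make(map[string]bool) // shares a list with an accepted item
	for _, item := range items {
		if illegal[item] {
			continue
		}
		ret[item] = true
		// Accepting item makes every co-listed item (and item itself) illegal,
		// so no list can contribute a second member to ret.
		for _, list := range lists {
			has := false
			for _, other := range list {
				if other == item {
					has = true
					break
				}
			}
			if has {
				for _, other := range list {
					illegal[other] = true
				}
			}
		}
	}
	return ret
}

func main() {
	// Example 1 from the comment: lists [A, B] and [A, C].
	lists := [][]string{{"A", "B"}, {"A", "C"}}
	dist := map[string]int{"A": 1, "B": 0, "C": 0}
	// A appears in both lists, so it wins despite its higher distance:
	fmt.Println(pick(lists, dist)) // map[A:true] — one item covering both lists
}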
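
Similarly, the gap bookkeeping in wantFileExt — start with one [0,size) gap, then carve out each FileExtent, keeping any uncovered remainder on either side — can be sketched without containers.RBTree. The sorted-slice representation below is a simplification for illustration, not the commit's implementation; the RBTree version does the same splitting with Delete plus up to two Inserts per overlapped gap.

// gap_sketch.go — illustration only, simplified from wantFileExt.
package main

import "fmt"

type gap struct{ Beg, End int64 } // range is [Beg,End)

// subtract removes [extBeg,extEnd) from every gap it overlaps, keeping the
// uncovered prefix and/or suffix of each overlapped gap.
func subtract(gaps []gap, extBeg, extEnd int64) []gap {
	var out []gap
	for _, g := range gaps {
		if g.End <= extBeg || extEnd <= g.Beg {
			out = append(out, g) // no overlap; keep unchanged
			continue
		}
		if g.Beg < extBeg {
			out = append(out, gap{g.Beg, extBeg}) // uncovered prefix
		}
		if extEnd < g.End {
			out = append(out, gap{extEnd, g.End}) // uncovered suffix
		}
	}
	return out
}

func main() {
	// One gap covering a whole (hypothetical) 16KiB file, then two extents.
	gaps := []gap{{0, 16384}}
	gaps = subtract(gaps, 0, 4096)     // extent at the front
	gaps = subtract(gaps, 8192, 12288) // extent in the middle
	// What remains are the byte ranges still lacking extents — the ranges
	// wantFileExt goes looking for in orphaned leaves:
	fmt.Println(gaps) // [{4096 8192} {12288 16384}]
}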