BoltDB 数据结构

BoltDB

Posted by WR on December 25, 2022
Page

​ 使用 BoltDB 的时候我们会初始化一个 DB 对象,一个 DB 对应的就是一个磁盘文件。而对应的磁盘文件,BoltDB 是以 Page 为单位进行数据读写的,换句话说数据在磁盘上是以 Page 为单位存储的。这里的 Page 和操作系统的 Page cache 大小一致,即 4K。

Page 结构
1
2
3
4
5
6
7
8
9
10
11
12
type page struct {
  // id 页id
	id       pgid
  // flags 页类型,分为:分支,叶子节点,元信息,空闲列表
	flags    uint16
  // count 统计叶子节点、非叶子节点、空闲列表页的个数
	count    uint16
  // overflow 数据是否有溢出,主要在空闲列表上有用
	overflow uint32
  // ptr 无类线指针,指向真实数据
	ptr uintptr
}

由以上定义可知,页由页头和页数据两部分组成。

在 boltDB 中页分为四种类型,如以下定义:

1
2
3
4
5
6
const (
	branchPageFlag   = 0x01 // 存储索引信息(页号、元素key值)
	leafPageFlag     = 0x02 // 存储数据信息(页号、插入的key值、插入的value值)
	metaPageFlag     = 0x04 // 存储数据库的元信息,例如空闲列表页id、放置桶的根页等
	freelistPageFlag = 0x10 // 存储哪些页是空闲页,可以用来后续分配空间时,优先考虑分配
)

每个数据页有一个 meta 方法,这个方法主要用于获取元数据信息,前提是改页类型是元数据页类型

1
2
3
4
// meta returns a pointer to the metadata section of the page.
func (p *page) meta() *meta {
	return (*meta)(unsafe.Pointer(&p.ptr))
}
Meta 结构

meta 结构如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
type meta struct {
  // magic 魔数
	magic    uint32
  // version 版本
	version  uint32
  // pageSize page页的大小,该值和操作系统默认的页大小保持一致
	pageSize uint32
  // flags 保留值
	flags    uint32
  // root 所有 bucket 的根
	root     bucket
  // freelist 空闲列表页的id
	freelist pgid
  // pgid 元数据页的id
	pgid     pgid
  // txid 最大的事务id
	txid     txid
  // checksum 用作校验的校验和
	checksum uint64
}

meta 写入到 page 中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
// write writes the meta onto a page.
func (m *meta) write(p *page) {
	if m.root.root >= m.pgid {
		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
	} else if m.freelist >= m.pgid {
		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
	}

	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
	p.id = pgid(m.txid % 2)
	p.flags |= metaPageFlag

	// Calculate the checksum.
	m.checksum = m.sum64()

	m.copy(p.meta())
}

// meta returns a pointer to the metadata section of the page.
func (p *page) meta() *meta {
	return (*meta)(unsafe.Pointer(&p.ptr))
}

// copy copies one meta object to another.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
Freelist 结构
1
2
3
4
5
6
7
8
9
10
11
// freelist represents a list of all pages that are available for allocation.
// It also tracks pages that have been freed but are still in use by open transactions.
type freelist struct {
  // ids 可用的空闲页 id
	ids     []pgid          // all free and available free page ids.
	// pending 不久将会释放的空闲页
  pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
  // cache 所有可用的空闲页及不久将释放的页 ID,便于快速查找
	cache   map[pgid]bool   // fast lookup of all free and pending page ids.
}

​ freelist 写入到 page 中:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
// write writes the page ids onto a freelist page. All free and pending ids are
// saved to disk since in the event of a program crash, all pending ids will
// become free.
func (f *freelist) write(p *page) error {
	// Combine the old free pgids and pgids waiting on an open transaction.

	// Update the header flag.
  // 设置页标识
	p.flags |= freelistPageFlag

	// The page.count can only hold up to 64k elements so if we overflow that
	// number then we handle it by putting the size in the first element.
  // 获取空闲页 ID 数,总 ID 数 = 空闲页ID + pending ID
	lenids := f.count()
  // 如果空闲页为0,记录空闲页数目到 p.count
	if lenids == 0 {
		p.count = uint16(lenids)
	} else if lenids < 0xFFFF {
    // 如果空闲页数目小于 0xFFFF,记录空闲页数目到 p.count
		p.count = uint16(lenids)
		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:])
	} else {
    // 如果空闲页数目大于 0xFFFF,p.count = 0xFFFF 因为p.count是uint64,最多只能记录 65535 大小
    // 然后使用 p.ptr 的第一个字节记录空闲页大小
		p.count = 0xFFFF
    // 使用 p.ptr 的第一个字节记录空闲页大小
		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids)
    // 从第一个元素位置拷贝,把空闲页 id 拷贝到 p.ptr
		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:])
	}

	return nil
}


// copyall copies into dst a list of all free ids and all pending ids in one sorted list.
// f.count returns the minimum length required for dst.
func (f *freelist) copyall(dst []pgid) {
	m := make(pgids, 0, f.pending_count())
	for _, list := range f.pending {
		m = append(m, list...)
	}
	sort.Sort(m)
  // 对 f.ids 和 f.pending 进行合并,并且拷贝到 dst
	mergepgids(dst, f.ids, m)
}

// mergepgids copies the sorted union of a and b into dst.
// If dst is too small, it panics.
func mergepgids(dst, a, b pgids) {
	if len(dst) < len(a)+len(b) {
		panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
	}
	// Copy in the opposite slice if one is nil.
	if len(a) == 0 {
		copy(dst, b)
		return
	}
	if len(b) == 0 {
		copy(dst, a)
		return
	}

	// Merged will hold all elements from both lists.
	merged := dst[:0]

	// Assign lead to the slice with a lower starting value, follow to the higher value.
	lead, follow := a, b
	if b[0] < a[0] {
		lead, follow = b, a
	}

	// Continue while there are elements in the lead.
	for len(lead) > 0 {
		// Merge largest prefix of lead that is ahead of follow[0].
		n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
		merged = append(merged, lead[:n]...)
		if n >= len(lead) {
			break
		}

		// Swap lead and follow.
		lead, follow = follow, lead[n:]
	}

	// Append what's left in follow.
	_ = append(merged, follow...)
}


从磁盘中加载 freelist 页,并转为 freelist 结构:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// read initializes the freelist from a freelist page.
func (f *freelist) read(p *page) {
	// If the page.count is at the max uint16 value (64k) then it's considered
	// an overflow and the size of the freelist is stored as the first element.
	idx, count := 0, int(p.count)
  // 如果 count == 0xFFFF,从 ptr 第一个字节读取实际空闲页个数
	if count == 0xFFFF {
		idx = 1
		count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
	}

	// Copy the list of page ids from the freelist.
	if count == 0 {
		f.ids = nil
	} else {
		ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count]
		f.ids = make([]pgid, len(ids))
		copy(f.ids, ids)

		// Make sure they're sorted.
		sort.Sort(pgids(f.ids))
	}

	// Rebuild the page cache.
	f.reindex()
}

// reindex rebuilds the free cache based on available and pending free lists.
func (f *freelist) reindex() {
	f.cache = make(map[pgid]bool, len(f.ids))
	for _, id := range f.ids {
		f.cache[id] = true
	}
	for _, pendingIDs := range f.pending {
		for _, pendingID := range pendingIDs {
			f.cache[pendingID] = true
		}
	}
}

allocate 分配指定大小空闲页

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// allocate returns the starting page id of a contiguous list of pages of a given size.
// If a contiguous block cannot be found then 0 is returned.
// 分配n个连续的页,如果没有连续的页则返回0
func (f *freelist) allocate(n int) pgid {
	if len(f.ids) == 0 {
		return 0
	}

	var initial, previd pgid
	for i, id := range f.ids {
		if id <= 1 {
			panic(fmt.Sprintf("invalid page allocation: %d", id))
		}

		// Reset initial page if this is not contiguous.
		if previd == 0 || id-previd != 1 {
			initial = id
		}

		// If we found a contiguous block then remove it and return it.
    // 如果找到连续的块,则返回并且从f.ids,f.cache 中移除空闲页
		if (id-initial)+1 == pgid(n) {
			// If we're allocating off the beginning then take the fast path
			// and just adjust the existing slice. This will use extra memory
			// temporarily but the append() in free() will realloc the slice
			// as is necessary.
			if (i + 1) == n {
				f.ids = f.ids[i+1:]
			} else {
				copy(f.ids[i-n+1:], f.ids[i+1:])
				f.ids = f.ids[:len(f.ids)-n]
			}

			// Remove from the free cache.
			for i := pgid(0); i < pgid(n); i++ {
				delete(f.cache, initial+i)
			}

			return initial
		}

		previd = id
	}
	return 0
}
branchPage 结构

​ 分支节点主要用来构建索引,提升查询效率。

​ 分支节点在存储时,一个分支节点页上会存储多个分支页元素即branchPageElement。这个信息可以记做为分支页元素元信息。

1
2
3
4
5
6
7
8
9
// branchPageElement represents a node on a branch page.
type branchPageElement struct {
  // pos 最小key的值存储的位置距离当前的元信息的偏移量pos
	pos   uint32
  // ksize 该元素所指向的页中存储的最小key的值大小
	ksize uint32
  // pgid 元素页 id
	pgid  pgid
}

​ 分支节点页 page 获取下标为 index 的某一个element 的信息

1
2
3
4
5
// leafPageElement retrieves the leaf node by index
func (p *page) leafPageElement(index uint16) *leafPageElement {
	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
	return n
}

非叶子结点图示:

image

在内存中,分支节点和叶子节点都是通过 node 来表示,它们是通过 node 里面的 isleaf 字段进行类型区分

node 结构
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
// node represents an in-memory, deserialized page.
type node struct {
	bucket     *Bucket
	isLeaf     bool
	unbalanced bool
	spilled    bool
	key        []byte
	pgid       pgid
	parent     *node
	children   nodes
	inodes     inodes
}

type inodes []inode

// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
  // flags 表示是否是子桶叶子节点还是普通叶子节点。如果flags值为1表示子桶叶子节点,否则为普通叶子节点
	flags uint32
  // pgid 当inode为分支元素时,pgid才有值,为叶子元素时,则没值
	pgid  pgid
	key   []byte
  // 当inode为分支元素时,value为空,为叶子元素时,才有值
	value []byte
}


​ page 转化为 node 结构:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// read initializes the node from a page.
func (n *node) read(p *page) {
	n.pgid = p.id
	n.isLeaf = ((p.flags & leafPageFlag) != 0)
	n.inodes = make(inodes, int(p.count))

	for i := 0; i < int(p.count); i++ {
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
		_assert(len(inode.key) > 0, "read: zero-length inode key")
	}

	// Save first key so we can find the node in the parent when we spill.
  // 存储第一个 key
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
		_assert(len(n.key) > 0, "read: zero-length node key")
	} else {
		n.key = nil
	}
}

// key returns a byte slice of the node key.
func (n *branchPageElement) key() []byte {
	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
}

node 转化为 page:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
// write writes the items onto one or more pages.
func (n *node) write(p *page) {
	// Initialize page.
	if n.isLeaf {
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}

	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))

	// Stop here if there are no items to write.
  // 没有节点可存储,直接返回
	if p.count == 0 {
		return
	}

	// Loop over each item and write it to the page.
	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
	for i, item := range n.inodes {
		_assert(len(item.key) > 0, "write: zero-length inode key")

		// Write the page element.
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
      // 计算第一个 key 与当前元素的地址偏移量
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
			_assert(elem.pgid != p.id, "write: circular dependency occurred")
		}

		// If the length of key+value is larger than the max allocation size
		// then we need to reallocate the byte array pointer.
		//
		// See: https://github.com/boltdb/bolt/pull/335
		klen, vlen := len(item.key), len(item.value)
		if len(b) < klen+vlen {
			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
		}

		// Write data for the element to the end of the page.
		copy(b[0:], item.key)
		b = b[klen:]
		copy(b[0:], item.value)
		b = b[vlen:]
	}

	// DEBUG ONLY: n.dump()
}
leafPage 结构
1
2
3
4
5
6
7
8
9
10
11
12
// leafPageElement represents a node on a leaf page.
type leafPageElement struct {
  // flags 该值主要用来区分,是子桶叶子节点元素还是普通的key/value叶子节点元素。flags值为1时表示子桶。否则为key/value
	flags uint32
  // pos 具体存储的值距离元信息的偏移位置pos
	pos   uint32
  // ksize key的长度
	ksize uint32
  // vsize value的长度
	vsize uint32
}

LeafPage 结构图示:

image