0%

cgroup_manager

runc的cgroup管理模块

cgroup是容器runtime最重要的基础设施之一,所以runc支持cgroup的多种管理方式:

  • cgroup v1直接管理:libcontainer/cgroups/fs/fs.go
  • cgroup v2直接管理:libcontainer/cgroups/fs2/fs2.go
  • cgroup v1通过systemd管理:libcontainer/cgroups/systemd/v1.go
  • cgroup v2通过systemd管理:libcontainer/cgroups/systemd/v2.go

统一的管理接口

runc为cgroup的管理定义了统一的对外接口Manager,定义在libcontainer/cgroups/cgroups.go,详细接口如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// Manager is the unified interface through which a container's cgroups are
// managed, independent of the backend that implements it.
type Manager interface {
// Apply applies cgroup configuration to the process with the specified pid.
Apply(pid int) error

// GetPids returns the PIDs inside the cgroup set.
GetPids() ([]int, error)

// GetAllPids returns the PIDs inside the cgroup set and all sub-cgroups.
GetAllPids() ([]int, error)

// GetStats returns statistics for the cgroup set.
GetStats() (*Stats, error)

// Freeze toggles the freezer cgroup according to the specified state.
Freeze(state configs.FreezerState) error

// Destroy destroys the cgroup set.
Destroy() error

// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string

// Set sets the cgroup as configured.
Set(container *configs.Config) error

// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
// to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
GetPaths() map[string]string

// GetCgroups returns the cgroup data as configured.
GetCgroups() (*configs.Cgroup, error)

// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)

// Exists reports whether the cgroup path exists.
Exists() bool
}

cgroup v1

v1支持直接管理和通过systemd管理两种方式,两种方式的大体结构是一致的;

graph TB
    O(manager) --> A[subsystemSet]
    A --> CpusetGroup -.-> B(subsystem)
    A --> DevicesGroup -.-> B
    A --> MemoryGroup -.-> B
    A --> CpuGroup -.-> B
    A --> CpuacctGroup -.-> B
    A --> PidsGroup -.-> B
    A --> BlkioGroup -.-> B
    A --> HugetlbGroup -.-> B
    A --> NetClsGroup -.-> B
    A --> NetPrioGroup -.-> B
    A --> PerfEventGroup -.-> B
    A --> FreezerGroup -.-> B
    A --> NameGroup -.-> B

直接管理

manager实现统一Manager的接口,具体定义如下:

1
2
3
4
5
6
// manager is the cgroup v1 implementation of Manager that manages the
// cgroup filesystem directly.
type manager struct {    
mu sync.Mutex // held by Apply while it rebuilds paths
cgroups *configs.Cgroup // cgroup configuration
rootless bool // ignore permission-related errors
paths map[string]string // path of each cgroup subsystem, keyed by subsystem name (initialized in Apply)
}

subsystem接口在两种方式下是不一样的,systemd方式的接口是直接管理的子集。直接管理方式的subsystem接口定义如下:

1
2
3
4
5
6
7
8
9
10
11
// subsystem is the per-controller interface used by the direct (fs) cgroup
// v1 manager; each cgroup subsystem group type implements it.
type subsystem interface {    
// Name returns the name of the subsystem.
Name() string
// GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Remove removes the cgroup represented by 'cgroupData'.
Remove(*cgroupData) error
// Apply creates and joins the cgroup represented by 'cgroupData'.
Apply(*cgroupData) error
// Set sets the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error }

Apply接口实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// Apply records the cgroup path of every usable subsystem in m.paths and
// places pid into those cgroups.
//
// It is a no-op when m.cgroups is nil. When the configuration carries
// explicit per-subsystem paths (c.Paths), those paths are used as-is and pid
// is entered into them; otherwise each subsystem returned by
// m.getSubsystems() is asked to Apply itself.
func (m *manager) Apply(pid int) (err error) {
if m.cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()

var c = m.cgroups

d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
}

// Start from an empty map so stale entries from an earlier Apply are dropped.
m.paths = make(map[string]string)
if c.Paths != nil {
// The container configuration already specifies the path of each subsystem.
for name, path := range c.Paths {
// d.path is called only for its error: a subsystem that cannot be
// found is skipped, and the configured path is what gets recorded.
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
m.paths[name] = path
}
// Add pid to the configured cgroup subsystems.
return cgroups.EnterPid(m.paths, pid)
}

// Add pid, one subsystem at a time, to the cgroup subsystems supported by the system.
for _, sys := range m.getSubsystems() {
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
}
m.paths[sys.Name()] = p

// Call the subsystem's Apply; what happens depends on each subsystem's
// implementation (cpuset is analyzed as the example).
if err := sys.Apply(d); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems. Cases where limits have been set
// (and we couldn't create our own cgroup) are handled by Set.
if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
// Forget the path recorded above, since this subsystem was not applied.
delete(m.paths, sys.Name())
continue
}
return err
}

}
return nil
}

cpuset子系统的Apply实现为例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Apply looks up the full path of the cpuset subsystem for d and hands it to
// ApplyDir together with d's config and pid. A "not found" lookup error is
// tolerated (the returned directory is still forwarded); any other lookup
// error is returned unchanged.
func (s *CpusetGroup) Apply(d *cgroupData) error {
	cpusetDir, lookupErr := d.path("cpuset")
	if lookupErr == nil || cgroups.IsNotFound(lookupErr) {
		return s.ApplyDir(cpusetDir, d.config, d.pid)
	}
	return lookupErr
}

// ApplyDir prepares the cpuset cgroup directory dir and moves pid into it.
//
// An empty dir means there is nothing to do and nil is returned. Otherwise
// the parent chain is ensured first, dir itself is created, the cpus/mems
// settings are ensured, and finally pid is written to the cgroup's procs file.
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
	if dir == "" {
		// This might happen if we have no cpuset cgroup mounted.
		// Just do nothing and don't fail.
		return nil
	}

	// Look up the mount point path.
	mountPoint, mountErr := getMount(dir)
	if mountErr != nil {
		return mountErr
	}
	hierarchyRoot := filepath.Dir(mountPoint)
	parentDir := filepath.Dir(dir)

	// 'ensureParent' starts with the parent because we don't want to
	// explicitly inherit from parent; it could conflict with
	// 'cpuset.cpu_exclusive'.
	if parentErr := s.ensureParent(parentDir, hierarchyRoot); parentErr != nil {
		return parentErr
	}
	if mkdirErr := os.MkdirAll(dir, 0755); mkdirErr != nil {
		return mkdirErr
	}

	// We didn't inherit cpuset configs from the parent, but the cpuset
	// configs must be set before a task is moved into the cgroup.
	// If the user specified cpuset configs, use those; otherwise inherit
	// from the parent. This makes cpuset configs work correctly with
	// 'cpuset.cpu_exclusive' and keeps backward compatibility.
	if cfgErr := s.ensureCpusAndMems(dir, cgroup); cfgErr != nil {
		return cfgErr
	}

	// Unlike the other subsystems we are not using d.join, so the pid has
	// to be placed into the procs file here, adding it to the subsystem.
	return cgroups.WriteCgroupProc(dir, pid)
}

其他子系统实现类似,都是把pid加入到对应的子系统中。

其他接口

实现逻辑基本一致,manager通过调用subsystem的接口完成对各子系统的操作。

  • Set接口:负责cgroup配置的写入;
  • GetStats接口:获取部分子系统(MemoryGroup,CpuGroup,PidsGroup,BlkioGroup,HugetlbGroup等等)的状态信息
  • Freeze接口:设置freezer子系统的状态;

还有一些接口不需要通过subsystem的实现,就能完成具体的操作。

  • Destroy接口:删除manager管理cgroup的所有子系统;
  • GetPids接口:获取属于当前cgroup的所有pid(通过读取devices子系统的cgroup.procs文件实现)
  • GetAllPids接口:获取属于当前cgroup以及子cgroup的所有pid

systemd管理

整体和直接管理差不多,区别在于通过systemd的unit任务管理容器的cgroup,它的回收由systemd负责。因此,可以看到subsystem的接口也简单一些。

manager的结构

1
2
3
4
5
// legacyManager is the cgroup v1 manager that manages the container's cgroup
// through a systemd unit.
type legacyManager struct {    
mu sync.Mutex // held for the duration of Apply
cgroups *configs.Cgroup // cgroup configuration
paths map[string]string // subsystem name -> cgroup path, set by Apply
}

subsystem接口

1
2
3
4
5
6
7
8
// subsystem is the per-controller interface used by the systemd cgroup v1
// manager. It has no Remove or Apply methods; only naming, stats and Set.
type subsystem interface {    
// Name returns the name of the subsystem.
Name() string
// GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set sets the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}

管理的子系统集合,依然使用直接管理定义的结构集合。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// legacySubsystems is the set of cgroup v1 subsystems handled by the systemd
// manager. The entries reuse the group types defined in the fs package.
var legacySubsystems = subsystemSet{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
// Named hierarchy rather than a resource controller.
&fs.NameGroup{GroupName: "name=systemd"},
}

Apply接口实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// Apply sets up the container's cgroup through systemd and places pid in it.
//
// When the configuration carries explicit per-subsystem paths (c.Paths),
// those are recorded in m.paths and pid is entered into them directly, without
// starting a unit. Otherwise a systemd unit is started with the assembled
// properties, pid is joined to the cgroups, and the path of every subsystem
// in legacySubsystems that can be resolved is recorded in m.paths.
func (m *legacyManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
slice = "system.slice" // default parent slice, overridden by c.Parent below
properties []systemdDbus.Property
)

m.mu.Lock()
defer m.mu.Unlock()
if c.Paths != nil {
// Same as the direct manager: if paths are configured, use the
// user-defined subsystem paths as they are.
paths := make(map[string]string)
for name, path := range c.Paths {
// getSubsystemPath is called only for its error; the configured path is recorded.
_, err := getSubsystemPath(m.cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.paths = paths
// Add pid to each cgroup subsystem stored in paths.
return cgroups.EnterPid(m.paths, pid)
}

if c.Parent != "" {
slice = c.Parent
}

// Assemble the properties of the systemd unit.
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))

// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}

// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}

// Check if we can delegate. This is only supported on systemd versions 218 and above.
if !strings.HasSuffix(unitName, ".slice") {
// Assume scopes always support delegation.
properties = append(properties, newProp("Delegate", true))
}

// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true))

// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))

dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
resourcesProperties, err := genV1ResourcesProperties(c, dbusConnection)
if err != nil {
return err
}
properties = append(properties, resourcesProperties...)
// Properties from the configuration are appended last.
properties = append(properties, c.SystemdProps...)

// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := enableKmem(c); err != nil {
return err
}
}

// Start the systemd unit that manages the container's cgroup.
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return err
}

// Add pid to the cgroup subsystems for c.
if err := joinCgroups(c, pid); err != nil {
return err
}

// Record the path of each cgroup subsystem.
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(m.cgroups, s.Name())
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[s.Name()] = subsystemPath
}
m.paths = paths
return nil
}

cgroup v2

基本用法

挂载

1
$ mount -t cgroup2 nodev /mnt/cgroup2/

创建子cgroup

1
2
3
4
5
$ cd /mnt/cgroup2/
$ mkdir xxx
$ ls xxx/
cgroup.controllers cgroup.freeze cgroup.max.descendants cgroup.stat cgroup.threads cpu.pressure io.pressure
cgroup.events cgroup.max.depth cgroup.procs cgroup.subtree_control cgroup.type cpu.stat memory.pressure

删除无用的子cgroup

1
$ rmdir xxx/

查看当前cgroup支持的controller,默认不支持任何controller

1
2
$ cat cgroup.controllers
cpu io memory

使能和禁用controller,通过写文件cgroup.subtree_control实现:

1
$ echo "+cpu +memory -io" > cgroup.subtree_control

参考文档https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html

直接管理

由于v2简化了cgroup的机制,因此,管理架构变得很简单,直接通过manager即可完成。省去了subsystem的管理模块。

1
2
3
4
5
6
7
8
9
// manager is the cgroup v2 manager that operates on the cgroup directly;
// unlike v1 there is no per-subsystem layer underneath it.
type manager struct {
// config is the cgroup configuration; Set replaces it on success.
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
// controllers is content of "cgroup.controllers" file.
// excludes pseudo-controllers ("devices" and "freezer").
controllers map[string]struct{}
// rootless relaxes the handling of certain errors in Apply and Set.
rootless bool
}

Apply接口实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// Apply creates the cgroup directory structure for m.dirPath (which also
// writes the supported controllers to cgroup.subtree_control) and then adds
// pid to the cgroup.
//
// If creating the path fails in rootless mode with no explicit cgroup path
// configured, the failure is ignored when no controllers are needed, and is
// otherwise wrapped with an explanatory message. In every other case the
// creation error is returned unchanged.
//
// Related tests:
//   - "runc create (no limits + no cgrouppath + no permission) succeeds"
//   - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
//   - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
func (m *manager) Apply(pid int) error {
	createErr := CreateCgroupPath(m.dirPath, m.config)
	if createErr == nil {
		// Path is ready: add pid to the cgroup.
		return cgroups.WriteCgroupProc(m.dirPath, pid)
	}

	// Only rootless mode without an explicit cgroup path gets special treatment.
	if !m.rootless || m.config.Path != "" {
		return createErr
	}

	needed, needErr := needAnyControllers(m.config)
	if needErr == nil && !needed {
		return nil
	}
	return errors.Wrap(createErr, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups")
}

Set接口实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// Set writes the cgroup configuration from container to the cgroup at
// m.dirPath, one controller at a time, and stores the configuration in
// m.config once every step has succeeded.
//
// A nil container or nil container.Cgroups is a no-op. The first failing
// step aborts the sequence and its error is returned, except that errors
// from the devices step are ignored when m.rootless is true.
func (m *manager) Set(container *configs.Config) error {
	if container == nil || container.Cgroups == nil {
		return nil
	}
	if err := m.getControllers(); err != nil {
		return err
	}

	cg := container.Cgroups

	// Steps run strictly in the listed order.
	steps := []struct {
		// rootlessTolerant marks a step whose error is dropped when m.rootless is set.
		rootlessTolerant bool
		run              func() error
	}{
		// pids (since kernel 4.5)
		{run: func() error { return setPids(m.dirPath, cg) }},
		// memory (since kernel 4.5)
		{run: func() error { return setMemory(m.dirPath, cg) }},
		// io (since kernel 4.5)
		{run: func() error { return setIo(m.dirPath, cg) }},
		// cpu (since kernel 4.15)
		{run: func() error { return setCpu(m.dirPath, cg) }},
		// devices (since kernel 4.15, pseudo-controller)
		//
		// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
		// However, errors from other subsystems are not ignored.
		// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
		{rootlessTolerant: true, run: func() error { return setDevices(m.dirPath, cg) }},
		// cpuset (since kernel 5.0)
		{run: func() error { return setCpuset(m.dirPath, cg) }},
		// hugetlb (since kernel 5.6)
		{run: func() error { return setHugeTlb(m.dirPath, cg) }},
		// freezer (since kernel 5.2, pseudo-controller)
		{run: func() error { return setFreezer(m.dirPath, cg.Freezer) }},
	}

	for _, step := range steps {
		err := step.run()
		if err == nil {
			continue
		}
		if step.rootlessTolerant && m.rootless {
			continue
		}
		return err
	}

	m.config = cg
	return nil
}

支持的controllers都是直接写入到对应的配置文件即可。