Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 84 additions & 26 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ import (
"syscall"
"time"

"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"

"github.com/opencontainers/runtime-spec/specs-go"

"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
Expand Down Expand Up @@ -274,32 +275,66 @@ func (p *setnsProcess) addIntoCgroupV1() error {
return nil
}

// initProcessCgroupPath reports the cgroup directory that the container's
// init process currently belongs to, based on /proc/PID/cgroup. It is
// cgroup v2 only, and yields an empty string whenever the path cannot be
// determined (unknown init pid, not in unified mode, unreadable or
// unparsable cgroup file, or no entry with an empty controller name).
//
// runc exec relies on it as a fallback in these situations:
//
//  1. On cgroup v2 + nesting + domain controllers, joining the initial
//     cgroup may fail with EBUSY
//     (https://github.com/opencontainers/runc/issues/2356);
//
//  2. A container init process that has no cgroupns and has rw access to
//     /sys/fs/cgroup may move itself to any other cgroup, after which the
//     original cgroup disappears.
func (p *setnsProcess) initProcessCgroupPath() string {
	if p.initProcessPid == 0 {
		return ""
	}
	if !cgroups.IsCgroup2UnifiedMode() {
		return ""
	}

	procFile := "/proc/" + strconv.Itoa(p.initProcessPid) + "/cgroup"
	entries, err := cgroups.ParseCgroupFile(procFile)
	if err != nil {
		return ""
	}

	// The path of interest is stored under the empty controller name.
	if rel, found := entries[""]; found {
		return fs2.UnifiedMountpoint + rel
	}
	return ""
}

func (p *setnsProcess) addIntoCgroupV2() error {
sub := p.process.SubCgroupPaths[""]
err := p.manager.AddPid(sub, p.pid())
if err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, adding to initial cgroup may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid, unless sub-cgroup is explicitly set.
if p.initProcessPid != 0 && sub == "" {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
if initCgPath, ok := initCg[""]; ok {
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
logrus.Debugf("adding pid %d to cgroup failed (%v), attempting to join %s",
p.pid(), err, initCgDirpath)
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
}
}
}
if err != nil {
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}
if err == nil {
return nil
}

// Failed to join the configured cgroup. Fall back to container init's cgroup
// unless sub-cgroup is explicitly requested.
var path string
if sub != "" {
goto fail
}
path = p.initProcessCgroupPath()
if path == "" {
goto fail
}
logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q", p.pid(), err, path)
// NOTE: path is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(path, p.pid())
if err != nil {
goto fail
}
return nil

fail:
if p.rootlessCgroups {
// Ignore cgroup join errors when rootless.
return nil
}

return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}

func (p *setnsProcess) addIntoCgroup() error {
Expand All @@ -318,6 +353,8 @@ func (p *setnsProcess) addIntoCgroup() error {
// to join cgroup early, in p.cmd.Start. Returns an *os.File which
// must be closed by the caller after p.Cmd.Start return.
func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
const openFlags = unix.O_PATH | unix.O_DIRECTORY | unix.O_CLOEXEC

if !cgroups.IsCgroup2UnifiedMode() {
return nil, nil
}
Expand All @@ -335,14 +372,28 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
return nil, fmt.Errorf("bad sub cgroup path: %s", sub)
}

fd, err := cgroups.OpenFile(base, sub, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC)
fd, err := cgroups.OpenFile(base, sub, openFlags)
if err == nil {
goto success
}
// Failed to open the configured cgroup. Fall back to container init's cgroup
// unless sub-cgroup is explicitly requested. The fallback logic should be
// the same as in addIntoCgroupV2.
if sub != "" {
goto fail
}
cgroup = p.initProcessCgroupPath()
if cgroup == "" {
goto fail
}
logrus.Debugf("failed to open configured cgroup (%v), will open container init cgroup %q", err, cgroup)
// NOTE: path is not guaranteed to exist because we didn't pause the container.
fd, err = cgroups.OpenFile(cgroup, "", openFlags)
if err != nil {
if p.rootlessCgroups {
return nil, nil
}
return nil, fmt.Errorf("can't open cgroup: %w", err)
goto fail
}

success:
logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
if p.cmd.SysProcAttr == nil {
p.cmd.SysProcAttr = &syscall.SysProcAttr{}
Expand All @@ -351,6 +402,13 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())

return fd, nil

fail:
// Ignore cgroup join error for rootless.
if p.rootlessCgroups {
return nil, nil
}
return nil, fmt.Errorf("can't open cgroup: %w", err)
}

// startWithCgroupFD starts a process via clone3 with CLONE_INTO_CGROUP,
Expand Down
35 changes: 35 additions & 0 deletions tests/integration/exec.bats
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,41 @@ function check_exec_debug() {
[ "$status" -eq 0 ]
}

# https://github.com/opencontainers/runc/issues/5089
@test "runc exec [init changes cgroup]" {
	requires root cgroups_v2

	# A scratch cgroup, unique per test run, that the container init
	# will migrate itself into.
	NEW_CGROUP_REL=/runc-tst-$$
	NEW_CGROUP=/sys/fs/cgroup$NEW_CGROUP_REL
	mkdir "$NEW_CGROUP"

	# The container is placed into a $CGROUP_V2_PATH cgroup.
	set_cgroups_path
	# And upon the start it moves itself into $NEW_CGROUP. This needs a
	# writable cgroup mount and no cgroup namespace.
	set_cgroup_mount_writable
	update_config '   .linux.namespaces -= [{"type": "cgroup"}]
			| .process.args = ["sh", "-c", "echo 1 > '$NEW_CGROUP'/cgroup.procs && exec sleep 1h"]'

	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
	[ "$status" -eq 0 ]
	testcontainer test_busybox running
	# Give the container init some time to move itself to the new cgroup.
	sleep 1
	# Remove the original container cgroup. If systemd cgroup manager is used by runc,
	# the cgroup might have already been deleted by systemd, so we ignore rmdir errors.
	rmdir "$CGROUP_V2_PATH" || true
	# Either way, the original cgroup must be gone by now.
	[ ! -d "$CGROUP_V2_PATH" ]

	# Test that runc exec is able to fall back to container's init cgroup
	# even if the original cgroup is gone.
	runc exec test_busybox cat /proc/self/cgroup
	[ "$status" -eq 0 ]
	[ "$output" = "0::$NEW_CGROUP_REL" ]

	# Cleanup.
	runc delete -f test_busybox
	rmdir "$NEW_CGROUP"
}

@test "runc exec [execve error]" {
cat <<EOF >rootfs/run.sh
#!/mmnnttbb foo bar
Expand Down