1. 介绍
简单玩了下Linux kernel为容器技术提供的基础设施之一namespace(另一个是cgroups),包括uts/user/pid/mnt/ipc/net六个(3.13.0的内核)。这东西主要用来做资源的隔离,我感觉本质上是对全局资源做映射,各个映射之间相互独立,自然也就实现了隔离。主要涉及到的东西是:
- clone
- setns
- unshare
- /proc/pid/ns, /proc/pid/uid_map, /proc/pid/gid_map等
2. 测试流程及代码
下面是一些简单的例子,主要测试uts/pid/user/mnt四个namespace的效果。测试一共涉及三个进程:第一个是代码1通过clone系统调用创建并执行/bin/bash的进程,它是新建的子namespace(容器)中的初始进程;第二个是代码2对应的进程,它打开/proc/pid/ns下的namespace链接文件,并用setns把自己加入/bin/bash进程所在的namespace(容器);第三个是代码2再fork出的子进程,用来测试pid namespace的差异。值得注意的几个点:
- 不同版本的内核setns和unshare对namespace的支持不一样,较老的内核可能只支持ipc/net/uts三个namespace
- 某个进程创建后其pid namespace就固定了,使用setns和unshare改变后,其本身的pid namespace不会改变,只有fork出的子进程的pid namespace改变
- 用setns加入pid namespace时,目标namespace必须是调用进程当前pid namespace的后代(子、孙等)namespace
- 用setns添加mnt namespace应该放在其他namespace之后,否则可能出现无法打开/proc/pid/ns/…的错误
// 代码1: 开一些新的namespace(启动新容器)
#define _GNU_SOURCE
#include <sys/wait.h>
#include <sched.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/* Report the failing call through perror() and exit with a failure status. */
#define errExit(msg) \
    do { \
        perror(msg); \
        exit(EXIT_FAILURE); \
    } while (0)

/*
 * Start function for the cloned child: replaces the process image with
 * /bin/bash. The clone() argument is not used.
 *
 * Quick reference for the exec family (all are wrappers around execve):
 *   argument list passed as separate const char * parameters:
 *     execl, execle (takes envp), execlp (searches PATH)
 *   argument list passed as a char *const [] array:
 *     execv, execvp (searches PATH), execvpe (searches PATH, takes envp)
 *
 * Returns the result of execv(); a negative result terminates the process
 * through errExit() instead of returning.
 */
static int childFunc(void *arg)
{
    static const char shell_path[] = "/bin/bash";
    char *const shell_argv[] = { "/bin/bash", NULL };
    int status;

    (void)arg;

    /* execv() keeps the caller's environment; it only returns on failure. */
    status = execv(shell_path, shell_argv);
    if (status >= 0)
        return status;

    errExit("execve error");
    return status;
}
#define STACK_SIZE (1024 * 1024) /* Stack size for cloned child */

/*
 * Program 1: start /bin/bash (via childFunc) in freshly created
 * uts/mnt/pid/user/ipc namespaces, then wait for it to terminate.
 *
 * CLONE_NEWNET can be OR-ed into the flag set below to give the child its
 * own network namespace as well.
 */
int main(int argc, char *argv[])
{
    const int ns_flags = CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWPID |
                         CLONE_NEWUSER | CLONE_NEWIPC;
    char *stack_base;
    pid_t child;

    (void)argc;
    (void)argv;

    stack_base = malloc(STACK_SIZE);
    if (!stack_base)
        errExit("malloc");

    /* The stack is assumed to grow downward, so hand clone() its top end. */
    child = clone(childFunc, stack_base + STACK_SIZE, ns_flags | SIGCHLD, NULL);
    if (child == -1)
        errExit("clone");
    printf("clone() returned %ld\n", (long) child);

    if (waitpid(child, NULL, 0) == -1)
        errExit("waitpid");
    printf("child has terminated\n");

    exit(EXIT_SUCCESS);
}
// 代码2: 使用setns加入新进程
#define _GNU_SOURCE // ?
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <sys/types.h>
#include <sched.h>
#include <fcntl.h>
#include <wait.h>
// mainly setns and unshare system calls
/* int setns(int fd, int nstype); */
// 不同版本内核/proc/pid/ns下namespace文件情况
/*
CLONE_NEWCGROUP (since Linux 4.6)
fd must refer to a cgroup namespace.
CLONE_NEWIPC (since Linux 3.0)
fd must refer to an IPC namespace.
CLONE_NEWNET (since Linux 3.0)
fd must refer to a network namespace.
CLONE_NEWNS (since Linux 3.8)
fd must refer to a mount namespace.
CLONE_NEWPID (since Linux 3.8)
fd must refer to a descendant PID namespace.
CLONE_NEWUSER (since Linux 3.8)
fd must refer to a user namespace.
CLONE_NEWUTS (since Linux 3.0)
fd must refer to a UTS namespace.
*/
/* // 特殊的pid namespace
CLONE_NEWPID behaves somewhat differently from the other nstype
values: reassociating the calling thread with a PID namespace changes
only the PID namespace that child processes of the caller will be
created in; it does not change the PID namespace of the caller
itself. Reassociating with a PID namespace is allowed only if the
PID namespace specified by fd is a descendant (child, grandchild,
etc.) of the PID namespace of the caller. For further details on
PID namespaces, see pid_namespaces(7).
*/
/*
int unshare(int flags);
CLONE_FILES | CLONE_FS | CLONE_NEWCGROUP | CLONE_NEWIPC | CLONE_NEWNET
| CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUSER | CLONE_NEWUTS | CLONE_SYSVSEM
*/
/* Size of the buffer used to build /proc/<pid>/ns/<name> paths. */
#define MAX_PROCPATH_LEN 1024
/* Print msg together with the source location to stderr, then exit with failure. */
#define errorExit(msg) \
do { fprintf(stderr, "%s in file %s in line %d\n", msg, __FILE__, __LINE__); exit(EXIT_FAILURE); } while (0)
void printInfo(void);
int openAndSetns(const char *path);

/*
 * Program 2: join the namespaces of the process whose pid is given as
 * argv[1], printing pid/uid/gid/hostname before the first join and after
 * every join, then fork a child and print the same information from inside
 * the child.
 *
 * Both processes stay alive afterwards so that their /proc/<pid>/ns entries
 * can be inspected: the child sleeps forever and the parent blocks in
 * waitpid() until the child goes away.
 *
 * Returns 0 after printing the usage text when no pid is given, and 0 once
 * the child has been reaped; any failure exits through errorExit().
 */
int main(int argc, char *argv[])
{
    /*
     * Namespaces are joined in this order. The mount namespace must stay
     * last: once the mount namespace has changed, the /proc/<pid>/ns files
     * of the target may no longer be reachable.
     */
    static const struct {
        const char *label;  /* name shown in the banner line */
        const char *nsfile; /* file name under /proc/<pid>/ns */
    } steps[] = {
        { "uts",   "uts"  },
        { "user",  "user" },
        { "pid",   "pid"  },
        { "ipc",   "ipc"  },
        { "net",   "net"  },
        { "mount", "mnt"  },
    };

    if (argc < 2) {
        fprintf(stdout, "usage : execname pid(find namespaces of this process)\n");
        return 0;
    }

    printInfo();
    for (size_t i = 0; i < sizeof steps / sizeof steps[0]; i++) {
        char path[MAX_PROCPATH_LEN];
        int len;

        fprintf(stdout, "---- setns for %s ----\n", steps[i].label);
        len = snprintf(path, sizeof path, "/proc/%s/ns/%s", argv[1], steps[i].nsfile);
        if (len < 0 || (size_t)len >= sizeof path) {
            errorExit("namespace path too long");
        }
        /*
         * Joining a pid namespace behaves differently from the others: only
         * children created afterwards are placed in it, the calling process
         * itself keeps its pid namespace.
         */
        openAndSetns(path);
        printInfo();
    }

    /*
     * Flush before fork(): otherwise text still sitting in the stdio buffer
     * (stdout redirected to a file or pipe) is copied into the child and
     * ends up being written twice.
     */
    fflush(stdout);

    /* Fork to observe the pid namespace of a child process. */
    pid_t child = fork();
    if (-1 == child) {
        errorExit("failed to fork");
    } else if (child == 0) {
        fprintf(stdout, "********\n");
        fprintf(stdout, "in child process\n");
        printInfo();
        fprintf(stdout, "********\n");
        /* The child never exits, so push its output out explicitly. */
        fflush(stdout);
        for (;;) {
            sleep(5);
        }
    }

    fprintf(stdout, "child pid : %d\n", (int)child);
    fflush(stdout);

    /* Block until the child terminates, retrying when a signal interrupts the wait. */
    while (waitpid(child, NULL, 0) == -1) {
        if (errno != EINTR) {
            fprintf(stderr, "%s\n", strerror(errno));
            errorExit("failed to waitpid");
        }
    }
    return 0;
}
/*
 * Print the calling process's pid, real uid, real gid and host name to
 * stdout, one "name : value" line each. These are the values the pid, user
 * and uts namespaces influence, so the output shows the effect of a
 * namespace change.
 *
 * If uname() fails, the error is reported on stderr and the host name is
 * printed as "(unknown)" instead of reading an unfilled buffer.
 */
void printInfo(void)
{
    struct utsname uts;
    const char *hostname = "(unknown)";

    /* uts namespace */
    if (uname(&uts) == 0) {
        hostname = uts.nodename;
    } else {
        perror("uname");
    }

    /*
     * pid_t, uid_t and gid_t have no printf conversion of their own, so
     * convert to a type that has one; uid_t/gid_t are unsigned on Linux.
     */
    fprintf(stdout, "pid : %ld\n", (long)getpid());          /* pid namespace */
    fprintf(stdout, "uid : %lu\n", (unsigned long)getuid()); /* user namespace */
    fprintf(stdout, "gid : %lu\n", (unsigned long)getgid());
    fprintf(stdout, "hostname : %s\n", hostname);
}
/*
 * Open the namespace file at path (for example /proc/<pid>/ns/uts) and move
 * the calling process into that namespace with setns().
 *
 * path: file system path of the namespace link to join.
 *
 * Returns the result of setns(), which is 0 here because every failure
 * prints the errno text to stderr and exits through errorExit().
 * The descriptor opened for the namespace file is closed before returning,
 * so repeated calls do not accumulate open descriptors.
 */
int openAndSetns(const char *path)
{
    int fd = open(path, O_RDONLY);
    if (-1 == fd) {
        fprintf(stderr, "%s\n", strerror(errno));
        errorExit("failed to open fd");
    }

    /* nstype 0 accepts whatever namespace type the descriptor refers to. */
    int ret = setns(fd, 0);
    int saved_errno = errno; /* close() below may overwrite errno */

    close(fd);

    if (-1 == ret) {
        fprintf(stderr, "%s\n", strerror(saved_errno));
        errorExit("failed to setns");
    }
    return ret;
}
3. 测试效果
- user的效果 : 通过/proc/pid/uid_map和/proc/pid/gid_map设置container外用户id和容器内用户id的映射关系(把这放前面是因为后面hostname和mount需要权限…)
- uts的效果 : 改变container中的hostname不会影响container外面的hostname
- pid和mnt的效果 : container中进程id被重新映射,在container中重新挂载/proc filesystem不会影响容器外的/proc
- setns的测试
- 依次为init进程,container init进程(6个namespace的flag都指定了),新加入container的进程以及其fork出的子进程的namespace情况,可以看到container init进程与init进程的namespace完全不同了,新加入container的进程除了pid与init相同外,其他namespace与container init进程相同,而新加入container的进程fork出的子进程的namespace则与container init进程完全相同
- 新加入container init进程pid namespace的子进程
- 程序2输出
时间: 2024-11-12 05:08:57