System Call Interposition: how to implement virtualization
System Call Interposition (SCI) support tracks all the system service requests made by processes. Each request can be modified or denied.
It is possible to implement tools to trace, monitor, or virtualize processes.
This posting shows three different ways to implement a System Call Interposition service. Each SCI service is used to solve the same simple virtualization problem, hiding the contents of the file /etc/passwd, so that the pros and cons of each proposal can be shown.
This example can also be used as a proof-of-concept test when proposing other services for SCI.
Contents |
The example
When a process tries to open the file "/etc/passwd" the system call must fail returning errno=ENOENT.
Purelibc
#define _GNU_SOURCE #include <stdio.h> #include <string.h> #include <stdarg.h> #include <sys/syscall.h> #include <unistd.h> #include <purelibc.h> #include <errno.h> static sfun _native_syscall; static char buf[128]; static long int mysc(long int sysno, ...){ va_list ap; long int a1,a2,a3,a4,a5,a6; va_start (ap, sysno); a1=va_arg(ap,long int); a2=va_arg(ap,long int); a3=va_arg(ap,long int); a4=va_arg(ap,long int); a5=va_arg(ap,long int); a6=va_arg(ap,long int); va_end(ap); if (sysno == __NR_open) { char *path=(char *)a1; if (a1 && strcmp(path,"/etc/passwd")==0) { errno=ENOENT; return -1; } } return _native_syscall(sysno,a1,a2,a3,a4,a5,a6); } void __attribute ((constructor)) init_test (void) { _native_syscall=_pure_start(mysc,NULL,PUREFLAG_STDALL); }
Compile this source code (sci_purelibc.c):
gcc -shared -o sci_purelibc.so sci_purelibc.c
preload purelibc and this shared object:
export LD_PRELOAD=libpurelibc.so:/tmp/tests/syscall_interposition/sci_purelibc.so
and now /etc/passwd has disappeared
$cat /etc/passwd cat: /etc/passwd: No such file or directory
Requirements: depends on the purelibc library
Pros: very fast.
Cons: unsafe (can be easily circumvented), it works only for dynamically linked executables.
ptrace
#include <sys/ptrace.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> #include <stdio.h> #include <limits.h> #include <errno.h> #include <sys/user.h> #include <asm/ptrace-abi.h> #include <asm/unistd.h> int main(int argc, char *argv[]) { pid_t child; long orig_eax; child = fork(); if(child == 0) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); argv++; execvp(argv[0],argv); } else { int status; int gotpasswd=0; int out=0; while(1) { waitpid(child,&status,0); if(WIFEXITED(status) || WIFSIGNALED(status)) break; orig_eax = ptrace(PTRACE_PEEKUSER, child, 4 * ORIG_EAX, NULL); if (gotpasswd == 0) { if (orig_eax == __NR_open) { if (out==0) { char path[PATH_MAX]; int i; long pathaddr=ptrace(PTRACE_PEEKUSER, child, 4 * EBX, NULL); errno=0; for (i=0; i<PATH_MAX; i++) { if ((i&0x3) == 0) { long chunk=ptrace(PTRACE_PEEKDATA, child, (char *)(pathaddr+i), 0); if (errno != 0) break; * ((long *) (&path[i])) = chunk; } if (path[i] == 0) break; } if (strcmp(path,"/etc/passwd")==0) { ptrace(PTRACE_POKEUSER, child, 4 * ORIG_EAX, __NR_getpid); gotpasswd=1; } } out = 1-out; } } else { ptrace(PTRACE_POKEUSER, child, 4 * EAX, -ENOENT); gotpasswd=out=0; } ptrace(PTRACE_SYSCALL, child, NULL, NULL); } } return 0; }
Compile the source code (sci_ptrace.c)
gcc -o sci_ptrace sci_ptrace.c
Run it:
./sci_ptrace cat /etc/passwd cat: /etc/passwd: No such file or directory
Requirements: none (the kernel must provide ptrace)
Pros: it works
Cons: slow, many "addresses" are processor architecture dependent, the interface is not clean (some signals, SIGSTOP/SIGCONT, cannot be used, and it overrides the natural semantics of the wait system call).
kmview.ko (based on utrace)
#define _GNU_SOURCE #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <limits.h> #include <errno.h> #include <fcntl.h> #include <string.h> #include <asm/unistd.h> #include <sys/ioctl.h> #include <kmview.h> void dowait(int signal) { int w; wait(&w); } #ifdef OPT_PATH_HASH static int hash(char *s) { int rv=0; while (*s) { rv ^= (rv << 5) + (rv >> 2) + *s; s++; } return rv; } #endif main(int argc, char *argv[]) { int fd; struct kmview_event event; int flags=0; #ifdef OPT_OPEN_ONLY int bitmap[INT_PER_MAXSYSCALL]; #endif #ifdef OPT_PATH_HASH struct ghosthash64 gh; #endif fd=open("/dev/kmview",O_RDONLY); if (fd <0) exit(1); #ifdef OPT_OPEN_ONLY scbitmap_fill(bitmap); scbitmap_clr(bitmap, __NR_open); ioctl(fd, KMVIEW_SYSCALLBITMAP,bitmap); #endif #ifdef OPT_PATH_HASH flags|=KMVIEW_FLAG_PATH_SYSCALL_SKIP; gh.deltalen[0]=strlen("/etc/passwd"); gh.hash[0] = hash("/etc/passwd"); gh.deltalen[1]=GH_TERMINATE; ioctl(fd,KMVIEW_GHOSTMOUNTS,&gh); #endif #ifdef OPT_FDSET flags|=KMVIEW_FLAG_FDSET; #endif ioctl(fd, KMVIEW_SET_FLAGS, flags); signal(SIGCHLD,dowait); if (fork()) { while (1) { read(fd,&event,sizeof(event)); switch (event.tag) { case KMVIEW_EVENT_NEWTHREAD: { struct kmview_ioctl_umpid ump; ump.kmpid=event.x.newthread.kmpid; ump.umpid=event.x.newthread.kmpid; ioctl(fd, KMVIEW_UMPID, &ump); break; } case KMVIEW_EVENT_TERMTHREAD: if (event.x.termthread.remaining == 0) exit (0); break; case KMVIEW_EVENT_SYSCALL_ENTRY: if (event.x.syscall.scno == __NR_open) { char path[PATH_MAX]; struct kmview_ioctl_data data={event.x.syscall.x.umpid, event.x.syscall.args[0],PATH_MAX,path}; ioctl(fd,KMVIEW_READSTRINGDATA, &data); if (strcmp(path,"/etc/passwd") == 0) { struct kmview_event_ioctl_sysreturn outevent; outevent.x.kmpid=event.x.syscall.x.umpid; outevent.retval=-1; outevent.erno = ENOENT; ioctl(fd,KMVIEW_SYSVIRTUALIZED, &outevent); } else ioctl(fd, KMVIEW_SYSRESUME, event.x.syscall.x.umpid); } else ioctl(fd, 
KMVIEW_SYSRESUME, event.x.syscall.x.umpid); break; } } } else { /* traced root process*/ ioctl(fd, KMVIEW_ATTACH); close(fd); argv++; execvp(argv[0],argv); } }
Compile the source code (sci_kmview.c)
gcc -o sci_kmview sci_kmview.c
Run it:
./sci_kmview cat /etc/passwd cat: /etc/passwd: No such file or directory
The code includes several optimizations:
- OPT_OPEN_ONLY: the kernel module filters only the "open" system calls
- OPT_PATH_HASH: when a system call uses a path, kmview.ko forwards only those calls whose path matches a hash key
- OPT_FDSET: kmview.ko manages a table of the "virtualized" file descriptors
Optimizations can be added at compile time using a combination of -DOPT_OPEN_ONLY, -DOPT_PATH_HASH and -DOPT_FDSET.
Requirements: the kernel must support utrace and the kmview.ko kernel module must be loaded
Pros: fast, several optimizations can run in kernel space, clean design (event can be read from a device), architecture independent.
Cons: utrace is not a feature of the vanilla Linux kernel
Basic Performance Evaluation
The benchmarking code is the following:
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Benchmark: open and close "/etc/passwd" and "/etc/hosts" 100000 times.
 *
 * close() is called even when open() failed (as it does for /etc/passwd
 * under the virtualized runs), so that every run issues the same sequence
 * of calls and the timings stay comparable.
 */
int main(void)
{
    int i;
    int fd;

    for (i = 0; i < 100000; i++) {
        fd = open("/etc/passwd", O_RDONLY);
        close(fd);
        fd = open("/etc/hosts", O_RDONLY);
        close(fd);
    }
    return 0;
}
The execution times are the following:
* kernel (not virtualized): 0.8sec * purelibc: 0.48sec * ptrace: ~37.5sec * kmview.ko (no opt): ~22sec * kmview.ko (opt): ~7.1sec
(purelibc virtualization is even faster than the non-virtualized case because it generates fewer system calls)
Please note that this example has been designed to provide almost the worst case for the virtualizing service. The implementation based on kmview creates a minimal overhead when tested in a more common scenario (e.g. a compilation):
$ time gcc -o test test.c real 0m0.147s user 0m0.084s sys 0m0.044s $ time ./sci_kmview gcc -o test test.c real 0m0.146s user 0m0.088s sys 0m0.048s