System Call Interposition: how to implement virtualization

From Virtualsquare
Jump to: navigation, search

System Call Interposition (SCI) support tracks all the system service requests made by processes. Each request can be modified or denied.

It is possible to implement tools to trace, monitor, or virtualize processes.

This posting shows three different ways to implement a System Call Interposition service. Each SCI service is used to solve the same simple virtualization problem, hiding the contents of the file /etc/passwd, so that the pros and cons of each proposal can be compared.

This example can also be used as a proof-of-concept test when proposing other services for SCI.

Contents

The example

When a process tries to open the file "/etc/passwd" the system call must fail returning errno=ENOENT.

Purelibc

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <purelibc.h>
#include <errno.h>

/* Real system-call entry point; init_test() stores here the value returned
 * by _pure_start(). */
static sfun _native_syscall;

/* Return nonzero when path is non-NULL and is exactly "/etc/passwd". */
static int is_hidden_path(const char *path)
{
  return path != NULL && strcmp(path, "/etc/passwd") == 0;
}

/*
 * System-call hook handed to _pure_start() by init_test().
 *
 * sysno is the system call number; six long arguments are always fetched
 * from the variadic list and passed on unchanged.
 *
 * An open (and, where the architecture defines it, openat) whose path
 * argument is exactly "/etc/passwd" is refused: errno is set to ENOENT and
 * -1 is returned without calling the real entry point.  Every other request
 * is forwarded to _native_syscall and its result returned as is.
 *
 * The match is a plain string comparison, so relative paths or other
 * spellings of the same file are not hidden.
 */
static long int mysc(long int sysno, ...){
  va_list ap;
  long int a[6];
  const char *path = NULL;
  int i;

  va_start (ap, sysno);
  for (i = 0; i < 6; i++)
    a[i] = va_arg(ap, long int);
  va_end(ap);

  /* Pick the argument slot that carries the path for this call. */
#ifdef __NR_open
  if (sysno == __NR_open)
    path = (const char *)a[0];
#endif
#ifdef __NR_openat
  /* openat(dirfd, path, ...): dirfd is irrelevant for an absolute path. */
  if (sysno == __NR_openat)
    path = (const char *)a[1];
#endif

  if (is_hidden_path(path)) {
    errno = ENOENT;
    return -1;
  }
  return _native_syscall(sysno, a[0], a[1], a[2], a[3], a[4], a[5]);
}
  void
  __attribute ((constructor))
init_test (void)
{
  _native_syscall=_pure_start(mysc,NULL,PUREFLAG_STDALL);
}

Compile this source code (sci_purelibc.c):

 gcc -shared -o sci_purelibc.so sci_purelibc.c

preload purelibc and this shared object:

 export LD_PRELOAD=libpurelibc.so:/tmp/tests/syscall_interposition/sci_purelibc.so

and now /etc/passwd has disappeared

 $cat /etc/passwd
 cat: /etc/passwd: No such file or directory

Requirements: depends on the purelibc library

Pros: very fast.

Cons: unsafe (can be easily circumvented), it works only for dynamically linked executables.

ptrace

#include  <sys/ptrace.h>
#include  <sys/types.h>
#include  <sys/wait.h>
#include  <unistd.h>
#include  <signal.h>
#include  <stdio.h>
#include  <string.h>
#include  <limits.h>
#include  <errno.h>
#include  <sys/user.h>
#include  <asm/ptrace-abi.h>
#include  <asm/unistd.h>

int main(int argc, char *argv[])
{
  pid_t child;
  long orig_eax;
  child = fork();
  if(child == 0) {
    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
    argv++;
    execvp(argv[0],argv);
  }
  else {
    int status;
    int gotpasswd=0;
    int out=0;
    while(1) {
      waitpid(child,&status,0);
      if(WIFEXITED(status) || WIFSIGNALED(status))
        break;
      orig_eax = ptrace(PTRACE_PEEKUSER, child, 4 * ORIG_EAX, NULL);
      if (gotpasswd == 0) {
        if (orig_eax == __NR_open) {
          if (out==0) {
            char path[PATH_MAX];
            int i;
            long pathaddr=ptrace(PTRACE_PEEKUSER, child, 4 * EBX, NULL);
            errno=0;
            for (i=0; i<PATH_MAX; i++) {
              if ((i&0x3) == 0) {
                long chunk=ptrace(PTRACE_PEEKDATA, child, (char *)(pathaddr+i), 0);
                if (errno != 0)
                  break;
                * ((long *) (&path[i])) = chunk;
              }
              if (path[i] == 0)
                break;
            }
            if (strcmp(path,"/etc/passwd")==0) {
              ptrace(PTRACE_POKEUSER, child, 4 * ORIG_EAX, __NR_getpid);
              gotpasswd=1;
            }
          }
          out = 1-out;
        }
      } else {
        ptrace(PTRACE_POKEUSER, child, 4 * EAX, -ENOENT);
        gotpasswd=out=0;
      }
      ptrace(PTRACE_SYSCALL, child, NULL, NULL);
    }
  }
  return 0;
}

Compile the source code (sci_ptrace.c)

 gcc -o sci_ptrace sci_ptrace.c

Run it:

 ./sci_ptrace cat /etc/passwd
 cat: /etc/passwd: No such file or directory

Requirements: none (the kernel must provide ptrace)

Pros: it works

Cons: slow, many "addresses" are processor architecture dependent, the interface is not clean (some signals, such as SIGSTOP/SIGCONT, cannot be used, and it overrides the natural semantics of the wait system call).

kmview.ko (based on utrace)

#define _GNU_SOURCE
#include  <sys/types.h>
#include  <sys/wait.h>
#include  <unistd.h>
#include  <stdio.h>
#include  <stdlib.h>
#include  <limits.h>
#include  <errno.h>
#include  <fcntl.h>
#include  <string.h>
#include  <asm/unistd.h>
#include <sys/ioctl.h>
#include <kmview.h>


/*
 * Signal handler (installed for SIGCHLD by main): reaps one terminated
 * child with wait().
 *
 * signal - signal number passed by the system; not used.
 *
 * errno is saved and restored around wait(), so code interrupted by the
 * handler does not see an errno value set by this call.
 */
void dowait(int signal)
{
  int saved_errno = errno;
  int w;

  (void) signal;
  wait(&w);
  errno = saved_errno;
}

#ifdef OPT_PATH_HASH
/*
 * Hash a NUL-terminated string.
 *
 * Starting from 0, each character c updates the accumulator as
 *   acc ^= (acc << 5) + (acc >> 2) + c
 * The empty string therefore hashes to 0.  main() stores the result for
 * "/etc/passwd" in the ghosthash64 table passed to KMVIEW_GHOSTMOUNTS.
 */
static int hash(char *s)
{
    int acc = 0;
    char *p;

    for (p = s; *p != '\0'; p++) {
        int mix = (acc << 5) + (acc >> 2) + *p;

        acc ^= mix;
    }
    return acc;
}
#endif

/*
 * Run the command given in argv[1..] under kmview and make every open of
 * exactly "/etc/passwd" fail with ENOENT.
 *
 * The parent reads events from /dev/kmview and answers each one; the child
 * attaches itself with KMVIEW_ATTACH and then execs the command.
 *
 * Compile-time options:
 *   OPT_OPEN_ONLY - passes a syscall bitmap (filled, then cleared for
 *                   __NR_open) with KMVIEW_SYSCALLBITMAP
 *   OPT_PATH_HASH - sets KMVIEW_FLAG_PATH_SYSCALL_SKIP and registers the
 *                   length and hash of "/etc/passwd" with KMVIEW_GHOSTMOUNTS
 *   OPT_FDSET     - sets KMVIEW_FLAG_FDSET
 *
 * Exits with 0 when a KMVIEW_EVENT_TERMTHREAD reports no remaining threads,
 * with 1 on usage or setup errors, and with 127 in the child if the exec
 * fails.
 *
 * NOTE(review): the exact semantics of the kmview ioctls are defined by
 * kmview.h / kmview.ko, which are not visible here; comments below describe
 * only what this code passes to them.
 */
int main(int argc, char *argv[])
{
  int fd;
  struct kmview_event event;
  int flags=0;
  pid_t pid;
#ifdef OPT_OPEN_ONLY
  int bitmap[INT_PER_MAXSYSCALL];
#endif
#ifdef OPT_PATH_HASH
  struct ghosthash64 gh;
#endif

  if (argc < 2) {
    fprintf(stderr, "usage: %s command [args...]\n",
            argc > 0 ? argv[0] : "sci_kmview");
    exit(1);
  }

  fd=open("/dev/kmview",O_RDONLY);
  if (fd <0) {
    perror("/dev/kmview");
    exit(1);
  }
#ifdef OPT_OPEN_ONLY
  scbitmap_fill(bitmap);
  scbitmap_clr(bitmap, __NR_open);
  ioctl(fd, KMVIEW_SYSCALLBITMAP,bitmap);
#endif
#ifdef OPT_PATH_HASH
  flags|=KMVIEW_FLAG_PATH_SYSCALL_SKIP;
  gh.deltalen[0]=strlen("/etc/passwd");
  gh.hash[0] = hash("/etc/passwd");
  gh.deltalen[1]=GH_TERMINATE;
  ioctl(fd,KMVIEW_GHOSTMOUNTS,&gh);
#endif
#ifdef OPT_FDSET
  flags|=KMVIEW_FLAG_FDSET;
#endif
  ioctl(fd, KMVIEW_SET_FLAGS, flags);
  signal(SIGCHLD,dowait);

  pid = fork();
  if (pid < 0) {
    perror("fork");
    exit(1);
  }
  if (pid > 0) {
    while (1) {
      if (read(fd,&event,sizeof(event)) < 0) {
        /* A handled signal (e.g. SIGCHLD) may interrupt the read. */
        if (errno == EINTR)
          continue;
        perror("read /dev/kmview");
        exit(1);
      }
      switch (event.tag) {
        case KMVIEW_EVENT_NEWTHREAD:
          {
            /* Use the id reported by the event for both fields
             * (identity mapping between kmpid and umpid). */
            struct kmview_ioctl_umpid ump;
            ump.kmpid=event.x.newthread.kmpid;
            ump.umpid=event.x.newthread.kmpid;
            ioctl(fd, KMVIEW_UMPID, &ump);
            break;
          }
        case KMVIEW_EVENT_TERMTHREAD:
          if (event.x.termthread.remaining == 0)
            exit (0);
          break;
        case KMVIEW_EVENT_SYSCALL_ENTRY:
          {
            int hide = 0;

            if (event.x.syscall.scno == __NR_open) {
              char path[PATH_MAX];
              struct kmview_ioctl_data data={event.x.syscall.x.umpid,
                event.x.syscall.args[0],PATH_MAX,path};

              /* Keep path a valid string even if the ioctl fails or
               * fills the whole buffer. */
              path[0] = '\0';
              ioctl(fd,KMVIEW_READSTRINGDATA, &data);
              path[PATH_MAX-1] = '\0';
              hide = (strcmp(path,"/etc/passwd") == 0);
            }
            if (hide) {
              struct kmview_event_ioctl_sysreturn outevent;

              memset(&outevent, 0, sizeof(outevent));
              outevent.x.kmpid=event.x.syscall.x.umpid;
              outevent.retval=-1;
              outevent.erno = ENOENT;
              ioctl(fd,KMVIEW_SYSVIRTUALIZED, &outevent);
            } else
              ioctl(fd, KMVIEW_SYSRESUME, event.x.syscall.x.umpid);
            break;
          }
        default:
          /* Other event tags are not answered by this example. */
          break;
      }
    }
  } else { /* traced root process*/
    ioctl(fd, KMVIEW_ATTACH);
    close(fd);
    execvp(argv[1],argv+1);
    /* Only reached when the exec failed. */
    perror(argv[1]);
    _exit(127);
  }
}

Compile the source code (sci_kmview.c)

 gcc -o sci_kmview sci_kmview.c

Run it:

 ./sci_kmview cat /etc/passwd
 cat: /etc/passwd: No such file or directory

The code includes several optimizations:

  • OPT_OPEN_ONLY: the kernel module filters only the "open" system calls
  • OPT_PATH_HASH: when a system call uses a path, kmview.ko forwards only those whose path matches a hash key
  • OPT_FDSET: kmview.ko manages a table of the "virtualized" file descriptors

Optimizations can be added at compile time using a combination of -DOPT_OPEN_ONLY, -DOPT_PATH_HASH and -DOPT_FDSET.

Requirements: the kernel must support utrace and the kmview.ko kernel module must be loaded

Pros: fast, several optimizations can run in kernel space, clean design (events can be read from a device), architecture independent.

Cons: utrace is not a feature of the vanilla Linux kernel

Basic Performance Evaluation

The benchmarking code is the following:

#include <stdio.h>
#include <fcntl.h>
/*
 * Benchmark driver: 100000 iterations, each opening and closing
 * "/etc/passwd" and then "/etc/hosts" read-only.
 *
 * The result of open() is not checked; close() is called on whatever
 * descriptor value open() returned.
 *
 * Always returns 0.
 */
int main(void)
{
        int i;
        int fd;
        for (i=0; i<100000; i++) {
                fd=open("/etc/passwd",O_RDONLY);
                close(fd);
                fd=open("/etc/hosts",O_RDONLY);
                close(fd);
        }
        return 0;
}

The execution times are the following:

* kernel (not virtualized): 0.8sec
* purelibc: 0.48sec
* ptrace: ~37.5sec
* kmview.ko (no opt): ~22sec
* kmview.ko (opt): ~7.1sec

(purelibc virtualization is even faster than the non-virtualized case because it generates fewer system calls)

Please note that this example has been designed to provide almost the worst case for the virtualizing service. The implementation based on kmview creates a minimal overhead when tested in a more common scenario (e.g. a compilation):

$ time gcc -o test test.c
real    0m0.147s
user    0m0.084s
sys     0m0.044s
$ time ./sci_kmview gcc -o test test.c
real    0m0.146s
user    0m0.088s
sys     0m0.048s
Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox