diff -u -r --show-c-function src.orig/sys/kern/init_sysent.c src/sys/kern/init_sysent.c --- src.orig/sys/kern/init_sysent.c Thu Mar 16 18:47:32 2006 +++ src/sys/kern/init_sysent.c Mon Aug 21 04:21:16 2006 @@ -2,7 +2,7 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/kern/init_sysent.c,v 1.195.2.2 2006/03/17 01:47:32 rwatson Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp */ @@ -485,4 +485,5 @@ struct sysent sysent[] = { { SYF_MPSAFE | AS(auditctl_args), (sy_call_t *)auditctl, AUE_NULL }, /* 453 = auditctl */ { SYF_MPSAFE | AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL }, /* 454 = _umtx_op */ { SYF_MPSAFE | AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL }, /* 455 = thr_new */ + { SYF_MPSAFE | AS(jail_set_resource_limits_args), (sy_call_t *)jail_set_resource_limits, AUE_NULL }, /* 456 = jail_set_resource_limits */ }; diff -u -r --show-c-function src.orig/sys/kern/kern_jail.c src/sys/kern/kern_jail.c --- src.orig/sys/kern/kern_jail.c Sat Nov 12 20:12:32 2005 +++ src/sys/kern/kern_jail.c Mon Aug 28 23:09:39 2006 @@ -5,6 +5,35 @@ * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- + * + * Portions copyright (c) 2006 Chris Jones + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * */ #include @@ -15,12 +44,19 @@ __FBSDID("$FreeBSD: src/sys/kern/kern_ja #include #include #include +#include #include #include #include #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include @@ -71,6 +107,17 @@ SYSCTL_INT(_security_jail, OID_AUTO, chf &jail_chflags_allowed, 0, "Processes in jail can alter system file flags"); +int jail_limit_memory = 0; +SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW, + &jail_limit_memory, 0, + "Limit jails' memory usage"); + +int jail_memory_pager_interval = 5; +SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval, + CTLTYPE_INT | CTLFLAG_RW, + &jail_memory_pager_interval, 0, + "Interval between jail memory limit checks"); + /* allprison, lastprid, and prisoncount are protected by allprison_mtx. 
*/ struct prisonlist allprison; struct mtx allprison_mtx; @@ -92,6 +139,99 @@ init_prison(void *data __unused) SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL); +static void +jpager_td(void *arg) +{ + struct proc *p; + struct prison *pr = arg; + struct thread *td; + long limit, cursize, newsize, usage; + int breakout; + int flags = J_PAGER_TD_ACTIVE; + pr->pr_pager_flags_ptr = &flags; + + for (;;) { + if (flags & J_PAGER_TD_DIE) + break; + + if (jail_limit_memory && pr->pr_mem_limit) { + /* + * TODO: consider whether it might be better to start + * pushing back when we approach the limit, rather than + * when we hit it. + * + */ + limit = prison_memory_limit(pr); + usage = prison_memory(pr); + + /* + * The logic from vm_daemon() really needs to go here. + * Problem: we want to push things below their rlimits, + * and vm_daemon doesn't do that. It'd be better to + * refactor vm_daemon to fit, but this'll do for now. + * + */ + + if ((usage - limit) > 0) { + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + + if (pr != p->p_ucred->cr_prison) + continue; + + PROC_LOCK(p); + if (p->p_flag & (P_SYSTEM | P_WEXIT)) { + PROC_UNLOCK(p); + continue; + } + + mtx_lock_spin(&sched_lock); + breakout = 0; + FOREACH_THREAD_IN_PROC(p, td) { + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td)) { + breakout = 1; + break; + } + } + mtx_unlock_spin(&sched_lock); + if (breakout) { + PROC_UNLOCK(p); + continue; + } + + /* NOTE: we differ here from vm_daemon b/c we don't + * care about the rlimit; things that are exceeding that will + * get caught in due course. We need, however, to decrease + * the pressure on our permitted memory allocation. Fortunately, + * we only care about eventually hitting the limit, so if we + * don't get there right away, it's okay. + */ + + /* TODO: this arbitrarily reduces each process's space by + * 6.25% (until it's completely swapped out) while + * we're under memory pressure. 
A better way would be + * to either hit large processes first, or to hit the + * least-active processes first, or go proportionally, + * or .... + */ + newsize = cursize = vmspace_resident_count(p->p_vmspace); + newsize -= newsize / 16; + if (cursize < 0) + newsize = 0; + PROC_UNLOCK(p); + vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize); + } /* end LIST_FOREACH procs */ + sx_sunlock(&allproc_lock); + } + } + tsleep(pr, 0, "-", jail_memory_pager_interval * hz); + } + + kthread_exit(0); +} + /* * MPSAFE * @@ -106,6 +246,7 @@ jail(struct thread *td, struct jail_args struct prison *pr, *tpr; struct jail j; struct jail_attach_args jaa; + struct proc *j_pager_proc = NULL; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); @@ -135,7 +276,9 @@ jail(struct thread *td, struct jail_args goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; + pr->pr_sched_shares = j.sched_shares; pr->pr_securelevel = securelevel; + pr->pr_mem_limit = j.mem_limit; /* Determine next pr_id and add prison to allprison list. */ mtx_lock(&allprison_mtx); @@ -159,6 +302,11 @@ next: prisoncount++; mtx_unlock(&allprison_mtx); + if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id)) + goto e_dropprref; + KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc")); + pr->pr_pager = j_pager_proc; + error = jail_attach(td, &jaa); if (error) goto e_dropprref; @@ -168,6 +316,10 @@ next: td->td_retval[0] = jaa.jid; return (0); e_dropprref: + if (j_pager_proc != NULL) { + *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; + wakeup(pr); + } mtx_lock(&allprison_mtx); LIST_REMOVE(pr, pr_list); prisoncount--; @@ -282,6 +434,10 @@ prison_free(struct prison *pr) prisoncount--; mtx_unlock(&allprison_mtx); + /* Tell scheduler, pager to die. No need to wait. 
*/ + *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; + wakeup(pr); + TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; @@ -393,6 +549,41 @@ prison_if(struct ucred *cred, struct soc return (ok); } +/* Return the resident memory, in bytes, of all processes in prison pr. */ +long +prison_memory(struct prison *pr) +{ + struct proc *p; + long mem_used = 0; + + /* + * TODO: walking every process once per jail is wasteful; it would + * be cheaper to make one pass per period and update each jail. + * allproc_lock keeps the process list stable while we walk it. + */ + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (!jailed(p->p_ucred) || + (pr != p->p_ucred->cr_prison)) + continue; + mem_used += vmspace_resident_count(p->p_vmspace); + } + sx_sunlock(&allproc_lock); + mem_used *= PAGE_SIZE; + return mem_used; +} + +/* Return the memory limit, in bytes, configured for prison pr. */ +long +prison_memory_limit(struct prison *pr) +{ + long memlimit; + mtx_lock(&pr->pr_mtx); + memlimit = (long) pr->pr_mem_limit; + mtx_unlock(&pr->pr_mtx); + return memlimit; +} + /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ @@ -523,6 +714,52 @@ prison_enforce_statfs(struct ucred *cred } } +/* + * Change resource limit for a prison. + * + * unsigned int jid: id of jail to mess with + * + * int cpushares: 0 -> remove prison from cpu limits + * -1 -> don't change existing shares + * >0 -> set cpu shares + * + * int memlimit: 0 -> remove prison from mem limits + * -1 -> don't change existing limit + * >0 -> set memory limit (bytes) + * + * TODO: might this be better handled via a writable + * sysctl than with a new syscall? 
+ */ +int +jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap) +{ + struct prison *pr; + int error; + + error = suser(td); + if (error) + return (error); + + mtx_lock(&allprison_mtx); + LIST_FOREACH(pr, &allprison, pr_list) { + if (pr->pr_id == uap->jid) + break; + } + if (NULL == pr) { + mtx_unlock(&allprison_mtx); + return 1; + } + + mtx_lock(&pr->pr_mtx); + if (-1 != uap->cpushares) + pr->pr_sched_shares = uap->cpushares; + if (-1 != uap->memlimit) + pr->pr_mem_limit = uap->memlimit; + mtx_unlock(&pr->pr_mtx); + mtx_unlock(&allprison_mtx); + return 0; +} + static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { @@ -555,6 +792,10 @@ retry: strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path)); strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host)); xp->pr_ip = pr->pr_ip; + xp->pr_sched_shares = pr->pr_sched_shares; + xp->pr_estcpu = pr->pr_estcpu; + xp->pr_mem_limit = pr->pr_mem_limit; + xp->pr_mem_usage = pr->pr_mem_usage; mtx_unlock(&pr->pr_mtx); xp++; } diff -u -r --show-c-function src.orig/sys/kern/sched_4bsd.c src/sys/kern/sched_4bsd.c --- src.orig/sys/kern/sched_4bsd.c Fri Jun 16 16:11:55 2006 +++ src/sys/kern/sched_4bsd.c Mon Aug 21 05:05:23 2006 @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_4 #include #include +#include #include #include #include @@ -176,6 +177,11 @@ static void resetpriority_thread(struct static int forward_wakeup(int cpunum); #endif +static uint32_t total_cpu_sched_shares; +static u_int total_est_cpu; +extern struct mtx allprison_mtx; +extern int prisoncount; + static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, @@ -289,6 +295,18 @@ SYSCTL_INT(_kern_sched, OID_AUTO, kgfoll &sched_kgfollowons, 0, "number of followons done in a ksegrp"); +static int sched_limitjailcpu = 0; +SYSCTL_INT(_kern_sched, OID_AUTO, limit_jail_cpu, + CTLFLAG_RW, + &sched_limitjailcpu, 0, + "limit jailed process cpu usage"); + +static int sched_unjailedProcessShares = 0; +SYSCTL_INT(_kern_sched, OID_AUTO, 
system_cpu_shares, + CTLTYPE_INT | CTLFLAG_RW, + &sched_unjailedProcessShares, 0, + "number of shares to allocate to unjailed processes"); + static __inline void sched_load_add(void) { @@ -435,10 +453,23 @@ schedcpu(void) struct proc *p; struct kse *ke; struct ksegrp *kg; + struct prison *pr; int awake, realstathz; realstathz = stathz ? stathz : hz; + /* + * Hold allprison_mtx throughout so the jail list stays stable while + * we zero and then recalculate each jail's estimated CPU usage. + * TODO: this is excessively icky. + */ sx_slock(&allproc_lock); + mtx_lock(&allprison_mtx); + if (prisoncount) { + LIST_FOREACH(pr, &allprison, pr_list) { + pr->pr_estcpu = 0; + } + } + total_est_cpu = 0; FOREACH_PROC_IN_SYSTEM(p) { /* * Prevent state changes and protect run queue. @@ -523,6 +554,12 @@ schedcpu(void) if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); + total_est_cpu += kg->kg_estcpu; + if (sched_limitjailcpu && + NULL != kg->kg_proc->p_ucred && + NULL != kg->kg_proc->p_ucred->cr_prison) + kg->kg_proc->p_ucred->cr_prison->pr_estcpu += + kg->kg_estcpu; resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { resetpriority_thread(td, kg); @@ -530,6 +567,7 @@ schedcpu(void) } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ + mtx_unlock(&allprison_mtx); sx_sunlock(&allproc_lock); } @@ -540,8 +578,29 @@ static void schedcpu_thread(void) { int nowake; + struct prison *pr; + u_int32_t shares = 0; for (;;) { + if (sched_limitjailcpu) { + /* + * Update total jail CPU shares in case they've changed. + * Safe to read pr_sched_shares without mutex because + * in worst case, we get a bogus value which will be + * corrected on the next pass. + * + * TODO: this should be done by forcing a recalculation + * when jail CPU shares are added / changed, rather than + * doing it every second. 
+ */ + shares = sched_unjailedProcessShares; + mtx_lock(&allprison_mtx); + LIST_FOREACH(pr, &allprison, pr_list) + shares += pr->pr_sched_shares; + mtx_unlock(&allprison_mtx); + total_cpu_sched_shares = shares; + } + schedcpu(); tsleep(&nowake, 0, "-", hz); } @@ -579,12 +638,37 @@ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; + struct prison *pr = NULL; + if (NULL != kg->kg_proc->p_ucred) + pr = kg->kg_proc->p_ucred->cr_prison; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + - NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN); - newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), - PRI_MAX_TIMESHARE); + NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN); + if (sched_limitjailcpu && NULL != pr) { + /* + * Skew the priority by the jail's share of CPU resources. + * Unjailed processes are weighted by kern.sched.system_cpu_shares. + * + * TODO: this is a hard limit. We should really also have + * soft limits available. The ratio is computed in 64 bits so + * that large share counts cannot wrap the divisor to zero. + */ + uint64_t skew; + skew = (uint64_t)pr->pr_estcpu * total_cpu_sched_shares; + skew /= (uint64_t)max(total_est_cpu, 1) * max(pr->pr_sched_shares, 1); + if (skew > 0) { + /* wait your turn until your cpu usage's proportionate */ + newpriority = PRI_MAX_IDLE - 1; + } else { + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + } + } else { + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + } + kg->kg_user_pri = newpriority; } } diff -u -r --show-c-function src.orig/sys/kern/syscalls.c src/sys/kern/syscalls.c --- src.orig/sys/kern/syscalls.c Thu Mar 16 18:47:32 2006 +++ src/sys/kern/syscalls.c Mon Aug 21 04:21:16 2006 @@ -2,7 +2,7 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. 
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.181.2.2 2006/03/17 01:47:32 rwatson Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp */ @@ -463,4 +463,5 @@ const char *syscallnames[] = { "auditctl", /* 453 = auditctl */ "_umtx_op", /* 454 = _umtx_op */ "thr_new", /* 455 = thr_new */ + "jail_set_resource_limits", /* 456 = jail_set_resource_limits */ }; diff -u -r --show-c-function src.orig/sys/kern/syscalls.master src/sys/kern/syscalls.master --- src.orig/sys/kern/syscalls.master Thu Mar 16 18:47:06 2006 +++ src/sys/kern/syscalls.master Mon Aug 21 02:56:18 2006 @@ -793,6 +793,8 @@ long id, void *uaddr, void *uaddr2); } 455 AUE_NULL MSTD { int thr_new(struct thr_param *param, \ int param_size); } +456 AUE_NULL MSTD { int jail_set_resource_limits(unsigned int jid, \ + int cpushares, int memlimit); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff -u -r --show-c-function src.orig/sys/sys/jail.h src/sys/sys/jail.h --- src.orig/sys/sys/jail.h Thu Jun 9 12:49:19 2005 +++ src/sys/sys/jail.h Mon Aug 28 22:53:42 2006 @@ -18,6 +18,8 @@ struct jail { char *path; char *hostname; u_int32_t ip_number; + unsigned int sched_shares; + unsigned int mem_limit; }; struct xprison { @@ -26,13 +28,24 @@ struct xprison { char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; u_int32_t pr_ip; + unsigned int pr_sched_shares; + unsigned int pr_estcpu; + unsigned int pr_mem_limit; + unsigned int pr_mem_usage; }; -#define XPRISON_VERSION 1 +#define XPRISON_VERSION 2 + +#define JAIL_MINIMUM_SHARES 1 + +#define J_PAGER_TD_ACTIVE 0x01 +#define J_PAGER_TD_DIE 0x02 +#define J_PAGER_TD_DEAD 0x04 #ifndef _KERNEL int jail(struct jail *); int jail_attach(int); +int jail_set_resource_limits(unsigned int, int, int); #else /* _KERNEL */ @@ -73,6 +86,12 @@ struct prison { int pr_securelevel; /* (p) securelevel */ struct task pr_task; /* (d) destroy task */ struct mtx 
pr_mtx; + u_int32_t pr_sched_shares; /* (p) jail priority */ + u_int pr_estcpu; /* (p) est. cpu of jail */ + struct proc *pr_pager; /* (c) pager pid */ + int *pr_pager_flags_ptr; /* (p) communication to pager */ + size_t pr_mem_limit; /* (p) memory allocation limit */ + size_t pr_mem_usage; /* (p) memory in use */ }; #endif /* _KERNEL || _WANT_PRISON */ @@ -110,6 +129,8 @@ u_int32_t prison_getip(struct ucred *cre void prison_hold(struct prison *pr); int prison_if(struct ucred *cred, struct sockaddr *sa); int prison_ip(struct ucred *cred, int flag, u_int32_t *ip); +long prison_memory(struct prison *pr); +long prison_memory_limit(struct prison *pr); void prison_remote_ip(struct ucred *cred, int flags, u_int32_t *ip); #endif /* _KERNEL */ diff -u -r --show-c-function src.orig/sys/sys/syscall.h src/sys/sys/syscall.h --- src.orig/sys/sys/syscall.h Thu Mar 16 18:47:33 2006 +++ src/sys/sys/syscall.h Mon Aug 21 04:21:16 2006 @@ -2,7 +2,7 @@ * System call numbers. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/sys/syscall.h,v 1.178.2.2 2006/03/17 01:47:33 rwatson Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp */ @@ -371,4 +371,5 @@ #define SYS_auditctl 453 #define SYS__umtx_op 454 #define SYS_thr_new 455 -#define SYS_MAXSYSCALL 456 +#define SYS_jail_set_resource_limits 456 +#define SYS_MAXSYSCALL 457 diff -u -r --show-c-function src.orig/sys/sys/syscall.mk src/sys/sys/syscall.mk --- src.orig/sys/sys/syscall.mk Thu Mar 16 18:47:33 2006 +++ src/sys/sys/syscall.mk Mon Aug 21 04:21:16 2006 @@ -1,6 +1,6 @@ # FreeBSD system call names. # DO NOT EDIT-- this file is automatically generated. 
-# $FreeBSD: src/sys/sys/syscall.mk,v 1.133.2.2 2006/03/17 01:47:33 rwatson Exp $ +# $FreeBSD$ # created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp MIASM = \ syscall.o \ @@ -312,4 +312,5 @@ MIASM = \ setaudit_addr.o \ auditctl.o \ _umtx_op.o \ - thr_new.o + thr_new.o \ + jail_set_resource_limits.o diff -u -r --show-c-function src.orig/sys/sys/sysproto.h src/sys/sys/sysproto.h --- src.orig/sys/sys/sysproto.h Thu Mar 16 18:47:33 2006 +++ src/sys/sys/sysproto.h Mon Aug 21 04:21:16 2006 @@ -2,7 +2,7 @@ * System call prototypes. * * DO NOT EDIT-- this file is automatically generated. - * $FreeBSD: src/sys/sys/sysproto.h,v 1.177.2.2 2006/03/17 01:47:33 rwatson Exp $ + * $FreeBSD$ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp */ @@ -1365,6 +1365,11 @@ struct thr_new_args { char param_l_[PADL_(struct thr_param *)]; struct thr_param * param; char param_r_[PADR_(struct thr_param *)]; char param_size_l_[PADL_(int)]; int param_size; char param_size_r_[PADR_(int)]; }; +struct jail_set_resource_limits_args { + char jid_l_[PADL_(unsigned int)]; unsigned int jid; char jid_r_[PADR_(unsigned int)]; + char cpushares_l_[PADL_(int)]; int cpushares; char cpushares_r_[PADR_(int)]; + char memlimit_l_[PADL_(int)]; int memlimit; char memlimit_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_exit(struct thread *, struct sys_exit_args *); int fork(struct thread *, struct fork_args *); @@ -1674,6 +1679,7 @@ int setaudit_addr(struct thread *, struc int auditctl(struct thread *, struct auditctl_args *); int _umtx_op(struct thread *, struct _umtx_op_args *); int thr_new(struct thread *, struct thr_new_args *); +int jail_set_resource_limits(struct thread *, struct jail_set_resource_limits_args *); #ifdef COMPAT_43 diff -u -r --show-c-function src.orig/sys/vm/vm_pageout.c src/sys/vm/vm_pageout.c --- src.orig/sys/vm/vm_pageout.c Wed Mar 8 17:02:51 2006 +++ 
src/sys/vm/vm_pageout.c Mon Aug 28 22:54:37 2006 @@ -205,7 +205,6 @@ int vm_pageout_page_count = VM_PAGEOUT_P int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) -static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(void); #endif @@ -592,7 +591,7 @@ unlock_return: * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ -static void +void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; diff -u -r --show-c-function src.orig/sys/vm/vm_pageout.h src/sys/vm/vm_pageout.h --- src.orig/sys/vm/vm_pageout.h Thu Jan 6 19:29:27 2005 +++ src/sys/vm/vm_pageout.h Mon Aug 14 16:30:20 2006 @@ -87,6 +87,8 @@ extern int vm_pageout_page_count; * Exported routines. */ +void vm_pageout_map_deactivate_pages(vm_map_t map, long desired); + /* * Signal pageout-daemon and wait for it. */ diff -u -r --show-c-function src.orig/usr.sbin/jail/jail.8 src/usr.sbin/jail/jail.8 --- src.orig/usr.sbin/jail/jail.8 Sun May 28 03:49:42 2006 +++ src/usr.sbin/jail/jail.8 Mon Aug 21 02:56:18 2006 @@ -45,6 +45,8 @@ .Op Fl J Ar jid_file .Op Fl s Ar securelevel .Op Fl l u Ar username | Fl U Ar username +.Op Fl S Ar cpu_shares +.Op Fl M Ar mem_limit .Ar path hostname ip-number command ... .Sh DESCRIPTION The @@ -86,6 +88,10 @@ should run. The user name from jailed environment as whom the .Ar command should run. +.It Fl S Ar cpu_shares +CPU shares to assign to the prison. +.It Fl M Ar mem_limit +Amount of memory (in MB) to allow the prison to use. .It Ar path Directory which is to be the root of the prison. .It Ar hostname @@ -542,6 +548,17 @@ or clear system file flags; if non-zero, privileged, and may manipulate system file flags subject to the usual constraints on .Va kern.securelevel . 
+.It Va security.jail.limit_jail_memory, Va security.jail.jail_pager_interval +These MIB entries determine whether and how often (in seconds) a +jail's memory-limit monitoring daemon will run, and consequently the +period during which a jail can be overcommitted for resident memory. +.It Va kern.sched.limit_jail_cpu +This MIB entry sets whether CPU usage limits will be enforced +against processes in jails with CPU limits. +.It Va kern.sched.system_cpu_shares +Number of CPU usage shares to allocate to unjailed processes for the +purposes of determining CPU usage permitted for jailed processes. +Unjailed processes are not subject to CPU usage limits. .El .Pp The read-only diff -u -r --show-c-function src.orig/usr.sbin/jail/jail.c src/usr.sbin/jail/jail.c --- src.orig/usr.sbin/jail/jail.c Fri May 26 04:30:59 2006 +++ src/usr.sbin/jail/jail.c Mon Aug 21 02:56:18 2006 @@ -56,6 +56,8 @@ main(int argc, char **argv) struct in_addr in; gid_t groups[NGROUPS]; int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag; + unsigned int mem_limit = 0; + unsigned int sched_shares = 0; char path[PATH_MAX], *ep, *username, *JidFile; static char *cleanenv; const char *shell, *p = NULL; @@ -67,7 +69,7 @@ main(int argc, char **argv) username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "ilS:M:s:u:U:J:")) != -1) { switch (ch) { case 'i': iflag = 1; @@ -76,6 +78,13 @@ main(int argc, char **argv) JidFile = optarg; Jflag = 1; break; + case 'M': + mem_limit = atoi(optarg); + mem_limit *= 1024 * 1024; + break; + case 'S': + sched_shares = atoi(optarg); + break; case 's': ltmp = strtol(optarg, &ep, 0); if (*ep || ep == optarg || ltmp > INT_MAX || !ltmp) @@ -118,6 +127,8 @@ main(int argc, char **argv) if (inet_aton(argv[2], &in) == 0) errx(1, "Could not make sense of ip-number: %s", argv[2]); j.ip_number = ntohl(in.s_addr); + j.mem_limit = mem_limit; + j.sched_shares = sched_shares; if (Jflag) { fp 
= fopen(JidFile, "w"); if (fp == NULL) @@ -182,8 +193,10 @@ static void usage(void) { - (void)fprintf(stderr, "%s%s%s\n", - "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ", + (void)fprintf(stderr, "%s%s%s%s%s\n", + "usage: jail [-i] [-J jid_file] [-M mem_limit] ", + "[-S cpu_shares] [-s securelevel]", + " [-l -u ", "username | -U username]", " path hostname ip-number command ..."); exit(1); diff -u -r --show-c-function src.orig/usr.sbin/jls/jls.8 src/usr.sbin/jls/jls.8 --- src.orig/usr.sbin/jls/jls.8 Tue Apr 8 21:04:12 2003 +++ src/usr.sbin/jls/jls.8 Mon Aug 21 02:56:18 2006 @@ -42,7 +42,8 @@ jail identifier (JID), IP address, hostn .Sh SEE ALSO .Xr jail 2 , .Xr jail 8 , -.Xr jexec 8 +.Xr jexec 8 , +.Xr jtune 8 .Sh HISTORY The .Nm diff -u /usr/src.orig/usr.sbin/jtune/Makefile /usr/src/usr.sbin/jtune/Makefile --- /usr/src.orig/usr.sbin/jtune/Makefile Mon Aug 28 23:22:17 2006 +++ /usr/src/usr.sbin/jtune/Makefile Mon Aug 21 02:56:18 2006 @@ -0,0 +1,10 @@ +# $FreeBSD$ + +PROG= jtune +MAN= jtune.8 +DPADD= ${LIBUTIL} +LDADD= -lutil + +WARNS?= 6 + +.include diff -u /usr/src.orig/usr.sbin/jtune/jtune.8 /usr/src/usr.sbin/jtune/jtune.8 --- /usr/src.orig/usr.sbin/jtune/jtune.8 Mon Aug 28 23:22:21 2006 +++ /usr/src/usr.sbin/jtune/jtune.8 Mon Aug 21 02:56:18 2006 @@ -0,0 +1,75 @@ +.\" Copyright (c) 2006 Chris Jones +.\" All rights reserved. +.\" +.\" This software was developed for the FreeBSD Project by Chris Jones +.\" thanks to the support of Google's Summer of Code program and +.\" mentoring by Kip Macy. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd August 21, 2006
+.Dt JTUNE 8
+.Os
+.Sh NAME
+.Nm jtune
+.Nd "modify jail resource limits"
+.Sh SYNOPSIS
+.Nm
+.Fl j Ar jail_id
+.Op Fl i
+.Op Fl m Ar mem_limit
+.Op Fl s Ar cpu_shares
+.Sh DESCRIPTION
+The
+.Nm
+utility modifies a jail's memory and CPU usage limits.
+.Pp
+The options are as follows:
+.Bl -tag -width ".Fl s Ar cpu_shares"
+.It Fl j Ar jail_id
+Jail identifier (JID) of the jail whose limits are being tuned.
+.It Fl i
+Show jail's resource limits.
+.It Fl m Ar mem_limit
+Limit a jail's memory usage (resident set size) to
+.Ar mem_limit
+megabytes.
+.It Fl s Ar cpu_shares
+Set a jail's CPU shares to
+.Ar cpu_shares
+shares.
+.El
+.Sh SEE ALSO
+.Xr jail 2 ,
+.Xr jail 8 ,
+.Xr jexec 8 ,
+.Xr jls 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Fx FIXME .
+.Pp
+.Nm
+was written by Chris Jones through the 2006 Google Summer of Code program.
diff -u /usr/src.orig/usr.sbin/jtune/jtune.c /usr/src/usr.sbin/jtune/jtune.c --- /usr/src.orig/usr.sbin/jtune/jtune.c Mon Aug 28 23:22:22 2006 +++ /usr/src/usr.sbin/jtune/jtune.c Mon Aug 21 02:56:18 2006 @@ -0,0 +1,188 @@ +/*- + * Copyright (c) 2006 Chris Jones + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void usage(void); +static struct xprison *getxprison(int); +extern char **environ; + +int +main(int argc, char **argv) +{ + struct xprison *xp; + int jid = 0; + int memlimit = -1; + int shares = -1; + int iflag = 0; + int retval; + int ch; + + while ((ch = getopt(argc, argv, "ij:m:s:")) != -1) { + switch (ch) { + case 'i': + iflag = 1; + break; + case 'j': + jid = atoi(optarg); + if (!jid && errno) + err(1, "invalid jail id '%s'", optarg); + break; + + case 'm': + memlimit = atoi(optarg); + if (!memlimit && errno) + err(1, "invalid memory limit '%s'", optarg); + if (memlimit < 0) + errx(1, "invalid memory limit '%s'", optarg); + memlimit *= 1024 * 1024; + break; + + case 's': + shares = atoi(optarg); + if (!shares && errno) + err(1, "invalid cpu share '%s'", optarg); + if (shares < 0) + errx(1, "invalid cpu share '%s'", optarg); + break; + + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + if (!jid) + usage(); + + xp = getxprison(jid); + if (NULL == xp) + errx(1, "no jail with id %d", jid); + + if (iflag) { + char *memlimstr, *memusestr; + + asprintf(&memusestr, "%d M", + xp->pr_mem_usage / (1024 * 1024)); + if (xp->pr_mem_limit) { + asprintf(&memlimstr, "%d M", + xp->pr_mem_limit / (1024 * 1024)); + } else { + asprintf(&memlimstr, "None"); + } + + if (NULL == memusestr || NULL == memlimstr) + err(1, "couldn't allocate memory"); + + printf(" JID Hostname Memory Used / Limit CPU Shares\n"); + printf("%6d %-24.24s %6s / %-6.6s %-4d\n", + xp->pr_id, xp->pr_host, + memusestr, memlimstr, + xp->pr_sched_shares); + exit(0); + } + + retval = jail_set_resource_limits(jid, shares, memlimit); + if (retval) { + errx(1, "jail_set_resource_limit(%d, %d, %d) failed", + jid, memlimit, shares); + } + exit(0); + +} + +static void +usage() +{ + (void)fprintf(stderr, "%s\n", + 
"usage: jtune -j jid_id [-m mem_limit] [-s cpu_shares]"); + exit(0); +} + +static struct xprison * +getxprison(int jid) +{ + size_t i, len; + struct xprison *xpl, *sxpl; + if (sysctlbyname("security.jail.list", NULL, &len, NULL, 0) == -1) + err(1, "sysctlbyname(): security.jail.list"); + + if (len <= 0) + errx(1, "sysctl security.jail.list has no entries for jid %d", jid); + + /* getxprison allocates the structure, caller frees */ + sxpl = xpl = malloc(len); + if (NULL == xpl) + err(1, "malloc()"); + + if (sysctlbyname("security.jail.list", xpl, &len, NULL, 0) == -1) { + free(xpl); + err(1, "sysctlbyname(): security.jail.list"); + } + + if (len < sizeof(*xpl) || len % sizeof(*xpl) || + xpl->pr_version != XPRISON_VERSION) + errx(1, "Kernel and userland out of sync"); + + for (i = 0; i < len / sizeof(*xpl); i++) { + if (jid == xpl->pr_id) { + struct xprison *xp; + xp = malloc(sizeof (struct xprison)); + if (NULL == xp) + err(1, "malloc()"); + memcpy(xp, xpl, sizeof (struct xprison)); + free(sxpl); + return xp; + } + xpl++; + } + + free(sxpl); + return NULL; +}