diff -urN --exclude=compile sys-orig/conf/files sys/conf/files --- sys-orig/conf/files Wed Aug 31 17:21:01 2005 +++ sys/conf/files Wed Aug 31 17:22:10 2005 @@ -1226,6 +1226,11 @@ kern/subr_clock.c optional genclock kern/subr_devstat.c standard kern/subr_disk.c standard +# +kern/subr_disk_sched.c standard +ufs/ufs/deviceps.c standard +ufs/ufs/heap.c standard +# kern/subr_eventhandler.c standard kern/subr_hints.c standard kern/subr_kdb.c standard diff -urN --exclude=compile sys-orig/kern/kern_resource.c sys/kern/kern_resource.c --- sys-orig/kern/kern_resource.c Wed Aug 31 17:21:24 2005 +++ sys/kern/kern_resource.c Wed Aug 31 17:32:58 2005 @@ -60,7 +60,6 @@ #include #include - static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) @@ -70,7 +69,7 @@ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); -static int donice(struct thread *td, struct proc *chgp, int n); +static int donice(struct thread *td, struct proc *chgp, int n, int func); static struct uidinfo *uilookup(uid_t uid); /* @@ -183,17 +182,18 @@ curp = td->td_proc; switch (uap->which) { + case 4: /* PRIO_DISK */ case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); - error = donice(td, curp, uap->prio); + error = donice(td, curp, uap->prio, uap->which); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == 0) break; if (p_cansee(td, p) == 0) - error = donice(td, p, uap->prio); + error = donice(td, p, uap->prio, uap->which); PROC_UNLOCK(p); } found++; @@ -215,7 +215,7 @@ LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { - error = donice(td, p, uap->prio); + error = donice(td, p, uap->prio, uap->which); found++; } PROC_UNLOCK(p); @@ -231,7 +231,7 @@ PROC_LOCK(p); if (p->p_ucred->cr_uid == uap->who && !p_cansee(td, p)) { - error = donice(td, p, uap->prio); + error = donice(td, p, uap->prio, uap->which); 
found++; } PROC_UNLOCK(p); @@ -252,7 +252,7 @@ * Set "nice" for a (whole) process. */ static int -donice(struct thread *td, struct proc *p, int n) +donice(struct thread *td, struct proc *p, int n, int func) { int error; @@ -266,7 +266,10 @@ if (n < p->p_nice && suser(td) != 0) return (EACCES); mtx_lock_spin(&sched_lock); - sched_nice(p, n); + if(func == 4) /* PRIO_DISK */ + p->p_pad3[0] = n; + else + sched_nice(p, n); mtx_unlock_spin(&sched_lock); return (0); } diff -urN --exclude=compile sys-orig/kern/subr_disk.c sys/kern/subr_disk.c --- sys-orig/kern/subr_disk.c Wed Aug 31 17:21:24 2005 +++ sys/kern/subr_disk.c Wed Aug 31 19:01:07 2005 @@ -18,6 +18,8 @@ #include #include #include +#include /* SYSINIT ? */ +#include /* SYSINIT */ /*- * Disk error is the preface to plaintive error messages @@ -64,16 +66,16 @@ * BIO queue implementation */ -void -bioq_init(struct bio_queue_head *head) +static void +cscan_bioq_init(struct bio_queue_head *head) { TAILQ_INIT(&head->queue); head->last_offset = 0; head->insert_point = NULL; } -void -bioq_remove(struct bio_queue_head *head, struct bio *bp) +static void +cscan_bioq_remove(struct bio_queue_head *head, struct bio *bp) { if (bp == head->insert_point) { head->last_offset = bp->bio_offset; @@ -113,8 +115,8 @@ TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue); } -struct bio * -bioq_first(struct bio_queue_head *head) +static struct bio * +cscan_bioq_first(struct bio_queue_head *head) { return (TAILQ_FIRST(&head->queue)); @@ -142,10 +144,8 @@ * always sorted in ascending order and the queue always restarts at 0. * This implements the one-way scan which optimizes disk seek times. */ -void -bioq_disksort(bioq, bp) - struct bio_queue_head *bioq; - struct bio *bp; +static void +cscan_bioq_disksort(struct bio_queue_head *bioq, struct bio *bp) { struct bio *bq; struct bio *bn; @@ -153,7 +153,7 @@ /* * If the queue is empty then it's easy. 
*/ - if ((bq = bioq_first(bioq)) == NULL) { + if ((bq = cscan_bioq_first(bioq)) == NULL) { bioq_insert_tail(bioq, bp); return; } @@ -194,3 +194,32 @@ } TAILQ_INSERT_AFTER(&bioq->queue, bq, bp, bio_queue); } + +/* + * The CSCAN scheduler interface. + * Usually they are private, but we make this particular one public + * as it can be used as a backend by other schedulers. + */ +struct _disk_sched_interface cscan_sched = { + .next= NULL, + .name= "cscan", + .disksort= cscan_bioq_disksort, + .remove= cscan_bioq_remove, + .get_first= cscan_bioq_first, + .init = cscan_bioq_init, + .delete = NULL, + .load = NULL, /* XXX fixme */ + .unload = NULL, /* XXX fixme */ +}; + +SYSINIT(cscanload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, + disk_sched_load, &cscan_sched); +SYSCTL_DECL(_vfs_scheduler); +SYSCTL_NODE(_vfs_scheduler, OID_AUTO, cscan, + CTLFLAG_RW, 0, "cscan disk I/O scheduler"); +SYSCTL_INT(_vfs_scheduler_cscan, OID_AUTO, refcount, + CTLFLAG_RD, &cscan_sched.refcount, 0, "refcount"); /* read-only: this argument takes CTLFLAG_* access bits, SYSCTL_INT supplies the type itself */ + + + + diff -urN --exclude=compile sys-orig/kern/subr_disk_sched.c sys/kern/subr_disk_sched.c --- sys-orig/kern/subr_disk_sched.c Thu Jan 1 00:00:00 1970 +++ sys/kern/subr_disk_sched.c Wed Aug 31 19:02:45 2005 @@ -0,0 +1,587 @@ +/* + * (C) 2005 Emiliano Mennucci, Luigi Rizzo, Paolo Valente + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include /* SYSINIT ? */ +#include +#include /* MALLOC_DECLARE */ +#include /* SYSINIT */ +#include /* for add_class */ + +#if __FreeBSD_version < 501000 +/* + * FreeBSD 4.x uses different names for these struct and fields. + * Fortunately the changes are small so we can use some #defines + * to fix for that. + * The documentation refers to the 5.x/6.x names + */ +#include +#define bio_queue_head buf_queue_head +#define bio buf +#define bio_queue b_act /* link field in struct buf */ +#define bioq_init bufq_init +#define bioq_disksort bufqdisksort +#define bioq_first bufq_first +#define bioq_remove bufq_remove +#define LEN(x) ( (dn_key) (x)->b_bufsize ) +#else +#include +#include +#define LEN(x) ( (dn_key) (x)->bio_bcount ) +#endif + +/* + * This code implements the support for pluggable disk schedulers. + * + * Disk I/O requests (bio) are stored into a queue (bioq), one per device, + * sorted by a 'disksort' routine. + * Manipulations of the data structure occur through the following routines + * + * - bioq_init(bioq) initializes the queue. + * Typically called at device attach time. + * + * - bioq_flush(bioq, ...) flushes the queue. 
+ * Typically called at device detach time. + * + * - bioq_disksort(bioq, bio) adds the new request to the queue. + * + * - bioq_first(bioq) returns the next request to serve, + * or NULL if the queue is empty. + * + * - bioq_remove(bioq, bio) removes the request from the queue. + * XXX check if this is always the result of the + * previous bioq_first; it would be very strange otherwise. + * + * + * This code virtualizes this interface through function pointers + * or wrappers, depending on the needs, that perform generic actions + * and then invoke scheduler-specific functions. This way different disk + * schedulers can be loaded and possibly switched at runtime. + * + * --- SCHEDULER DESCRIPTOR --- + * Each disk scheduling algorithm implements a disk scheduler API which + * is exported through an instance of struct _disk_sched_interface + * (in sys/buf.h or sys/bio.h), initialized with pointers to the different + * functions of the API. In particular: + * + * - init(bioq) allocates a scheduler-specific structure; + * - disksort(bioq, bio) enqueues a new request + * - get_first(bioq) returns the next request to serve + * - remove(bioq, bio) removes the request from the queue + * - delete(bioq) deallocates the structure allocated by + * init() after the queue has been drained; + * + * Other fields in the descriptor are used for housekeeping. + * + * --- QUEUE DESCRIPTOR --- + * The native queue descriptor is designed for the default disk + * scheduling algorithm, which is a one-way elevator called C-LOOK. + * As such, it contains a TAILQ and a few fields to decide how to insert + * new requests based on their position on the disk. + * Clearly, different algorithms may require different data structures, + * so the queue descriptor must be extended with additional information. 
+ * To minimize differences with the existing code we leave the existing + * fields in the queue, and extend it with pointers to additional, + * scheduler-specific, data structures, and to the descriptor of the + * scheduler currently in use for the queue and the associated version number. + * + * + * --- SCHEDULER SWITCHING --- + * + * The scheduler to use is selected through the sysctl variable + * vfs.scheduler.name + * which can be set to the name of the new scheduler to use. + * + * To cope with the locking needs of FreeBSD 5.x and above, the sysctl + * only changes the name and a few related fields (a version number and + * a pointer to the corresponding scheduler descriptor). This is because + * the sysctl has no knowledge of the locks used to protect the various + * queues, and so it cannot manipulate them. + * The actual scheduler switching occurs separately on each queue at + * the first request for that queue, because the requests are issued + * by the device driver with the lock already held and thus are protected + * from interference. + * Normal operation is implemented when the version number in the bioq + * is the same as the one in the system. When they differ, the queue is + * in the 'switch' phase, and the behaviour of the wrapper during a + * 'switch' becomes the following: + * + * disksort(): if the queue is empty performs the actual switch (see below), + * otherwise requests are stored in a 'suspend' queue in FIFO order. + * Because the queue is not empty, the driver, which expects the queue + * to be non empty after the disksort(), will be satisfied. + * + * bioq_first(): never performs the switch itself; + * in all cases it returns the result of the get_first() call. + * + * bioq_remove(): remove the request, then + * if the queue is empty performs the actual switch. 
+ * + * The actual switch involves calling the delete() function for the old + * scheduler, updating the scheduler pointer and version number, calling + * the init() function for the new scheduler, and requeueing pending + * requests from the suspend queue. + * + * This approach works even with multiple sysctl issued while a scheduler + * switching is pending on a given queue. + */ + +/* the global list of disk schedulers */ +static struct _disk_sched_interface *disk_schedulers = NULL; +static struct _disk_sched_interface *curr_sched; +static int sched_version; +static char disk_sched_name[80]; /* name of scheduling arg */ + +/* + * lock routine: SCHED_LOCK() and SCHED_UNLOCK() wrap the main mutex. + * A non-NULL arg (the caller's name) acquires the lock, NULL releases it. + * Recursion and unbalanced unlocks are detected with mtx_owned(), that is + * against the calling thread only, so that ordinary contention between + * two threads blocks in mtx_lock() instead of panicking. + * 'locked' is written only while the mutex is held and records the name + * of the current holder for the panic message. + */ +#define SCHED_LOCK() disk_sched_lock(__func__) +#define SCHED_UNLOCK() disk_sched_lock(NULL) +static struct mtx mtx; /* main disk scheduler lock */ +static struct mtx w_mtx; /* protects weight table */ + +static void +disk_sched_lock(const char *arg) +{ + static const char *locked = NULL; + + if (arg) { + if (mtx_owned(&mtx)) + panic("recursive lock old <%s> new <%s>\n", + locked, arg ); + mtx_lock(&mtx); + locked = arg; + } else { + if (!mtx_owned(&mtx)) + panic("unlock without lock\n"); + locked = NULL; + mtx_unlock(&mtx); + } +} + +/* + * generic load routine: the first call also initializes the two mutexes, + * then the descriptor is pushed on the global list of schedulers. 
+ */ +void +disk_sched_load(void *desc) +{ + struct _disk_sched_interface *p = desc; + + if (disk_schedulers == NULL) { + mtx_init(&mtx, "disk scheduler lock", NULL, MTX_DEF); + mtx_init(&w_mtx, "weight table lock", NULL, MTX_DEF); + } + SCHED_LOCK(); + p->next = disk_schedulers; + disk_schedulers = p; + SCHED_UNLOCK(); + printf("Loaded Disk Scheduler %s\n", p->name); +} + +/* + * Detach a queue from its scheduler: run the scheduler's delete() hook + * if it has one, then drop the reference with the lock held. + */ +static void +bioq_delete(struct bio_queue_head *head) +{ + + if (head->sched->delete) + head->sched->delete(head); + head->sched_info = NULL; + SCHED_LOCK(); + head->sched->refcount--; + printf("bioq_delete, refcount %d for %s\n", + head->sched->refcount, head->sched->name); + head->sched = NULL; + head->sched_version = 0; + SCHED_UNLOCK(); +} + +/* + * Helper function for the actual scheduler switch. 
+ * Make a copy of the suspend queue so it can be initialized by + * the following bioq_init(), free the old data structure, + * allocate and initialize the new one, then requeue any pending + * request from the suspend queue. + */ +static void +switch_and_requeue(struct bio_queue_head *head) +{ + struct bio *bp; + /* save a copy of the suspend queue to restore it after bioq_init, which runs TAILQ_INIT on it */ + char tmp[sizeof(head->suspend)]; + bcopy(&head->suspend, tmp, sizeof(tmp)); + + printf("switch_and_requeue %p %s\n", head, head->sched->name); + bioq_delete(head); + bioq_init(head); + bcopy(tmp, &head->suspend, sizeof(tmp)); + while ((bp = TAILQ_FIRST(&head->suspend)) != NULL) { + TAILQ_REMOVE(&head->suspend, bp, bio_queue); + head->sched->disksort(head, bp); + } +} + +/* + * The init wrapper, called at device attach time or scheduler switch. + * Reference the current scheduler, call the scheduler-specific init routine. + * Both steps, and the reset of the suspend queue, happen with the + * scheduler lock held. + */ +void +bioq_init(struct bio_queue_head *head) +{ + + head->sched_info = NULL; + SCHED_LOCK(); + head->sched_version = sched_version; + head->sched = curr_sched; + head->sched->refcount++; + printf("bioq_init, refcount %d for %s\n", + head->sched->refcount, head->sched->name); + TAILQ_INIT(&head->suspend); + if (head->sched->init) + head->sched->init(head); + SCHED_UNLOCK(); +} + +/* + * The disksort wrapper. + * If there is no pending scheduler switch (same sched_version), + * then do the regular disksort. + * If there is a pending scheduler switch (different sched_version) and + * the old queue is empty, do the actual switch and then behave as above. + * Finally, on a pending switch and non-empty queue, store requests on + * a 'suspend' queue for later requeueing. 
+ */ +void +bioq_disksort(struct bio_queue_head *head, struct bio *bp) +{ + if (head->sched_version != sched_version) { /* pending switch */ + if (head->sched->get_first(head) != NULL) { + /* old scheduler still has requests: put aside the new one */ + TAILQ_INSERT_TAIL(&head->suspend, bp, bio_queue); + return; + } + /* the queue is empty, time to switch and requeue */ + switch_and_requeue(head); + } + head->sched->disksort(head, bp); + /* wipe the classifier tag once the scheduler has seen the request */ + bzero(&bp->class, sizeof(bp->class)); +} + +/* + * The first wrapper. + * Return whatever the get_first() routine of the scheduler attached to + * the queue returns. No scheduler switch is attempted here. + */ +struct bio * +bioq_first(struct bio_queue_head *head) +{ + return head->sched->get_first(head); +} + +/* + * The remove wrapper. + * Remove the given element from the queue; then, + * if there is a pending scheduler switch (different sched_version) + * and the old queue is empty, do the actual switch. + */ +void +bioq_remove(struct bio_queue_head *head, struct bio *bp) +{ + head->sched->remove(head, bp); + if (head->sched_version == sched_version) + return; + if (head->sched->get_first(head) != NULL) + return; + switch_and_requeue(head); +} + +/* + * Disk scheduler switch routine, called by the sysctl handler or once + * at boot time. Must be called with lock held. + * Call the load routine on the new scheduler, unload on the old one, + * then increment the version number and sets the pointer to the new one. + * Finally, save the scheduler name. + * The actual switch is deferred to the first use of the queue. 
+ */ +extern struct _disk_sched_interface cscan_sched; +static void +disk_sched_setup(void *p) +{ + struct _disk_sched_interface *new_sched = p; + + if (new_sched == NULL) + { + printf("Initializing scheduler variables\n"); + new_sched = &cscan_sched; /* initial setup */ + } + printf("Active disk scheduler %s [%p]\n", new_sched->name, p); + + if (new_sched->load) + new_sched->load(); + if (curr_sched != NULL && curr_sched->unload) + curr_sched->unload(); + curr_sched = new_sched; + sched_version++; + strncpy(disk_sched_name, curr_sched->name, + sizeof(disk_sched_name)); + disk_sched_name[sizeof(disk_sched_name)-1] = '\0'; +} + +/* + * Implements scheduling algorithm switch + */ +static int +sysctl_disk_scheduler(SYSCTL_HANDLER_ARGS) +{ + int error; + struct _disk_sched_interface *p; + + error = sysctl_handle_string(oidp, disk_sched_name, + sizeof(disk_sched_name), req); + if (error != 0 || req->newptr == NULL /* just reading */) + return (error); + if (!strcmp(curr_sched->name, disk_sched_name)) + return 0; /* no change */ + + SCHED_LOCK(); + for (p = disk_schedulers; p; p = p->next) /* lookup the string */ + if (!strcmp(p->name, disk_sched_name)) + break; + if (p == NULL) { /* not found */ + printf("scheduler [%s] not found, resetting to [%s]\n", + disk_sched_name, curr_sched->name); + strncpy(disk_sched_name, curr_sched->name, + sizeof(disk_sched_name)); + disk_sched_name[sizeof(disk_sched_name)-1] = '\0'; + } else if (p != curr_sched) { /* switch */ + disk_sched_setup(p); + } + + SCHED_UNLOCK(); + return (0); +} + +SYSCTL_NODE(_vfs, OID_AUTO, scheduler, + CTLFLAG_RW, 0, "Disk I/O scheduler"); + +SYSCTL_PROC(_vfs_scheduler, OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW, + 0, 0, sysctl_disk_scheduler, "A", "Current disk scheduling algorithm"); + +SYSINIT(disksched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, + disk_sched_setup, NULL /* argument */); + +/* + * An additional component of the scheduler interface is a + * service to assign weights or priorities to the 
scheduler. + * + * The classification of request can be based on parameters such as + * the PID, UID, GID or others. The weight should be in the corresponding + * data structure. + * For PID-based classification, the weight is in the p_pad3[0] field + * of the struct proc, and 'nice' and 'renice' have been patched + * accordingly. + * For UID and GID-based classification we use a sysctl interface + * which stores values in an array of struct _generic_weight + */ + +/* + * add classifier info to the buffer: tag bp with the classifier type + * currently in force, the matching id of process p (pid, real uid or + * real gid) and the weight recorded for that id. + * msg names the call site and is stored in the tag as well. + * A NULL p is tagged with id 0 and weight 0. + */ +void +add_class(struct buf *bp, struct proc *p, char *msg) +{ + + if (p != curproc) + printf("add_class: p %p curproc %p\n", p, curproc); + if (bp->class.msg != NULL) + printf("add_class: %p marked <%s> in %s\n", + bp, bp->class.msg, msg); + bp->class.t = disk_sched_classifier; + bp->class.msg = msg; + if (p == NULL) { /* system ? */ + printf("add_class: proc NULL\n"); + bp->class.pug = 0; + bp->class.weight = 0; /* XXX or -20 ? */ + } else if (bp->class.t == PID) { + bp->class.pug = p->p_pid; + /* + * p_pad3 is a plain char array: sign-extend explicitly so that + * negative weights survive where char is unsigned. + */ + bp->class.weight = (signed char)p->p_pad3[0]; + } else { /* UID or GID */ + bp->class.pug = (bp->class.t == UID) ? + p->p_ucred->cr_ruid : + p->p_ucred->cr_rgid ; + bp->class.weight = find_weight(bp->class.t, + bp->class.pug, W_LOOKUP); + } +#if 0 + { + static int count = 0; + printf("add_class %s (%d): t %d pug %d weight %d\n", + msg, + count++, + bp->class.t, bp->class.pug, bp->class.weight); + } +#endif +} + +MALLOC_DECLARE(M_PSIO); +MALLOC_DEFINE(M_PSIO, "psio", "psio"); + +struct _generic_weight { + enum client_type_t type; + int uid_or_gid; + int windex; /* weight, as index in wtable */ +}; + +static struct { + int sz; + int used; + struct _generic_weight *w; /* array of size sz */ +} weights; + +/* + * Given a type-uid/gid pair, locate the weight (if w = LOOKUP), or set it. + * Protected by a private lock. 
+ */ +int +find_weight(enum client_type_t type, int uid_or_gid, int w) +{ + struct _generic_weight *tmp; + int i; + mtx_lock(&w_mtx); + for (i=0; i < weights.used; i++) + if (weights.w[i].uid_or_gid == uid_or_gid && + weights.w[i].type == type) + break; + if (w == W_LOOKUP) { /* return 0 if not found on a lookup */ + w = (i != weights.used) ? weights.w[i].windex : 0; + goto done; + } + if (i == weights.used) { /* new entry */ + if (weights.used == weights.sz) { /* realloc */ + weights.sz += 10; /* XXX room for more */ + printf("realloc weights from %d to %d\n", + weights.used, weights.sz); + tmp = malloc(weights.sz * sizeof(*tmp), + M_PSIO, M_NOWAIT | M_ZERO); + if (tmp == NULL) + panic("cannot allocate new weights"); + if (weights.w != NULL) { + bcopy(weights.w, tmp, + weights.used * sizeof(*tmp)); + free(weights.w, M_PSIO); + } + weights.w = tmp; + } + i = weights.used++; + weights.w[i].uid_or_gid = uid_or_gid; + weights.w[i].type = type; + } + weights.w[i].windex = w; /* update */ +done: + mtx_unlock(&w_mtx); + return w; +} + +/* + * Used to set the user weight + */ +static int +sysctl_user_weight(SYSCTL_HANDLER_ARGS) +{ + int error; + char *tmp, value[80] = "set user weight"; /* sysctl value */ + int uid, windex; + + error = sysctl_handle_string(oidp, value, sizeof(value), req); + if (error != 0 || req->newptr == NULL /* just reading */) + return (error); + + /* Parses the string to get uid and weight */ + uid = (int)strtol(value, &tmp, 10); + windex = (int)strtol(tmp, NULL, 10); + if (1) + printf("User %d has now weight %d\n", uid, windex); + find_weight(UID, uid, windex); /* create */ + return 0; +} + +SYSCTL_PROC(_vfs_scheduler, OID_AUTO, set, + CTLTYPE_STRING | CTLFLAG_RW, 0, 0, &sysctl_user_weight, + "A", "Set User weight"); + +enum client_type_t disk_sched_classifier = PID; + +static const char * +type_to_string(enum client_type_t t) +{ + switch (t) { + case PID: + return "PID"; + case GID: + return "GID"; + case UID: + return "UID"; + default: + return 
"unknown"; + } +} + +/* + * Change disk classifier (PID, UID, GID) + */ +static int +sysctl_disk_classifier(SYSCTL_HANDLER_ARGS) +{ + int error; + char classifier_name[16]; + enum client_type_t t; + + strcpy(classifier_name, type_to_string(disk_sched_classifier)); + error = sysctl_handle_string(oidp, classifier_name, + sizeof(classifier_name), req); + if (error != 0 || req->newptr == NULL /* just reading */) + return (error); + if (!strcmp(classifier_name, "PID")) + t = PID; + else if (!strcmp(classifier_name, "UID")) + t = UID; + else if (!strcmp(classifier_name, "GID")) + t = GID; + else + return EINVAL; + SCHED_LOCK(); + disk_sched_classifier = t; + sched_version++; + SCHED_UNLOCK(); + return 0; +} + +SYSCTL_PROC(_vfs_scheduler, OID_AUTO, classifier, CTLTYPE_STRING|CTLFLAG_RW, + 0, 0, sysctl_disk_classifier, "A", "Current disk classifier"); + +/* end of file */ diff -urN --exclude=compile sys-orig/kern/vfs_bio.c sys/kern/vfs_bio.c --- sys-orig/kern/vfs_bio.c Wed Aug 31 17:21:24 2005 +++ sys/kern/vfs_bio.c Wed Aug 31 18:21:08 2005 @@ -1932,6 +1932,7 @@ bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } + add_class(bp, curproc, "getnewbuf"); return(bp); } @@ -2993,6 +2994,8 @@ bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; + bip->class = bp->class; + KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s)", devtoname(dev))); diff -urN --exclude=compile sys-orig/sys/bio.h sys/sys/bio.h --- sys-orig/sys/bio.h Wed Aug 31 17:21:41 2005 +++ sys/sys/bio.h Wed Aug 31 18:26:13 2005 @@ -42,10 +42,61 @@ struct disk; struct bio; +struct bio_queue_head; +struct buf; typedef void bio_task_t(void *); /* + * Pluggable disk scheduler interface. + * + * Each scheduler is described by a struct _disk_sched_interface. + * Available schedulers are in a linked list where modules register + * on load and unregister on unload. 
+ * + * The legacy API for disk schedulers uses bioq_disksort() + * bioq_first(), bioq_remove(), bioq_init(), bioq_flush(). + * We intercept and redirect these through either wrappers or + * function pointers, which dispatch to the appropriate per-device + * scheduler. + * sysctl vfs.scheduler.name is used to switch schedulers. + */ + +struct _disk_sched_interface { + struct _disk_sched_interface *next; /* link in the list of loaded schedulers */ + char *name; /* symbolic name */ + int refcount; /* users of this scheduler */ + /* XXX do we need a lock ? */ + void (*disksort)(struct bio_queue_head *head, struct bio *bp); /* enqueue a request */ + void (*remove)(struct bio_queue_head *head, struct bio *bp); /* take a request off the queue */ + struct bio * (*get_first)(struct bio_queue_head *head); /* next request to serve */ + void (*init)(struct bio_queue_head *head); /* per-queue setup, may be NULL */ + void (*delete)(struct bio_queue_head *head); /* per-queue teardown, may be NULL */ + void (*load)(void); /* load scheduler */ + void (*unload)(void); /* unload scheduler */ +}; + +void disk_sched_load(void *); +/* + * Requests are grouped together according to some classifier based + * on PID, UID, GID. + */ +enum client_type_t { UNKNOWN=0, PID, UID, GID }; +extern enum client_type_t disk_sched_classifier; +#define W_LOOKUP 0x123456 /* key to lookup a weight */ +int find_weight(enum client_type_t type, int uid_or_gid, int w); +struct proc; /* Declared to avoid include of proc.h */ +void add_class(struct buf *bp, struct proc *p, char *msg); +struct buf_class { + enum client_type_t t; /* classifier in force when the tag was set */ + int pug; /* pid, uid or gid, according to t */ + int weight; /* weight recorded for that id */ + char *msg; /* set to non-null when initialized */ +}; + + + +/* + * The bio structure describes an I/O operation in the kernel. */ struct bio { @@ -81,6 +132,9 @@ /* XXX: these go away when bio chaining is introduced */ daddr_t bio_pblkno; /* physical block number */ + + /* Disk scheduler fields, used for accounting. 
*/ + struct buf_class class; }; /* bio_cmd */ @@ -106,12 +160,23 @@ TAILQ_HEAD(bio_queue, bio) queue; off_t last_offset; struct bio *insert_point; + + void *sched_info; /* scheduler's private info */ + struct _disk_sched_interface *sched; + int sched_version; + TAILQ_HEAD(, bio) suspend; /* suspended requests */ }; void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); + +/* + * Note that bioq_insert_head/tail are not part of the scheduler API but just + * of the queue management API. Even though they update some info + * (insert_point) used by the C-SCAN algorithm. + */ void bioq_disksort(struct bio_queue_head *ap, struct bio *bp); struct bio *bioq_first(struct bio_queue_head *head); struct bio *bioq_takefirst(struct bio_queue_head *head); diff -urN --exclude=compile sys-orig/sys/buf.h sys/sys/buf.h --- sys-orig/sys/buf.h Wed Aug 31 17:21:41 2005 +++ sys/sys/buf.h Wed Aug 31 18:25:29 2005 @@ -42,6 +42,7 @@ #include #include #include +#include struct bio; struct buf; @@ -92,6 +93,15 @@ * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ +/*enum client_type_t {UNKNOWN = 0, PID, UID, GID}; +struct buf_class +{ + enum client_type_t t; + int pug; + int weight; + char* msg; +};*/ + struct buf { struct bufobj *b_bufobj; long b_bcount; @@ -135,6 +145,7 @@ struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ + struct buf_class class; }; #define b_object b_bufobj->bo_object diff -urN --exclude=compile sys-orig/sys/proc.h sys/sys/proc.h --- sys-orig/sys/proc.h Wed Aug 31 17:21:41 2005 +++ sys/sys/proc.h Wed Aug 31 17:22:10 2005 @@ -587,6 +587,7 @@ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (j) Current CPU limit in seconds. */ signed char p_nice; /* (c + j) Process "nice" value. 
*/ + char p_pad3[3]; /* XXX padding for alignment */ /* End area that is copied on creation. */ #define p_endcopy p_xstat diff -urN --exclude=compile sys-orig/ufs/ufs/deviceps.c sys/ufs/ufs/deviceps.c --- sys-orig/ufs/ufs/deviceps.c Thu Jan 1 00:00:00 1970 +++ sys/ufs/ufs/deviceps.c Wed Aug 31 17:22:10 2005 @@ -0,0 +1,750 @@ +/* + * (C) 2005 Emiliano Mennucci, Luigi Rizzo, Paolo Valente + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if __FreeBSD_version < 501000 +/* + * FreeBSD 4.x uses different names for these struct and fields. 
+ * Fortunately the changes are small so we can use some #defines + * to account for that. + * The documentation refers to the 5.x/6.x names + */ +#define bio_queue_head buf_queue_head +#define bio buf +#define bio_queue b_act /* link field in struct buf */ +#define LEN(x) ( (dn_key) (x)->b_bufsize ) + +#include +#else +#include +#include /* curproc ? */ +#define LEN(x) ( (dn_key) (x)->bio_bcount ) +#endif + +/* + * --- HYBRID - Efficient Proportional Share I/O Scheduler --- + * + * This code implements a Proportional Share scheduler for storage devices + * called HYBRID (designed by Paolo Valente and Luigi Rizzo). + * HYBRID can be used to replace the standard CSCAN (elevator) + * disk scheduler used in traditional BSD systems. + * + * The scheduler interfaces with the system through these functions: + * bioq_disksort, bioq_first and bioq_remove. + * Additionally, there are 'init' and 'flush' calls used to, + * respectively, activate and deactivate the scheduler itself. + * + * The goal of HYBRID is to achieve weighted fair sharing + * of device bandwidth among requests coming from the same client + * (which can be a single process, thread, user, etc.), + * while preserving a reasonable throughput of the disk unit. + * + * The standard disk scheduler tries to maximize throughput by + * sorting I/O requests according to their position on the disk. + * A standard Proportional Share (PS) scheduler allocates bandwidth according + * to the weight of the clients, but this approach used on a disk + * unit may result in excessive head movement, which consumes time. + * + * HYBRID tries to achieve reasonable sharing and good throughput + * using a tandem of modules. Requests are first sent to a pure + * PS scheduler, which sorts them according to weights. Then, + * requests are pulled from the PS scheduler and sent to a + * CSCAN scheduler which reorders the top W requests trying to + * preserve locality, but also avoid starvation. 
+ * W is a parameter of the scheduler that defines the queue length. + * W=0 transforms HYBRID into a pure Proportional Share scheduler, + * whereas a large W makes it work almost as a CSCAN scheduler. + * + * To avoid starvation, the second scheduler in HYBRID + * uses two queues, an 'active' queue and a 'shadow' queue. + * Both sizes are settable through sysctl variables. + * The more they grow, the more unfair the service becomes, and vice versa. + * + * -- DATA STRUCTURES -- + * + * Each bioq has a sched_info pointer to an auxiliary descriptor + * containing scheduler info, in our case a struct ps_device. + * This is allocated the first time a disk unit is used, + * and deallocated when the scheduler is deactivated. + * + * A lock on the bioq also locks all the data structures for the + * device. + * + */ + +struct client_queue; /* forward declaration */ + +/* + * Descriptor of device handled by HYBRID (looked up through bioq). + * A new descriptor is created on the first access to a given device + * (bioq is used as a key to identify the device). + * + * XXX ps_count == 0 can be replaced by + * sch_heap.elements + ne_heap.elements == 0 + * + * There are cases in which a disksort can change the head element + * of the queue, violating one of the assumptions in the CSCAN queue. + * We address this using two variables, pending_remove and removed. + * See the code for details. + */ +struct ps_device { + int ps_count; /* Requests in PS module */ + struct bio_queue_head *bioq; /* device buf queue */ + struct client_queue *client_list; /* XXX hash table maybe ? */ + int shadowenabled; /* Set if shadow q. 
enabled */ + TAILQ_HEAD(, bio) shadow; /* Shadow queue */ + /* XXX shadow should be a bio_queue_head */ + int qlen; /* bytes in TB */ + int rmbuf; /* Removed bufs */ + struct dn_heap sch_heap, /* Schedulable process heap */ + ne_heap,/* Not eligible process heap */ + idle_heap; /* No more backlogged process heap */ + dn_key P; /* Device Potential */ + u_int sum; /* sum of Weights of active clients */ + + struct bio *pending_remove; + int removed; /* pending already removed */ +}; + +#if 0 +/* + * Identity of a client. + */ +union client_id { + struct { + /* + * We need a reference to proc because + * when the weight changes it is stored + * in *proc. XXX this must be changed. + */ + struct proc *proc; /* Process we belong to */ + pid_t p_pid; /* Process pid */ + } ps_proc; + int uid_or_gid; +}; +#endif + +/* + * Per-device client descriptor. + */ +struct client_queue { + struct client_queue *next; /* client list */ + struct ps_device *psd; /* Pointer to Device */ + TAILQ_HEAD(, bio) queue; /* client's local queue */ + dn_key S, F; /* client's potentials */ + struct dn_heap *heap; /* which heap are we in */ +#if 0 + union client_id id; /* client identity */ +#endif + u_short weight; /* client's weight */ + struct buf_class class; +}; + +/* + * Given an I/O request x, LEN(x) returns a value proportional + * to the 'cost' of serving the request. + * We can experiment with this. Initially use the length in bytes, + * but something like c1 + c2*len is probably more appropriate. + */ +#define WEIGHT(d) ((d)->weight) + +/* + * Given a buffer and a weight, computes the difference in Potentials + * related to the buffer. Uses the LEN macro above. + * __P_SCALE is a scaling factor. + */ +#define __P_SCALE 8 +#define DELTA_P(bp, weight) \ + ( (LEN(bp) << __P_SCALE) / (dn_key)(weight)) + + +/* Reference to cscan_sched which we use as a backend. 
+ */
+extern struct _disk_sched_interface cscan_sched;
+
+/*
+ * Parameters for the scheduler are queue len and shadow threshold,
+ * set through sysctl variables in KBYTES
+ */
+#define KBYTE 1024
+
+static int hybrid_qlen = 0;		/* scheduler queue length */
+static int shadow_threshold = 0;	/* Threshold to activate shadow queue */
+
+/*
+ * Debug bitmask: 1 = new client, 2 = new device, 4 = scheduler flush,
+ * 0x10/0x20 = ps_remove tracing.
+ */
+static int verbose = 3;	/* XXX */
+
+MALLOC_DECLARE(M_PSIO);
+
+/*
+ * Process weight is computed from p_nice using a lookup table.
+ * p_nice can assume values between PRIO_MIN (-20) and PRIO_MAX (20).
+ */
+static const u_short wtable[PRIO_MAX - PRIO_MIN + 1] = {
+	1000, 950, 900, 850, 800,
+	750, 700, 650, 600, 550,
+	500, 450, 400, 350, 300,
+	250, 200, 180, 150, 120,
+	100, /* base priority/weight */
+	90, 80, 75, 70, 65,
+	60, 55, 50, 45, 40,
+	35, 30, 25, 20, 15,
+	10, 5, 3, 2, 1
+};
+
+/*
+ * Recompute the weight of a client from its class weight.
+ * The class weight is clamped to [PRIO_MIN, PRIO_MAX] and used as an
+ * index into wtable. The device-wide sum of weights is NOT touched
+ * here; the caller is responsible for adjusting it.
+ * XXX make sure proc is valid, otherwise keep old value
+ */
+static void
+updateweight(struct client_queue* d)
+{
+	int a = d->class.weight;
+
+	if (a < PRIO_MIN)
+		a = PRIO_MIN;
+	if (a > PRIO_MAX)
+		a = PRIO_MAX;
+	a -= PRIO_MIN;		/* shift into 0..PRIO_MAX-PRIO_MIN */
+	d->weight = wtable[a];
+}
+
+/*
+ * Create and initialize a client queue for a specific device.
+ * The new descriptor inherits the class of bp and is linked at the
+ * head of psd->client_list. Panics if memory cannot be allocated.
+ */
+static struct client_queue *
+new_client_queue(struct ps_device *psd, struct bio *bp)
+{
+	struct client_queue *d;
+
+	/* New client for this device */
+	d = malloc(sizeof(*d), M_PSIO, M_NOWAIT | M_ZERO);
+	if (d == NULL)
+		panic("new_client_queue: cannot allocate memory");
+	TAILQ_INIT(&d->queue);
+	d->psd = psd;	/* Assign device to client device */
+	d->class = bp->class;
+	if (verbose & 1)
+		printf("new_client_queue, pug %d:%d weight %d on %p\n",
+			bp->class.t,
+			bp->class.pug, bp->class.weight, psd);
+	/* Insert new device into process own list */
+	d->next = psd->client_list;
+	psd->client_list = d;
+
+	return d;
+}
+
+/*
+ * Create and initialize a device queue.
+ * The descriptor is zero-filled, attached to bioq->sched_info, and
+ * qlen is primed with the size of whatever is already queued.
+ * Panics if memory cannot be allocated.
+ */
+static void
+newps_device(struct bio_queue_head *bioq)
+{
+	struct ps_device *psd;
+	struct bio *bp;
+
+	if (verbose & 2)
+		printf("new_ps_device bioq %p\n", bioq);
+	psd = malloc(sizeof(*psd), M_PSIO, M_NOWAIT | M_ZERO);
+	if (psd == NULL)
+		panic("newps_device: cannot allocate memory");
+	/* only set non-zero fields. */
+	/* XXX check that heaps are properly initialized */
+	psd->bioq = bioq;
+	printf("new client classifier on %d\n", disk_sched_classifier);
+	bioq->sched_info = psd;
+	/* link into device list */
+
+	/* Create and initialize the shadow queue */
+	TAILQ_INIT(&psd->shadow);
+	psd->shadowenabled = 0;
+
+	/*
+	 * If we are switching from the old scheduler, the queue
+	 * could be non-empty, so record the total queue size.
+	 */
+	TAILQ_FOREACH(bp, &psd->bioq->queue, bio_queue) {
+		psd->qlen += LEN(bp);
+	}
+}
+
+/*
+ * Find the client queue associated with the psd. If none exists,
+ * create one. Clients are matched on all three class fields
+ * (t, pug, weight) with a linear scan of the client list.
+ */
+static struct client_queue *
+find_client_queue(struct ps_device *psd, struct bio *bp)
+{
+	struct client_queue *h;
+
+	for (h = psd->client_list; h; h = h->next)
+		if (h->class.t == bp->class.t &&
+		    h->class.pug == bp->class.pug &&
+		    h->class.weight == bp->class.weight)
+			return h;
+	return new_client_queue(psd, bp);
+}
+
+
+/*
+ * Manage heaps associated with device, because potentials have changed.
+ * Thus move clients into the appropriate heaps.
+ */
+static void
+updateheaps(struct ps_device *psd)
+{
+	struct dn_heap *sch = &psd->sch_heap;
+	struct dn_heap *ne = &psd->ne_heap;
+	struct dn_heap *idl = &psd->idle_heap;
+	struct client_queue *d;
+	int flush;
+
+	/*
+	 * If there is no SCHEDULABLE client, update potential to the
+	 * starting value (S) of the first NOT ELIGIBLE one. This is done to
+	 * make sure that device potential (P) follows the client device
+	 * potentials.
+	 */
+	if (sch->elements == 0 && ne->elements > 0) {
+		d = ne->p[0].object;	/* First client on heap */
+		psd->P = MAX64(d->S, psd->P);
+	}
+
+	/*
+	 * Make sure that if a SCHEDULABLE client could be available,
+	 * it will be.
+	 */
+	while (ne->elements > 0) {
+		d = ne->p[0].object;	/* First client on heap */
+		if (DN_KEY_GT(d->S, psd->P))
+			break;
+		heap_extract(ne, d);
+		heap_insert(sch, d->S, d);
+		d->heap = sch;
+	}
+
+	/*
+	 * IDL entries are removed if both SCH and NE are empty (we can
+	 * fully reset the scheduler) or d->F <= psd->P i.e. they have
+	 * no debit with the scheduler.
+	 */
+	flush = (sch->elements == 0 && ne->elements == 0);
+
+	while (idl->elements > 0 && (d = idl->p[0].object) &&
+	    (flush || DN_KEY_LEQ(d->F, psd->P)) ) {
+		heap_extract(idl, d);
+		d->heap = NULL;
+		psd->sum -= WEIGHT(d);
+	}
+	if (flush) {
+		/*
+		 * dn_key is a u_int64_t, which is not 'unsigned long long'
+		 * on every platform: cast explicitly to match %llu.
+		 */
+		if (verbose & 4)
+			printf("flush %p, ps_count %d P is %llu, sum is %u\n",
+				psd->bioq,
+				psd->ps_count,
+				(unsigned long long)psd->P, psd->sum);
+		psd->P = 0;
+	}
+	/* XXX here could free all records in NULL heap */
+}
+
+
+/*
+ * client has been served. Put it in appropriate heap and update device
+ * potential accordingly.
+ */
+static void
+requeue_client(struct client_queue *d)
+{
+	struct bio *bp;
+	struct ps_device *psd = d->psd;
+
+	if (TAILQ_EMPTY(&d->queue)) {	/* no requests in this queue */
+		if (DN_KEY_LEQ(d->F, psd->P)) {
+			d->heap = NULL;/* Inactive client */
+			psd->sum -= d->weight;
+			/* XXX we could free the descriptor */
+		} else {
+			d->heap = &psd->idle_heap;
+			heap_insert(d->heap, d->F, d);
+		}
+	} else {	/* more requests in queue */
+		bp = TAILQ_FIRST(&d->queue);
+		/* Update Finish (F) potential for next request */
+		d->F += DELTA_P(bp, d->weight);
+		if (DN_KEY_LEQ(d->S, psd->P)) {	/* eligible, sort by F */
+			d->heap = &psd->sch_heap;
+			heap_insert(d->heap, d->F, d);
+		} else {	/* not eligible, sort by S */
+			d->heap = &psd->ne_heap;
+			heap_insert(d->heap, d->S, d);
+		}
+	}
+	updateheaps(psd);
+}
+
+/*
+ * Put a buffer in the TB.
+ * It goes in either the bioq (CSCAN)
+ * or the shadow queue (FIFO).
+ */
+static void
+tb_enqueue(struct ps_device* psd, struct bio* bp)
+{
+	/* Shadow mode off: let CSCAN sort it; on: append in FIFO order. */
+	if (!psd->shadowenabled)
+		cscan_sched.disksort(psd->bioq, bp);
+	else
+		TAILQ_INSERT_TAIL(&psd->shadow, bp, bio_queue);
+	/* Either way the request now counts against the TB size. */
+	psd->qlen += LEN(bp);
+}
+
+/* Append a request to the tail of a client's private PS queue. */
+static void
+ps_enqueue(struct client_queue* d, struct bio* bp)
+{
+	struct ps_device *dev = d->psd;
+
+	TAILQ_INSERT_TAIL(&d->queue, bp, bio_queue);
+	dev->ps_count++;	/* XXX remove later */
+}
+
+/*
+ * Peek at the request the PS module would serve next: the head of the
+ * queue of the client on top of the schedulable heap, or NULL when
+ * that heap is empty.
+ */
+static struct bio *
+ps_first(struct ps_device* psd)
+{
+	struct client_queue *top;
+
+	if (psd->sch_heap.elements == 0)
+		return NULL;
+	top = psd->sch_heap.p[0].object;	/* Choose the next buf */
+	return TAILQ_FIRST(&top->queue);
+}
+
+/* Drain the shadow queue into the device (CSCAN) queue. */
+static void
+spill_queue(struct ps_device* psd)
+{
+	struct bio *cur;
+
+	if (psd->qlen == 0)
+		return;
+	while ((cur = TAILQ_FIRST(&psd->shadow)) != NULL) {
+		TAILQ_REMOVE(&psd->shadow, cur, bio_queue);
+		cscan_sched.disksort(psd->bioq, cur);
+	}
+}
+
+/*
+ * Cost of the request ps_first() would return.
+ * The schedulable heap must not be empty.
+ */
+static int
+ps_next_size(struct ps_device* psd)
+{
+	struct client_queue *top = psd->sch_heap.p[0].object;
+	struct bio *head = TAILQ_FIRST(&top->queue);
+
+	return LEN(head);
+}
+
+/*
+ * Take *bp out of the PS module. bp _must_ belong to the client that
+ * currently sits on top of sch_heap. The device potential advances by
+ * the cost of bp scaled by the sum of active weights, and the client
+ * is re-filed in whichever heap now suits it.
+ */
+static void
+ps_remove(struct ps_device* psd, struct bio *bp)
+{
+	struct client_queue *top = psd->sch_heap.p[0].object;
+
+	heap_extract(&psd->sch_heap, top);		/* locate client */
+	TAILQ_REMOVE(&top->queue, bp, bio_queue);	/* remove request */
+	psd->ps_count--;				/* update stats */
+
+	if ((psd->ps_count == 0 && (verbose & 0x10)) ||
+	    (psd->ps_count > 0 && (verbose & 0x20)))
+		printf("ps_remove len %d count %d\n", (int)LEN(bp), psd->ps_count);
+
+	psd->P += DELTA_P(bp, psd->sum);	/* update Potential */
+	top->S = top->F;
+	requeue_client(top);			/* put back in queue */
+}
+
+/*
+ * refill the TB queue from the PS queue.
+ * Called when TB drains, or its size increases.
+ */
+static void
+refill_queue(struct ps_device* psd)
+{
+	int qlen = hybrid_qlen * KBYTE;	/* TB capacity in bytes */
+
+	/* Move requests from PS to TB while the next one still fits. */
+	while (psd->sch_heap.elements > 0 &&
+	    psd->qlen + ps_next_size(psd) <= qlen ) {
+		struct bio* bp = ps_first(psd);
+		ps_remove(psd, bp);
+		tb_enqueue(psd, bp);
+	}
+}
+
+/*---- the disk scheduler API ----*/
+/*
+ * Return first buf in a queue.
+ * The choice is latched in pending_remove so that repeated calls keep
+ * returning the same request until hy_remove() is called for it.
+ */
+static struct bio*
+hy_first(struct bio_queue_head *bioq)
+{
+	struct ps_device *psd = bioq->sched_info;
+
+	if (psd->pending_remove)
+		return psd->pending_remove;
+	/* TB empty: serve straight from PS; otherwise ask CSCAN. */
+	psd->pending_remove = (psd->qlen == 0) ?
+		ps_first(psd) : cscan_sched.get_first(bioq);
+	return psd->pending_remove;
+}
+
+/*
+ * Really remove a buf from a device queue
+ */
+static void
+hy_remove2(struct ps_device* psd, struct bio *bp)
+{
+	int empty;
+
+	if (psd->qlen == 0) {	/* TB empty, remove from PS */
+		ps_remove(psd, bp);
+		return;
+	}
+	/* otherwise remove from TB */
+	cscan_sched.remove(psd->bioq, bp);
+	psd->qlen -= LEN(bp);
+	refill_queue(psd);	/* have room in TB, refill it */
+
+	/*
+	 * XXX describe hybrid algorithm.
+	 * As coded: in device mode, bytes removed while the device queue
+	 * stays non-empty accumulate in rmbuf; once they reach the
+	 * threshold, new TB entries go to the shadow queue. When the
+	 * device queue drains, the counter is reset and (in shadow mode)
+	 * the shadow queue is spilled back into the device queue.
+	 */
+	empty = TAILQ_EMPTY(&psd->bioq->queue);
+	if (psd->shadowenabled == 0) {	/* Device mode? */
+		if (empty)
+			psd->rmbuf = 0;
+		else {
+			psd->rmbuf += LEN(bp);
+			if (psd->rmbuf >= shadow_threshold * KBYTE )
+				psd->shadowenabled = 1;
+		}
+	} else {	/* Shadow mode */
+		if (empty) {
+			psd->rmbuf = 0;
+			psd->shadowenabled = 0;
+			/* XXX inefficient. put both queues in psd */
+			spill_queue(psd);
+		}
+	}
+}
+
+/*
+ * Remove the request previously returned by hy_first().
+ * Panics if bp is not that request. If hy_disksort() already removed
+ * it early (psd->removed set), only the bookkeeping is cleared.
+ */
+static void
+hy_remove(struct bio_queue_head *bioq, struct bio *bp)
+{
+	struct ps_device* psd = bioq->sched_info;
+
+	if (psd->pending_remove != bp)
+		panic("invalid remove, have %p want %p\n",
+			psd->pending_remove, bp);
+	psd->pending_remove = NULL;
+	if (psd->removed)
+		psd->removed = 0;
+	else
+		hy_remove2(psd, bp);
+}
+
+/*
+ * Insert a buf into a sorted device queue (bioq_disksort)
+ */
+static void
+hy_disksort(struct bio_queue_head *bioq, struct bio *bp)
+{
+	struct client_queue *d;
+	struct ps_device *psd = bioq->sched_info;
+
+	if (psd->pending_remove != NULL && !psd->removed && psd->qlen == 0) {
+		/*
+		 * make sure to actually remove the head element
+		 * as the new insert might cause a reordering
+		 * that we could not deal with
+		 */
+		psd->removed = 1;
+		hy_remove2(psd, psd->pending_remove);
+	}
+	/* find or create the per-device client descriptor. */
+	d = find_client_queue(psd, bp);
+
+	if (d->heap == NULL) {
+		/* basically no state for the client queue */
+		updateweight(d);
+		psd->sum += d->weight;
+		d->S = psd->P;
+		d->F = d->S + DELTA_P(bp, d->weight);
+	} else if (d->heap == &psd->idle_heap) {
+		/* we have state for this client
+		 * We are not allowed to change weight because
+		 * it affects psd->sum and might destroy the
+		 * invariants on P. XXX the correct fix is to
+		 * wait until the client exits from IDL and then
+		 * insert all pending requests.
+		 * XXX For the time being, ignore the change.
+		 */
+		heap_extract(d->heap, d );
+		d->S = MAX64( d->F, psd->P );
+		d->F = d->S + DELTA_P(bp, d->weight);
+	}
+
+	/*
+	 * If the PS queue is empty _and_ TB has room, bypass PS.
+	 * This also means that no client_queue's are in SCH or NE.
+	 * Note that if all client_queues are IDL, we can reset the
+	 * scheduler and make them all NONE. XXX
+	 */
+	if (psd->ps_count == 0 &&
+	    psd->qlen + LEN(bp) <= hybrid_qlen * KBYTE) {
+		/* update potentials */
+		psd->P += DELTA_P(bp, psd->sum);
+		d->S = d->F;
+		tb_enqueue(psd, bp);
+		psd->sum -= d->weight;
+	} else {	/* regular PS behaviour */
+		ps_enqueue(d, bp);
+		/*
+		 * If the client is in SCH or NE heap just return,
+		 * otherwise (IDL or NONE) we only have the single
+		 * request that we just inserted.
+		 */
+		if (d->heap == &psd->sch_heap ||
+		    d->heap == &psd->ne_heap)
+			return;
+		/*
+		 * NOTE(review): requeue_client() adds DELTA_P of the head
+		 * request to F again, on top of the value computed above;
+		 * confirm this double increment is intended.
+		 */
+		requeue_client(d);
+	}
+}
+
+/*
+ * Final part of the 'flush' routine: release everything hanging off
+ * the device descriptor, then the descriptor itself.
+ */
+static void
+hy_delete(struct bio_queue_head *bioq)
+{
+	struct ps_device* psd = bioq->sched_info;
+	struct client_queue *d;
+
+	if (psd == NULL)
+		return;
+	/*
+	 * Free the per-client descriptors allocated by new_client_queue();
+	 * they are only reachable through this list and would otherwise
+	 * be leaked every time the scheduler is deactivated.
+	 */
+	while ((d = psd->client_list) != NULL) {
+		psd->client_list = d->next;
+		free(d, M_PSIO);
+	}
+	/* Free heaps */
+	heap_free(&psd->sch_heap);
+	heap_free(&psd->idle_heap);
+	heap_free(&psd->ne_heap);
+	free(psd, M_PSIO);
+}
+
+/*
+ * placeholders, for the time being.
+ */
+static void
+hy_unload(void)
+{
+	printf("hybrid unload\n");
+}
+
+static void
+hy_load(void)
+{
+	printf("hybrid_load()\n");
+}
+
+/* Method table registered with the disk scheduler framework. */
+static struct _disk_sched_interface hy_sched = {
+	.next= NULL,
+	.name= "hybrid",
+	.disksort= hy_disksort,
+	.remove= hy_remove,
+	.get_first= hy_first,
+	.init = newps_device,
+	.delete = hy_delete,
+	.load = hy_load,
+	.unload = hy_unload,
+};
+
+SYSINIT(hyload, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, disk_sched_load, &hy_sched);
+
+/*
+ * Called by sysctl to change shadow queue threshold.
+ * Returns EINVAL (leaving the threshold untouched) when the requested
+ * value is negative, instead of reporting success.
+ */
+static int
+sysctl_shadow_threshold(SYSCTL_HANDLER_ARGS)
+{
+	int error, new_val = shadow_threshold;
+
+	error = sysctl_handle_int(oidp, &new_val, sizeof(new_val), req);
+
+	/* Read-only access, copy error, or no change: nothing to do. */
+	if (error != 0 || req->newptr == NULL || new_val == shadow_threshold)
+		return error;
+	if (new_val < 0) {
+		printf("Negative value are invalid!\n");
+		return EINVAL;
+	}
+	shadow_threshold = new_val;
+	return 0;
+}
+
+/*
+ * Called by sysctl to change the scheduler queue length.
+ * Returns EINVAL (leaving the length untouched) when the requested
+ * value is negative, instead of reporting success.
+ */
+static int
+sysctl_hybrid_qlen(SYSCTL_HANDLER_ARGS)
+{
+	int error, new_val = hybrid_qlen;
+
+	error = sysctl_handle_int(oidp, &new_val, sizeof(new_val), req);
+
+	/* Read-only access, copy error, or no change: nothing to do. */
+	if (error != 0 || req->newptr == NULL || new_val == hybrid_qlen)
+		return error;
+	if (new_val < 0) {
+		printf("Negative qlen is invalid\n");
+		return EINVAL;
+	}
+	hybrid_qlen = new_val;
+	return 0;
+}
+
+SYSCTL_DECL(_vfs_scheduler);
+SYSCTL_NODE(_vfs_scheduler, OID_AUTO, hybrid,
+	CTLFLAG_RW, 0, "Hybrid disk I/O scheduler");
+SYSCTL_PROC(_vfs_scheduler_hybrid, OID_AUTO, threshold,
+	CTLTYPE_INT | CTLFLAG_RW, &shadow_threshold, 0,
+	&sysctl_shadow_threshold,
+	"I", "Set shadow queue threashold");
+SYSCTL_PROC(_vfs_scheduler_hybrid, OID_AUTO, qlen,
+	CTLTYPE_INT | CTLFLAG_RW, &hybrid_qlen, 0,
+	&sysctl_hybrid_qlen,
+	"I", "Set PS scheduling unfairness");
+SYSCTL_INT(_vfs_scheduler_hybrid, OID_AUTO, verbose,
+	CTLTYPE_INT | CTLFLAG_RW, &verbose, 0, "verbose hybrid");
+
+SYSCTL_INT(_vfs_scheduler_hybrid, OID_AUTO, refcount,
+	CTLTYPE_INT, &hy_sched.refcount, 0, "refcount");
+/* end of file */
diff -urN --exclude=compile sys-orig/ufs/ufs/heap.c sys/ufs/ufs/heap.c
--- sys-orig/ufs/ufs/heap.c	Thu Jan 1 00:00:00 1970
+++ sys/ufs/ufs/heap.c	Wed Aug 31 17:22:10 2005
@@ -0,0 +1,308 @@
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * This module implements
+ * + heap management functions;
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include
+#include
+#include
+#include
+#include /* XXX */
+#include
+#include
+#include
+#include
+
+
+MALLOC_DEFINE(M_HEAP, "heap", "heap data structures"); /* XXX Check this */
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15	/* used as a mask: sizes round up to a multiple of 16 */
+
+/*
+ * DEBUGGING facility.
+ * Checks that every slot in use holds a non-NULL object and, when the
+ * heap tracks positions (offset > 0), that the index stored inside each
+ * object matches its slot. Panics with message s on the first mismatch.
+ */
+void
+check_heap(struct dn_heap *h, u_char * s)
+{
+	int i, error = 0 ;
+
+	for (i = 0 ; (i < h->elements) && (!error) ; i++) {
+		void *obj = h->p[i].object ;
+
+		error = 1 ;
+		if (obj == NULL)
+			printf("check_heap: null object") ;
+		else if (h->offset > 0 && *((int *)((char *)obj + h->offset)) != i)
+			printf("check_heap: internal offset %d index %d",
+				*((int *)((char *)obj + h->offset)),i) ;
+		else
+			error = 0 ;
+	}
+	/* Never hand a caller-supplied string to panic() as the format. */
+	if (error)
+		panic("%s", (const char *)s) ;
+	DEB(printf("%s: check_heap OK\n", s) );
+}
+
+/*
+ * Grow the entry array of heap h so it can hold at least new_size
+ * entries (rounded up to a multiple of HEAP_INCREMENT + 1). Existing
+ * entries are copied over and the old array is released.
+ * A request that does not enlarge the heap is logged and treated as
+ * success. Returns 1 if the allocation fails, 0 otherwise.
+ */
+int
+heap_init(struct dn_heap *h, int new_size)
+{
+	struct dn_heap_entry *p;
+
+	if (h->size >= new_size ) {
+		printf("heap_init, Bogus call, have %d want %d\n",
+			h->size, new_size);
+		return 0 ;
+	}
+	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+	/* M_NOWAIT is the malloc(9) flag; M_DONTWAIT belongs to mbuf(9). */
+	p = malloc(new_size * sizeof(*p), M_HEAP, M_NOWAIT );
+	if (p == NULL) {
+		printf(" heap_init, resize %d failed\n", new_size );
+		return 1 ;	/* error */
+	}
+	if (h->size > 0) {
+		bcopy(h->p, p, h->size * sizeof(*p) );
+		free(h->p, M_HEAP);
+	}
+	h->p = p ;
+	h->size = new_size ;
+	DDB(check_heap(h, "heap_init")) ;
+	return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+/*
+ * SET_OFFSET records, inside the object stored at slot 'node', the index
+ * of that slot. It is a no-op for heaps that do not track positions
+ * (offset <= 0). Wrapped in do { } while (0) so that it behaves as a
+ * single statement after an unbraced if/else.
+ */
+#define SET_OFFSET(heap, node)						\
+	do {								\
+		if ((heap)->offset > 0)					\
+			*((int *)((char *)((heap)->p[(node)].object) +	\
+			    (heap)->offset)) = (node);			\
+	} while (0)
+/*
+ * RESET_OFFSET sets offset to an invalid value.
+ */
+#define RESET_OFFSET(heap, node)					\
+	do {								\
+		if ((heap)->offset > 0)					\
+			*((int *)((char *)((heap)->p[(node)].object) +	\
+			    (heap)->offset)) = -1;			\
+	} while (0)
+
+int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+	int son ;
+
+	if (p == NULL)	/* data already there, set starting point */
+		son = key1 ;
+	else {	/* insert new element at the end, possibly resize */
+		son = h->elements ;
+		if (son == h->size) /* need resize... */
+			if (heap_init(h, h->elements+1) )
+				return 1 ; /* failure... */
+		h->p[son].object = p ;
+		h->p[son].key = key1 ;
+		h->elements++ ;
+	}
+	while (son > 0) {	/* bubble up */
+		int father = HEAP_FATHER(son) ;
+		struct dn_heap_entry tmp ;
+
+		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+			break ; /* found right position */
+		/* son smaller than father, swap and repeat */
+		HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+		SET_OFFSET(h, son);
+		son = father ;
+	}
+	SET_OFFSET(h, son);
+	return 0 ;
+}
+
+/*
+ * update element pointing to old_obj to point to new_obj.
+ * Only works on heaps that track positions (offset > 0): the slot is
+ * found through the index stored inside old_obj. Panics if old_obj is
+ * NULL, if the heap does not track positions, or if the stored index
+ * is out of range.
+ */
+void
+heap_modify(struct dn_heap *h, void *old_obj, void *new_obj)
+{
+	/* modify specific element, index is at offset */
+	if (old_obj != NULL && h->offset > 0) {
+		int index = *((int *)((char *)old_obj + h->offset)) ;
+
+		if (index < 0 || index >= h->elements) {
+			printf("heap_modify, index %d out of bound 0..%d\n",
+				index, h->elements);
+			panic("heap_modify");
+		}
+		h->p[index].object = new_obj ;
+	} else {
+		printf("heap_modify, null obj or index offset\n") ;
+		panic("heap_modify") ;
+	}
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL.
+ * When the heap does not track positions (offset <= 0), obj is located
+ * with a linear scan. Panics on an empty heap or if obj is not found.
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+	int child, father, max = h->elements - 1 ;
+
+	DDB(check_heap(h, "entering heap_extract")) ;
+	if (max < 0) {
+		/* DANIELE: extracting from an empty heap is fatal. */
+		panic("warning, extract from empty heap 0x%p\n", h);
+		return ;
+	}
+	father = 0 ; /* default: move up smallest child */
+	if (obj != NULL) { /* extract specific element, index is at offset */
+		if (h->offset > 0)
+			father = *((int *)((char *)obj + h->offset)) ;
+		else
+			for ( ; father < h->elements && h->p[father].object != obj ;
+			    father++ ) ;
+		if (father < 0 || father >= h->elements) {
+			printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+				father, h->elements);
+			panic("heap_extract");
+		}
+	}
+	RESET_OFFSET(h, father);
+	child = HEAP_LEFT(father) ;	/* left child */
+	while (child <= max) {	/* valid entry */
+		if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+			child = child+1 ;	/* take right child, otherwise left */
+		h->p[father] = h->p[child] ;
+		SET_OFFSET(h, father);
+		father = child ;
+		child = HEAP_LEFT(child) ;	/* left child for next loop */
+	}
+	h->elements-- ;
+	if (father != max) {
+		/*
+		 * Fill hole with last entry and bubble up, reusing the insert code
+		 */
+		h->p[father] = h->p[max] ;
+		heap_insert(h, father, NULL); /* this one cannot fail */
+	}
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+	int temp;
+	int i ;
+	int max = h->elements-1 ;
+	struct dn_heap_entry buf ;
+
+	if (h->offset <= 0)
+		panic("cannot move items on this heap");
+
+	i = *((int *)((char *)object + h->offset));
+	if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+		h->p[i].key = new_key ;
+		for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+		    i = temp ) { /* bubble up */
+			HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+			SET_OFFSET(h, i);
+		}
+	} else {	/* must move down */
+		h->p[i].key = new_key ;
+		while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+			if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+				temp++ ; /* select child with min key */
+			if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+				HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+				SET_OFFSET(h, i);
+			} else
+				break ;
+			i = temp ;
+		}
+	}
+	SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ * Each slot in turn is bubbled up from its current position.
+ */
+void
+heapify(struct dn_heap *h)
+{
+	int i ;
+
+	for (i = 0 ; i < h->elements ; i++ )
+		heap_insert(h, i , NULL) ;
+}
+
+/*
+ * cleanup the heap and free data structure.
+ * The descriptor itself is zeroed, not freed.
+ */
+void
+heap_free(struct dn_heap *h)
+{
+	if (h->size >0 )
+		free(h->p, M_HEAP);
+	bzero(h, sizeof(*h) );
+}
+
+/*DANIELE*/
+/*
+ * Search the heap for an element with a specific key value.
+ * Returns the object of the first matching entry in array order, or
+ * NULL if no entry has that key (including when the heap is empty).
+ *
+ * A heap only orders each parent against its children, so a key may
+ * sit in either subtree; descending along one path as in a search tree
+ * misses entries that are present. Every slot in use is therefore
+ * examined.
+ */
+void *
+heap_find(struct dn_heap *h, dn_key key)
+{
+	int i;
+
+	for (i = 0; i < h->elements; i++)
+		if (h->p[i].key == key)	/* found right element */
+			return h->p[i].object;
+	/* no item found */
+	return NULL;
+}
+
+/*
+ * --- end of heap management functions ---
+ */
diff -urN --exclude=compile sys-orig/ufs/ufs/heap.h sys/ufs/ufs/heap.h
--- sys-orig/ufs/ufs/heap.h	Thu Jan 1 00:00:00 1970
+++ sys/ufs/ufs/heap.h	Wed Aug 31 17:22:10 2005
@@ -0,0 +1,69 @@
+#ifndef _HEAP_H
+#define _HEAP_H
+
+/*
+ * Definition of heap data structures. In the structures, I decided
+ * not to use the macros in <sys/queue.h> in the hope of making the code
+ * easier to port to other architectures. The type of lists and queue we
+ * use here is pretty simple anyways.
+ */
+
+/*
+ * So we use a key "dn_key" which is 64 bits. Some macros are used to
+ * compare key values and handle wraparounds.
+ * MAX64 returns the largest of two key values.
+ * MY_M is used as a shift count when doing fixed point arithmetic
+ * (a better name would be useful...).
+ */
+
+typedef u_int64_t dn_key ;	/* sorting key */
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0)
+/* Fully parenthesized so it can be used inside larger expressions. */
+#define MAX64(x,y) ((( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x))
+/*
+ * The OFFSET_OF macro is used to return the offset of a field within
+ * a structure. It is used by the heap management routines.
+ */
+#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
+
+/*
+ * A heap entry is made of a key and a pointer to the actual
+ * object stored in the heap.
+ * The heap is an array of dn_heap_entry entries, dynamically allocated.
+ * Current size is "size", with "elements" actually in use.
+ * The heap normally supports only ordered insert and extract from the top.
+ * If we want to extract an object from the middle of the heap, we
+ * have to know where the object itself is located in the heap (or we
+ * need to scan the whole array). To this purpose, an object has a
+ * field (int) which contains the index of the object itself into the
+ * heap. When the object is moved, the field must also be updated.
+ * The offset of the index in the object is stored in the 'offset'
+ * field in the heap descriptor. The assumption is that this offset
+ * is non-zero if we want to support extract from the middle.
+ */
+struct dn_heap_entry {
+	dn_key key ; /* sorting key. Topmost element is smallest one */
+	void *object ; /* object pointer */
+} ;
+
+struct dn_heap {
+	int size ; /* number of entries allocated in p */
+	int elements ; /* number of entries currently in use */
+	int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
+	struct dn_heap_entry *p ; /* really an array of "size" entries */
+} ;
+
+int heap_init(struct dn_heap *h, int size) ; /* returns 1 on error, 0 on success */
+int heap_insert (struct dn_heap *h, dn_key key1, void *p); /* returns 1 on failure */
+void heap_modify(struct dn_heap *h, void *old, void *new);
+void heap_extract(struct dn_heap *h, void *obj); /* top element if obj == NULL */
+void heapify(struct dn_heap *h);
+void heap_free(struct dn_heap *h);
+/*DANIELE XXX */
+void* heap_find(struct dn_heap *h, dn_key key); /* NULL if key not found */
+void check_heap(struct dn_heap *h, u_char * s); /* debugging */
+
+
+#endif /* _HEAP_H */
diff -urN --exclude=compile sys-orig/vm/vm_pager.c sys/vm/vm_pager.c
--- sys-orig/vm/vm_pager.c	Wed Aug 31 17:21:33 2005
+++ sys/vm/vm_pager.c	Wed Aug 31 18:21:56 2005
@@ -297,6 +297,7 @@
 	bp->b_ioflags = 0;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;
+	add_class(bp, curproc, "initpbuf");
 	BUF_LOCK(bp, LK_EXCLUSIVE, NULL);
 }