Index: sys/pool.h =================================================================== RCS file: /cvs/src/sys/sys/pool.h,v retrieving revision 1.47 diff -u -p -r1.47 pool.h --- sys/pool.h 2 Jul 2014 00:23:36 -0000 1.47 +++ sys/pool.h 25 Aug 2014 09:30:41 -0000 @@ -66,6 +66,11 @@ struct kinfo_pool { unsigned long pr_nidle; /* # of idle pages */ }; +#define PR_WAITOK 0x0001 /* M_WAITOK */ +#define PR_NOWAIT 0x0002 /* M_NOWAIT */ +#define PR_LIMITFAIL 0x0004 /* M_CANFAIL */ +#define PR_ZERO 0x0008 /* M_ZERO */ + #if defined(_KERNEL) || defined(_LIBKVM) #include @@ -73,74 +78,63 @@ struct kinfo_pool { #include struct pool; +struct pool_request; +TAILQ_HEAD(pool_requests, pool_request); struct pool_allocator { void *(*pa_alloc)(struct pool *, int, int *); void (*pa_free)(struct pool *, void *); int pa_pagesz; - int pa_pagemask; - int pa_pageshift; }; -LIST_HEAD(pool_pagelist, pool_item_header); +TAILQ_HEAD(pool_page_list, pool_page_header); struct pool { struct mutex pr_mtx; SIMPLEQ_ENTRY(pool) pr_poollist; - struct pool_pagelist + + struct pool_page_list pr_emptypages; /* Empty pages */ - struct pool_pagelist + struct pool_page_list pr_fullpages; /* Full pages */ - struct pool_pagelist + struct pool_page_list pr_partpages; /* Partially-allocated pages */ - struct pool_item_header * + struct pool_page_header * pr_curpage; + + unsigned int pr_serial; /* Unique serial number of the pool */ + unsigned int pr_size; /* Size of item */ unsigned int pr_align; /* Requested alignment, must be 2^n */ - unsigned int pr_itemoffset; /* Align this offset in item */ + unsigned int pr_pgsize; /* Size of a "page" */ + vaddr_t pr_pgmask; /* Mask with an item to get a page */ + unsigned int pr_itemsperpg; /* # items that fit in a page */ + unsigned int pr_minitems; /* minimum # of items to keep */ unsigned int pr_minpages; /* same in page units */ unsigned int pr_maxpages; /* maximum # of idle pages to keep */ unsigned int pr_npages; /* # of pages allocated */ - unsigned int pr_itemsperpage;/* # items that fit in a page */ - unsigned int pr_slack; /* unused space in a page */ unsigned int pr_nitems; /* number of available items in pool */ unsigned int pr_nout; /* # items currently allocated */ unsigned int pr_hardlimit; /* hard limit to number of allocated items */ - unsigned int pr_serial; /* unique serial number of the pool */ - struct pool_allocator * - pr_alloc; /* backend allocator */ - const char * pr_wchan; /* tsleep(9) identifier */ - unsigned int pr_flags; /* r/w flags */ - unsigned int pr_roflags; /* r/o flags */ -#define PR_WAITOK 0x0001 /* M_WAITOK */ -#define PR_NOWAIT 0x0002 /* M_NOWAIT */ -#define PR_LIMITFAIL 0x0004 /* M_CANFAIL */ -#define PR_ZERO 0x0008 /* M_ZERO */ -#define PR_WANTED 0x0100 -#define PR_PHINPAGE 0x0200 -#define PR_LOGGING 0x0400 -#define PR_DEBUG 0x0800 -#define PR_DEBUGCHK 0x1000 int pr_ipl; + const char *pr_wchan; /* sleep(9) identifier */ + const struct pool_allocator + *pr_alloc; /* Backend allocator */ + const struct kmem_pa_mode + *pr_crange; /* Physical memory configuration. */ - RB_HEAD(phtree, pool_item_header) - pr_phtree; - - int pr_maxcolor; /* Cache colouring */ - int pr_curcolor; int pr_phoffset; /* Offset in page of page header */ + RB_HEAD(phtree, pool_page_header) + pr_phtree; /* Page header tree */ - /* - * Warning message to be issued, and a per-time-delta rate cap, - * if the hard limit is reached. 
- */ - const char *pr_hardlimit_warning; - struct timeval pr_hardlimit_ratecap; - struct timeval pr_hardlimit_warning_last; + struct mutex pr_requests_mtx; + struct pool_requests + pr_requests; + unsigned int pr_requesting; /* * Instrumentation @@ -152,19 +146,23 @@ struct pool { unsigned long pr_npagefree; /* # of pages released */ unsigned int pr_hiwat; /* max # of pages in pool */ unsigned long pr_nidle; /* # of idle pages */ - - /* Physical memory configuration. */ - const struct kmem_pa_mode * - pr_crange; }; #endif /* _KERNEL || _LIBKVM */ #ifdef _KERNEL - extern struct pool_allocator pool_allocator_nointr; -/* these functions are not locked */ +struct pool_request { + TAILQ_ENTRY(pool_request) pr_entry; + void (*pr_handler)(void *, void *); + void *pr_cookie; + void *pr_item; +}; + +void pool_request_init(struct pool_request *, + void (*)(void *, void *), void *); + void pool_init(struct pool *, size_t, u_int, u_int, int, const char *, struct pool_allocator *); void pool_destroy(struct pool *); @@ -172,17 +170,22 @@ void pool_setipl(struct pool *, int); void pool_setlowat(struct pool *, int); void pool_sethiwat(struct pool *, int); int pool_sethardlimit(struct pool *, u_int, const char *, int); -struct uvm_constraint_range; /* XXX */ void pool_set_constraints(struct pool *, const struct kmem_pa_mode *mode); -/* these functions are locked */ void *pool_get(struct pool *, int) __malloc; +void pool_request(struct pool *, struct pool_request *); void pool_put(struct pool *, void *); int pool_reclaim(struct pool *); void pool_reclaim_all(void); int pool_prime(struct pool *, int); +/* the allocator for dma-able memory is a thin layer on top of pool */ +void dma_alloc_init(void); +void *dma_alloc(size_t size, int flags); +void dma_free(void *m, size_t size); +#endif /* _KERNEL */ + #ifdef DDB /* * Debugging and diagnostic aides. @@ -192,11 +195,5 @@ void pool_printit(struct pool *, const void pool_walk(struct pool *, int, int (*)(const char *, ...), void (*)(void *, int, int (*)(const char *, ...))); #endif - -/* the allocator for dma-able memory is a thin layer on top of pool */ -void dma_alloc_init(void); -void *dma_alloc(size_t size, int flags); -void dma_free(void *m, size_t size); -#endif /* _KERNEL */ #endif /* _SYS_POOL_H_ */ Index: kern/subr_pool.c =================================================================== RCS file: /cvs/src/sys/kern/subr_pool.c,v retrieving revision 1.147 diff -u -p -r1.147 subr_pool.c --- kern/subr_pool.c 20 Aug 2014 00:00:46 -0000 1.147 +++ kern/subr_pool.c 25 Aug 2014 09:30:41 -0000 @@ -58,97 +58,95 @@ * an internal pool of page headers (`phpool'). */ -/* List of all pools */ -SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head); +struct pool_item { + u_long pi_magic; + XSIMPLEQ_ENTRY(pool_item) pi_list; +}; +XSIMPLEQ_HEAD(pool_items, pool_item); + +struct pool_page_header { + RB_ENTRY(pool_page_header) + ph_node; /* Off-page page headers */ + TAILQ_ENTRY(pool_page_header) + ph_entry; /* pool page list */ + struct pool_items ph_itemlist; /* chunk list for this page */ + caddr_t ph_addr; /* this page's address */ + u_long ph_magic; + u_int ph_nmissing; /* # of chunks in use */ + int ph_emptied; +}; + +#define POOL_INPGHDR(_pp) ((_pp)->pr_phoffset != -1) /* + * Global pool data + */ + +struct rwlock pool_lock = RWLOCK_INITIALIZER("pools"); +SIMPLEQ_HEAD(, pool) pool_list = SIMPLEQ_HEAD_INITIALIZER(pool_list); +int pool_count = 0; +/* * Every pool gets a unique serial number assigned to it. 
If this counter * wraps, we're screwed, but we shouldn't create so many pools anyway. */ unsigned int pool_serial; -unsigned int pool_count; - -/* Lock the previous variables making up the global pool state */ -struct rwlock pool_lock = RWLOCK_INITIALIZER("pools"); /* Private pool for page header structures */ struct pool phpool; +int pool_debug = 0; /* XXX for sysctl */ -struct pool_item_header { - /* Page headers */ - LIST_ENTRY(pool_item_header) - ph_pagelist; /* pool page list */ - XSIMPLEQ_HEAD(,pool_item) ph_itemlist; /* chunk list for this page */ - RB_ENTRY(pool_item_header) - ph_node; /* Off-page page headers */ - int ph_nmissing; /* # of chunks in use */ - caddr_t ph_page; /* this page's address */ - caddr_t ph_colored; /* page's colored address */ - int ph_pagesize; - int ph_magic; -}; - -struct pool_item { - u_int32_t pi_magic; - /* Other entries use only this list entry */ - XSIMPLEQ_ENTRY(pool_item) pi_list; -}; - -#ifdef POOL_DEBUG -int pool_debug = 1; -#else -int pool_debug = 0; -#endif +int pool_catchup(struct pool *); -#define POOL_NEEDS_CATCHUP(pp) \ - ((pp)->pr_nitems < (pp)->pr_minitems) +struct pool_page_header * + pool_p_alloc(struct pool *, int); +void pool_p_insert(struct pool *, struct pool_page_header *); +void pool_p_remove(struct pool *, struct pool_page_header *); +void pool_p_free(struct pool *, struct pool_page_header *); -int pool_catchup(struct pool *); -void pool_prime_page(struct pool *, caddr_t, struct pool_item_header *); void pool_update_curpage(struct pool *); -void pool_swizzle_curpage(struct pool *); void *pool_do_get(struct pool *, int); -void pool_do_put(struct pool *, void *); -void pr_rmpage(struct pool *, struct pool_item_header *, - struct pool_pagelist *); -int pool_chk_page(struct pool *, struct pool_item_header *, int); -int pool_chk(struct pool *); -struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t , int); +void pool_get_done(void *, void *); +void pool_runqueue(struct pool *, int); void *pool_allocator_alloc(struct pool *, int, int *); void pool_allocator_free(struct pool *, void *); /* - * XXX - quick hack. For pools with large items we want to use a special - * allocator. For now, instead of having the allocator figure out - * the allocation size from the pool (which can be done trivially - * with round_page(pr_itemsperpage * pr_size)) which would require - * lots of changes everywhere, we just create allocators for each - * size. We limit those to 128 pages. + * The default pool allocator. + */ +void *pool_page_alloc(struct pool *, int, int *); +void pool_page_free(struct pool *, void *); + +/* + * safe for interrupts, name preserved for compat this is the default + * allocator */ -#define POOL_LARGE_MAXPAGES 128 -struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES]; -struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES]; +struct pool_allocator pool_allocator_nointr = { + pool_page_alloc, + pool_page_free +}; + void *pool_large_alloc(struct pool *, int, int *); void pool_large_free(struct pool *, void *); -void *pool_large_alloc_ni(struct pool *, int, int *); -void pool_large_free_ni(struct pool *, void *); +struct pool_allocator pool_allocator_large = { + pool_large_alloc, + pool_large_free +}; -#ifdef DDB -void pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...) - __attribute__((__format__(__kprintf__,1,2)))); -void pool_print1(struct pool *, const char *, int (*)(const char *, ...) 
- __attribute__((__format__(__kprintf__,1,2)))); -#endif +void *pool_large_ni_alloc(struct pool *, int, int *); +void pool_large_ni_free(struct pool *, void *); -#define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0) +struct pool_allocator pool_allocator_large_ni = { + pool_large_ni_alloc, + pool_large_ni_free +}; static inline int -phtree_compare(struct pool_item_header *a, struct pool_item_header *b) +phtree_compare(struct pool_page_header *a, struct pool_page_header *b) { - vaddr_t va = (vaddr_t)a->ph_page; - vaddr_t vb = (vaddr_t)b->ph_page; + vaddr_t va = (vaddr_t)a->ph_addr; + vaddr_t vb = (vaddr_t)b->ph_addr; /* the compares in this order are important for the NFIND to work */ if (vb < va) @@ -159,84 +157,36 @@ phtree_compare(struct pool_item_header * return (0); } -RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare); -RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare); +RB_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare); +RB_GENERATE(phtree, pool_page_header, ph_node, phtree_compare); /* * Return the pool page header based on page address. */ -static inline struct pool_item_header * -pr_find_pagehead(struct pool *pp, void *v) +static inline struct pool_page_header * +pool_p_find(struct pool *pp, void *v) { - struct pool_item_header *ph, key; + struct pool_page_header *ph, key; - if ((pp->pr_roflags & PR_PHINPAGE) != 0) { + if (POOL_INPGHDR(pp)) { caddr_t page; - - page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask); - - return ((struct pool_item_header *)(page + pp->pr_phoffset)); + page = (caddr_t)((vaddr_t)v & pp->pr_pgmask); + return ((struct pool_page_header *)(page + pp->pr_phoffset)); } - key.ph_page = v; + key.ph_addr = (caddr_t)v; ph = RB_NFIND(phtree, &pp->pr_phtree, &key); - if (ph == NULL) { - panic("pr_find_pagehead: %s: page header missing", - pp->pr_wchan); - } + if (ph == NULL) + panic("pool_p_find: %s: page header missing", pp->pr_wchan); - KASSERT(ph->ph_page <= (caddr_t)v); - if (ph->ph_page + ph->ph_pagesize <= (caddr_t)v) { - panic("pr_find_pagehead: %s: incorrect page", - pp->pr_wchan); - } + KASSERT(ph->ph_addr <= (caddr_t)v); + if (ph->ph_addr + pp->pr_pgsize <= (caddr_t)v) + panic("pool_p_find: %s: incorrect page", pp->pr_wchan); return (ph); } /* - * Remove a page from the pool. - */ -void -pr_rmpage(struct pool *pp, struct pool_item_header *ph, - struct pool_pagelist *pq) -{ - - /* - * If the page was idle, decrement the idle page count. - */ - if (ph->ph_nmissing == 0) { -#ifdef DIAGNOSTIC - if (pp->pr_nidle == 0) - panic("pr_rmpage: nidle inconsistent"); - if (pp->pr_nitems < pp->pr_itemsperpage) - panic("pr_rmpage: nitems inconsistent"); -#endif - pp->pr_nidle--; - } - - pp->pr_nitems -= pp->pr_itemsperpage; - - /* - * Unlink a page from the pool and release it (or queue it for release). - */ - LIST_REMOVE(ph, ph_pagelist); - if ((pp->pr_roflags & PR_PHINPAGE) == 0) - RB_REMOVE(phtree, &pp->pr_phtree, ph); - pp->pr_npages--; - pp->pr_npagefree++; - pool_update_curpage(pp); - - if (pq) { - LIST_INSERT_HEAD(pq, ph, ph_pagelist); - } else { - pool_allocator_free(pp, ph->ph_page); - if ((pp->pr_roflags & PR_PHINPAGE) == 0) - pool_put(&phpool, ph); - } -} - -/* * Initialize the given pool resource structure. 
* * We export this routine to allow other kernel parts to declare @@ -246,57 +196,15 @@ void pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags, const char *wchan, struct pool_allocator *palloc) { - int off, slack; + unsigned int pgsize = PAGE_SIZE, items; #ifdef DIAGNOSTIC struct pool *iter; + KASSERT(ioff == 0); #endif -#ifdef MALLOC_DEBUG - if ((flags & PR_DEBUG) && (ioff != 0 || align != 0)) - flags &= ~PR_DEBUG; -#endif /* * Check arguments and construct default values. */ - if (palloc == NULL) { - if (size > PAGE_SIZE) { - int psize; - - /* - * XXX - should take align into account as well. - */ - if (size == round_page(size)) - psize = size / PAGE_SIZE; - else - psize = PAGE_SIZE / roundup(size % PAGE_SIZE, - 1024); - if (psize > POOL_LARGE_MAXPAGES) - psize = POOL_LARGE_MAXPAGES; - if (flags & PR_WAITOK) - palloc = &pool_allocator_large_ni[psize-1]; - else - palloc = &pool_allocator_large[psize-1]; - if (palloc->pa_pagesz == 0) { - palloc->pa_pagesz = psize * PAGE_SIZE; - if (flags & PR_WAITOK) { - palloc->pa_alloc = pool_large_alloc_ni; - palloc->pa_free = pool_large_free_ni; - } else { - palloc->pa_alloc = pool_large_alloc; - palloc->pa_free = pool_large_free; - } - } - } else { - palloc = &pool_allocator_nointr; - } - } - if (palloc->pa_pagesz == 0) { - palloc->pa_pagesz = PAGE_SIZE; - } - if (palloc->pa_pagemask == 0) { - palloc->pa_pagemask = ~(palloc->pa_pagesz - 1); - palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1; - } if (align == 0) align = ALIGN(1); @@ -305,100 +213,68 @@ pool_init(struct pool *pp, size_t size, size = sizeof(struct pool_item); size = roundup(size, align); -#ifdef DIAGNOSTIC - if (size > palloc->pa_pagesz) - panic("pool_init: pool item size (%lu) too large", - (u_long)size); -#endif + + if (palloc == NULL) { + while (size * 8 > pgsize) + pgsize <<= 1; + + if (pgsize > PAGE_SIZE) { + palloc = ISSET(flags, PR_WAITOK) ? + &pool_allocator_large_ni : &pool_allocator_large; + } else + palloc = &pool_allocator_nointr; + } else + pgsize = palloc->pa_pagesz ? palloc->pa_pagesz : PAGE_SIZE; + + items = pgsize / size; + KASSERT(items > 0); /* * Initialize the pool structure. */ - LIST_INIT(&pp->pr_emptypages); - LIST_INIT(&pp->pr_fullpages); - LIST_INIT(&pp->pr_partpages); - pp->pr_curpage = NULL; - pp->pr_npages = 0; - pp->pr_minitems = 0; - pp->pr_minpages = 0; - pp->pr_maxpages = 8; - pp->pr_roflags = flags; - pp->pr_flags = 0; + memset(pp, 0, sizeof(*pp)); + mtx_init(&pp->pr_mtx, IPL_NONE); + TAILQ_INIT(&pp->pr_emptypages); + TAILQ_INIT(&pp->pr_fullpages); + TAILQ_INIT(&pp->pr_partpages); pp->pr_size = size; - pp->pr_align = align; + pp->pr_pgsize = pgsize; + pp->pr_pgmask = ~0UL ^ (pgsize - 1); + pp->pr_itemsperpg = items; pp->pr_wchan = wchan; pp->pr_alloc = palloc; - pp->pr_nitems = 0; - pp->pr_nout = 0; pp->pr_hardlimit = UINT_MAX; - pp->pr_hardlimit_warning = NULL; - pp->pr_hardlimit_ratecap.tv_sec = 0; - pp->pr_hardlimit_ratecap.tv_usec = 0; - pp->pr_hardlimit_warning_last.tv_sec = 0; - pp->pr_hardlimit_warning_last.tv_usec = 0; - - /* - * Decide whether to put the page header off page to avoid - * wasting too large a part of the page. Off-page page headers - * go into an RB tree, so we can match a returned item with - * its header based on the page address. 
- * We use 1/16 of the page size as the threshold (XXX: tune) - */ - if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) { - /* Use the end of the page for the page header */ - pp->pr_roflags |= PR_PHINPAGE; - pp->pr_phoffset = off = palloc->pa_pagesz - - ALIGN(sizeof(struct pool_item_header)); - } else { - /* The page header will be taken from our page header pool */ - pp->pr_phoffset = 0; - off = palloc->pa_pagesz; - RB_INIT(&pp->pr_phtree); + pp->pr_ipl = IPL_NONE; + pp->pr_phoffset = -1; + RB_INIT(&pp->pr_phtree); + TAILQ_INIT(&pp->pr_requests); + mtx_init(&pp->pr_requests_mtx, IPL_NONE); + pp->pr_requesting = 0; + + pgsize = size * items; + if (pp->pr_pgsize - pgsize > sizeof(struct pool_page_header)) + pp->pr_phoffset = pgsize; + else if (sizeof(struct pool_page_header) * 2 >= size) { + pp->pr_itemsperpg = + (pp->pr_pgsize - sizeof(struct pool_page_header)) / size; + pp->pr_phoffset = pp->pr_itemsperpg * size; } - /* - * Alignment is to take place at `ioff' within the item. This means - * we must reserve up to `align - 1' bytes on the page to allow - * appropriate positioning of each item. - * - * Silently enforce `0 <= ioff < align'. - */ - pp->pr_itemoffset = ioff = ioff % align; - pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size; - KASSERT(pp->pr_itemsperpage != 0); - - /* - * Use the slack between the chunks and the page header - * for "cache coloring". - */ - slack = off - pp->pr_itemsperpage * pp->pr_size; - pp->pr_maxcolor = (slack / align) * align; - pp->pr_curcolor = 0; - - pp->pr_nget = 0; - pp->pr_nfail = 0; - pp->pr_nput = 0; - pp->pr_npagealloc = 0; - pp->pr_npagefree = 0; - pp->pr_hiwat = 0; - pp->pr_nidle = 0; - - pp->pr_ipl = -1; - mtx_init(&pp->pr_mtx, IPL_NONE); + /* pglistalloc/constraint parameters */ + pp->pr_crange = &kp_dirty; + /* this can race, but its unlikely the first pools attach together */ if (phpool.pr_size == 0) { - pool_init(&phpool, sizeof(struct pool_item_header), 0, 0, + pool_init(&phpool, sizeof(struct pool_page_header), 0, 0, 0, "phpool", NULL); pool_setipl(&phpool, IPL_HIGH); } - /* pglistalloc/constraint parameters */ - pp->pr_crange = &kp_dirty; - /* Insert this into the list of all pools. 
*/ rw_enter_write(&pool_lock); + #ifdef DIAGNOSTIC - SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) { + SIMPLEQ_FOREACH(iter, &pool_list, pr_poollist) { if (iter == pp) panic("init pool already on list"); } @@ -407,9 +283,10 @@ pool_init(struct pool *pp, size_t size, pp->pr_serial = ++pool_serial; if (pool_serial == 0) panic("pool_init: too much uptime"); + SIMPLEQ_INSERT_HEAD(&pool_list, pp, pr_poollist); - SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist); pool_count++; + rw_exit_write(&pool_lock); } @@ -418,6 +295,7 @@ pool_setipl(struct pool *pp, int ipl) { pp->pr_ipl = ipl; mtx_init(&pp->pr_mtx, ipl); + mtx_init(&pp->pr_requests_mtx, ipl); } /* @@ -426,412 +304,311 @@ pool_setipl(struct pool *pp, int ipl) void pool_destroy(struct pool *pp) { - struct pool_item_header *ph; + struct pool_page_header *ph; struct pool *prev, *iter; - /* Remove from global pool list */ +#ifdef DIAGNOSTIC + if (pp->pr_nout != 0) { + panic("pool_destroy: %s pool busy: %u still out", + pp->pr_wchan, pp->pr_nout); + } +#endif + rw_enter_write(&pool_lock); - pool_count--; - if (pp == SIMPLEQ_FIRST(&pool_head)) - SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist); + if (pp == SIMPLEQ_FIRST(&pool_list)) + SIMPLEQ_REMOVE_HEAD(&pool_list, pr_poollist); else { - prev = SIMPLEQ_FIRST(&pool_head); - SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) { + prev = SIMPLEQ_FIRST(&pool_list); + SIMPLEQ_FOREACH(iter, &pool_list, pr_poollist) { if (iter == pp) { - SIMPLEQ_REMOVE_AFTER(&pool_head, prev, + SIMPLEQ_REMOVE_AFTER(&pool_list, prev, pr_poollist); - goto removed; + break; } prev = iter; } -#ifdef DIAGNOSTIC - panic("destroyed pool not on list"); -#endif } -removed: + pool_count--; rw_exit_write(&pool_lock); -#ifdef DIAGNOSTIC - if (pp->pr_nout != 0) - panic("pool_destroy: pool busy: still out: %u", pp->pr_nout); -#endif /* Remove all pages */ - while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) - pr_rmpage(pp, ph, NULL); - KASSERT(LIST_EMPTY(&pp->pr_fullpages)); - KASSERT(LIST_EMPTY(&pp->pr_partpages)); - + while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) { + pool_p_remove(pp, ph); + pool_p_free(pp, ph); + } + KASSERT(TAILQ_EMPTY(&pp->pr_fullpages)); + KASSERT(TAILQ_EMPTY(&pp->pr_partpages)); } -struct pool_item_header * -pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags) +void +pool_request_init(struct pool_request *pr, + void (*handler)(void *, void *), void *cookie) { - struct pool_item_header *ph; + pr->pr_handler = handler; + pr->pr_cookie = cookie; + pr->pr_item = NULL; +} - if ((pp->pr_roflags & PR_PHINPAGE) != 0) - ph = (struct pool_item_header *)(storage + pp->pr_phoffset); - else - ph = pool_get(&phpool, (flags & ~(PR_WAITOK | PR_ZERO)) | - PR_NOWAIT); -#ifdef DIAGNOSTIC - if (pool_debug && ph != NULL) - ph->ph_magic = poison_value(ph); -#endif - return (ph); +void +pool_request(struct pool *pp, struct pool_request *pr) +{ + mtx_enter(&pp->pr_requests_mtx); + TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry); + pool_runqueue(pp, PR_NOWAIT); + mtx_leave(&pp->pr_requests_mtx); } +struct pool_get_memory { + struct mutex mtx; + void * volatile v; +}; + /* - * Grab an item from the pool; must be called at appropriate spl level + * Grab an item from the pool. 
*/ void * pool_get(struct pool *pp, int flags) { - void *v; + void *v = NULL; KASSERT(flags & (PR_WAITOK | PR_NOWAIT)); - if ((flags & PR_WAITOK) != 0) { -#ifdef DIAGNOSTIC - assertwaitok(); - if (pool_debug == 2) - yield(); -#endif - if (!cold && pool_debug) { - KERNEL_UNLOCK(); - KERNEL_LOCK(); - } - } - mtx_enter(&pp->pr_mtx); -#ifdef POOL_DEBUG - if (pp->pr_roflags & PR_DEBUGCHK) { - if (pool_chk(pp)) - panic("before pool_get"); + if (pp->pr_nout >= pp->pr_hardlimit) { + if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL)) + goto fail; + } else if ((v = pool_do_get(pp, flags)) == NULL) { + if (ISSET(flags, PR_NOWAIT)) + goto fail; } -#endif - v = pool_do_get(pp, flags); -#ifdef POOL_DEBUG - if (pp->pr_roflags & PR_DEBUGCHK) { - if (pool_chk(pp)) - panic("after pool_get"); - } -#endif - if (v != NULL) - pp->pr_nget++; mtx_leave(&pp->pr_mtx); - if (v == NULL) - return (v); - if (flags & PR_ZERO) + if (v == NULL) { + struct pool_get_memory mem = + { MUTEX_INITIALIZER(pp->pr_ipl), NULL }; + struct pool_request pr; + + pool_request_init(&pr, pool_get_done, &mem); + pool_request(pp, &pr); + + mtx_enter(&mem.mtx); + while (mem.v == NULL) + msleep(&mem, &mem.mtx, PSWP, pp->pr_wchan, 0); + mtx_leave(&mem.mtx); + + v = mem.v; + } + + if (ISSET(flags, PR_ZERO)) memset(v, 0, pp->pr_size); return (v); + +fail: + pp->pr_nfail++; + mtx_leave(&pp->pr_mtx); + return (NULL); } -void * -pool_do_get(struct pool *pp, int flags) +void +pool_get_done(void *xmem, void *v) { - struct pool_item *pi; - struct pool_item_header *ph; - void *v; - int slowdown = 0; + struct pool_get_memory *mem = xmem; -#ifdef MALLOC_DEBUG - if (pp->pr_roflags & PR_DEBUG) { - void *addr; - - addr = NULL; - debug_malloc(pp->pr_size, M_DEBUG, - (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr); - return (addr); - } -#endif + mtx_enter(&mem->mtx); + mem->v = v; + mtx_leave(&mem->mtx); + + wakeup_one(mem); +} + +void +pool_runqueue(struct pool *pp, int flags) +{ + struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl); + struct pool_request *pr; -startover: - /* - * Check to see if we've reached the hard limit. If we have, - * and we can wait, then wait until an item has been returned to - * the pool. - */ #ifdef DIAGNOSTIC - if (pp->pr_nout > pp->pr_hardlimit) - panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan); + MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx); + MUTEX_ASSERT_LOCKED(&pp->pr_requests_mtx); #endif - if (pp->pr_nout == pp->pr_hardlimit) { - if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) { - /* - * XXX: A warning isn't logged in this case. Should - * it be? - */ - pp->pr_flags |= PR_WANTED; - pool_sleep(pp); - goto startover; - } - /* - * Log a message that the hard limit has been hit. - */ - if (pp->pr_hardlimit_warning != NULL && - ratecheck(&pp->pr_hardlimit_warning_last, - &pp->pr_hardlimit_ratecap)) - log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning); + if (pp->pr_requesting++) + return; - pp->pr_nfail++; - return (NULL); - } + do { + pp->pr_requesting = 1; - pool_swizzle_curpage(pp); - /* - * The convention we use is that if `curpage' is not NULL, then - * it points at a non-empty bucket. In particular, `curpage' - * never points at a page header which has PR_PHINPAGE set and - * has no items in its bucket. - */ - if ((ph = pp->pr_curpage) == NULL) { -#ifdef DIAGNOSTIC - if (pp->pr_nitems != 0) { - printf("pool_do_get: %s: curpage NULL, nitems %u\n", - pp->pr_wchan, pp->pr_nitems); - panic("pool_do_get: nitems inconsistent"); + /* no TAILQ_JOIN? 
:( */ + while ((pr = TAILQ_FIRST(&pp->pr_requests)) != NULL) { + TAILQ_REMOVE(&pp->pr_requests, pr, pr_entry); + TAILQ_INSERT_TAIL(&prl, pr, pr_entry); } -#endif + if (TAILQ_EMPTY(&prl)) + continue; - /* - * Call the back-end page allocator for more memory. - */ - v = pool_allocator_alloc(pp, flags, &slowdown); - if (v != NULL) - ph = pool_alloc_item_header(pp, v, flags); - - if (v == NULL || ph == NULL) { - if (v != NULL) - pool_allocator_free(pp, v); - - if ((flags & PR_WAITOK) == 0) { - pp->pr_nfail++; - return (NULL); - } + mtx_leave(&pp->pr_requests_mtx); + + mtx_enter(&pp->pr_mtx); + pr = TAILQ_FIRST(&prl); + while (pr != NULL) { + if (pp->pr_nout >= pp->pr_hardlimit) + break; + + pr->pr_item = pool_do_get(pp, flags); + if (pr->pr_item == NULL) + break; - /* - * Wait for items to be returned to this pool. - * - * XXX: maybe we should wake up once a second and - * try again? - */ - pp->pr_flags |= PR_WANTED; - pool_sleep(pp); - goto startover; + pr = TAILQ_NEXT(pr, pr_entry); } + mtx_leave(&pp->pr_mtx); - /* We have more memory; add it to the pool */ - pool_prime_page(pp, v, ph); - pp->pr_npagealloc++; - - if (slowdown && (flags & PR_WAITOK)) { - mtx_leave(&pp->pr_mtx); - yield(); - mtx_enter(&pp->pr_mtx); + while ((pr = TAILQ_FIRST(&prl)) != NULL && + pr->pr_item != NULL) { + TAILQ_REMOVE(&prl, pr, pr_entry); + (*pr->pr_handler)(pr->pr_cookie, pr->pr_item); } - /* Start the allocation process over. */ - goto startover; - } - if ((v = pi = XSIMPLEQ_FIRST(&ph->ph_itemlist)) == NULL) { - panic("pool_do_get: %s: page empty", pp->pr_wchan); + mtx_enter(&pp->pr_requests_mtx); + } while (--pp->pr_requesting); + + /* no TAILQ_JOIN :( */ + while ((pr = TAILQ_FIRST(&prl)) != NULL) { + TAILQ_REMOVE(&prl, pr, pr_entry); + TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry); } +} + +void * +pool_do_get(struct pool *pp, int flags) +{ + struct pool_item *pi; + struct pool_page_header *ph; + u_long magic; + #ifdef DIAGNOSTIC - if (pp->pr_nitems == 0) { - printf("pool_do_get: %s: items on itemlist, nitems %u\n", - pp->pr_wchan, pp->pr_nitems); - panic("pool_do_get: nitems inconsistent"); - } + MUTEX_ASSERT_LOCKED(&pp->pr_mtx); #endif -#ifdef DIAGNOSTIC - if (pi->pi_magic != poison_value(pi)) - panic("pool_do_get(%s): free list modified: " - "page %p; item addr %p; offset 0x%x=0x%x", - pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic); - if (pool_debug && ph->ph_magic) { - size_t pidx; - uint32_t pval; - if (poison_check(pi + 1, pp->pr_size - sizeof(*pi), - &pidx, &pval)) { - int *ip = (int *)(pi + 1); - panic("pool_do_get(%s): free list modified: " - "page %p; item addr %p; offset 0x%zx=0x%x", - pp->pr_wchan, ph->ph_page, pi, - pidx * sizeof(int), ip[pidx]); + /* + * Account for this item now to avoid races if we need to give up + * pr_mtx to allocate a page. + */ + pp->pr_nout++; + + if (pp->pr_curpage == NULL) { + mtx_leave(&pp->pr_mtx); + ph = pool_p_alloc(pp, flags); + mtx_enter(&pp->pr_mtx); + + if (ph == NULL) { + pp->pr_nout--; + return (NULL); } + + pool_p_insert(pp, ph); } -#endif /* DIAGNOSTIC */ - /* - * Remove from item list. - */ + ph = pp->pr_curpage; + pi = XSIMPLEQ_FIRST(&ph->ph_itemlist); + if (pi == NULL) + panic("pool_do_get: %s: page empty", pp->pr_wchan); + + magic = (u_long)pi ^ ph->ph_magic; + if (pi->pi_magic != magic) { + panic("pool_do_get: %s: %p bad magic (0x%lx != 0x%lx)", + pp->pr_wchan, pi, pi->pi_magic, magic); + } + /* Remove from item list. 
*/ XSIMPLEQ_REMOVE_HEAD(&ph->ph_itemlist, pi_list); - pp->pr_nitems--; - pp->pr_nout++; - if (ph->ph_nmissing == 0) { -#ifdef DIAGNOSTIC - if (pp->pr_nidle == 0) - panic("pool_do_get: nidle inconsistent"); -#endif - pp->pr_nidle--; + if (ph->ph_nmissing++ == 0) { /* * This page was previously empty. Move it to the list of * partially-full pages. This page is already curpage. */ - LIST_REMOVE(ph, ph_pagelist); - LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); + TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry); + TAILQ_INSERT_HEAD(&pp->pr_partpages, ph, ph_entry); + + pp->pr_nidle--; } - ph->ph_nmissing++; - if (XSIMPLEQ_EMPTY(&ph->ph_itemlist)) { -#ifdef DIAGNOSTIC - if (ph->ph_nmissing != pp->pr_itemsperpage) { - panic("pool_do_get: %s: nmissing inconsistent", - pp->pr_wchan); - } -#endif + + if (ph->ph_nmissing == pp->pr_itemsperpg) { /* * This page is now full. Move it to the full list * and select a new current page. */ - LIST_REMOVE(ph, ph_pagelist); - LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist); + TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry); + TAILQ_INSERT_HEAD(&pp->pr_fullpages, ph, ph_entry); pool_update_curpage(pp); } - /* - * If we have a low water mark and we are now below that low - * water mark, add more items to the pool. - */ - if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { - /* - * XXX: Should we log a warning? Should we set up a timeout - * to try again in a second or so? The latter could break - * a caller's assumptions about interrupt protection, etc. - */ - } - return (v); -} + pp->pr_nget++; -/* - * Return resource to the pool; must be called at appropriate spl level - */ -void -pool_put(struct pool *pp, void *v) -{ - mtx_enter(&pp->pr_mtx); -#ifdef POOL_DEBUG - if (pp->pr_roflags & PR_DEBUGCHK) { - if (pool_chk(pp)) - panic("before pool_put"); - } -#endif - pool_do_put(pp, v); -#ifdef POOL_DEBUG - if (pp->pr_roflags & PR_DEBUGCHK) { - if (pool_chk(pp)) - panic("after pool_put"); - } -#endif - pp->pr_nput++; - mtx_leave(&pp->pr_mtx); + return (pi); } -/* - * Internal version of pool_put(). - */ void -pool_do_put(struct pool *pp, void *v) +pool_put(struct pool *pp, void *v) { struct pool_item *pi = v; - struct pool_item_header *ph; + struct pool_page_header *ph, *freeph = NULL; + extern int ticks; +#ifdef DIAGNOSTIC if (v == NULL) panic("pool_put of NULL"); - -#ifdef MALLOC_DEBUG - if (pp->pr_roflags & PR_DEBUG) { - debug_free(v, M_DEBUG); - return; - } #endif -#ifdef DIAGNOSTIC - if (pp->pr_ipl != -1) - splassert(pp->pr_ipl); + mtx_enter(&pp->pr_mtx); - if (pp->pr_nout == 0) { - printf("pool %s: putting with none out\n", - pp->pr_wchan); - panic("pool_do_put"); - } -#endif + ph = pool_p_find(pp, v); - ph = pr_find_pagehead(pp, v); + pi->pi_magic = (u_long)pi ^ ph->ph_magic; + XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); /* - * Return to item list. + * If the page was previously completely full, move it to the + * partially-full list. 
*/ -#ifdef DIAGNOSTIC - if (pool_debug) { - struct pool_item *qi; - XSIMPLEQ_FOREACH(qi, &ph->ph_itemlist, pi_list) - if (pi == qi) - panic("double pool_put: %p", pi); - } - pi->pi_magic = poison_value(pi); - if (ph->ph_magic) { - poison_mem(pi + 1, pp->pr_size - sizeof(*pi)); - } -#endif /* DIAGNOSTIC */ - - XSIMPLEQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); - ph->ph_nmissing--; - pp->pr_nitems++; - pp->pr_nout--; - - /* Cancel "pool empty" condition if it exists */ - if (pp->pr_curpage == NULL) - pp->pr_curpage = ph; - - if (pp->pr_flags & PR_WANTED) { - pp->pr_flags &= ~PR_WANTED; - wakeup(pp); + if (ph->ph_nmissing-- == pp->pr_itemsperpg) { + TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_entry); + TAILQ_INSERT_HEAD(&pp->pr_partpages, ph, ph_entry); } /* - * If this page is now empty, do one of two things: - * - * (1) If we have more pages than the page high water mark, - * free the page back to the system. - * - * (2) Otherwise, move the page to the empty page list. + * If this page is now empty, move the page to the empty page list. * * Either way, select a new current page (so we use a partially-full * page if one is available). */ if (ph->ph_nmissing == 0) { pp->pr_nidle++; - if (pp->pr_nidle > pp->pr_maxpages) { - pr_rmpage(pp, ph, NULL); - } else { - LIST_REMOVE(ph, ph_pagelist); - LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); - pool_update_curpage(pp); - } + + ph->ph_emptied = ticks; + TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry); + TAILQ_INSERT_HEAD(&pp->pr_emptypages, ph, ph_entry); + pool_update_curpage(pp); } - /* - * If the page was previously completely full, move it to the - * partially-full list. - */ - else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) { - LIST_REMOVE(ph, ph_pagelist); - LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); + + pp->pr_nout--; + pp->pr_nput++; + + if (pp->pr_nidle > pp->pr_maxpages) { + ph = TAILQ_LAST(&pp->pr_emptypages, pool_page_list); + if (ticks - ph->ph_emptied > hz) { + pool_p_remove(pp, ph); + freeph = ph; + } } + mtx_leave(&pp->pr_mtx); + + mtx_enter(&pp->pr_requests_mtx); + pool_runqueue(pp, PR_NOWAIT); + mtx_leave(&pp->pr_requests_mtx); + + if (freeph != NULL) + pool_p_free(pp, ph); } /* @@ -840,223 +617,182 @@ pool_do_put(struct pool *pp, void *v) int pool_prime(struct pool *pp, int n) { - struct pool_item_header *ph; - caddr_t cp; + struct pool_page_list pl = TAILQ_HEAD_INITIALIZER(pl); + struct pool_page_header *ph; int newpages; - int slowdown; - mtx_enter(&pp->pr_mtx); - newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; + newpages = roundup(n, pp->pr_itemsperpg) / pp->pr_itemsperpg; while (newpages-- > 0) { - cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown); - if (cp != NULL) - ph = pool_alloc_item_header(pp, cp, PR_NOWAIT); - if (cp == NULL || ph == NULL) { - if (cp != NULL) - pool_allocator_free(pp, cp); + ph = pool_p_alloc(pp, PR_NOWAIT); + if (ph == NULL) break; - } - pool_prime_page(pp, cp, ph); - pp->pr_npagealloc++; - pp->pr_minpages++; + TAILQ_INSERT_HEAD(&pl, ph, ph_entry); } - if (pp->pr_minpages >= pp->pr_maxpages) - pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */ - + mtx_enter(&pp->pr_mtx); + while ((ph = TAILQ_FIRST(&pl)) != NULL) { + TAILQ_REMOVE(&pl, ph, ph_entry); + pool_p_insert(pp, ph); + } mtx_leave(&pp->pr_mtx); + return (0); } -/* - * Add a page worth of items to the pool. - * - * Note, we must be called with the pool descriptor LOCKED. 
- */ -void -pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph) +struct pool_page_header * +pool_p_alloc(struct pool *pp, int flags) { + struct pool_page_header *ph; struct pool_item *pi; - caddr_t cp = storage; - unsigned int align = pp->pr_align; - unsigned int ioff = pp->pr_itemoffset; - int n; + caddr_t addr; + int n, slowdown = 0; - /* - * Insert page header. - */ - LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); - XSIMPLEQ_INIT(&ph->ph_itemlist); - ph->ph_page = storage; - ph->ph_pagesize = pp->pr_alloc->pa_pagesz; - ph->ph_nmissing = 0; - if ((pp->pr_roflags & PR_PHINPAGE) == 0) - RB_INSERT(phtree, &pp->pr_phtree, ph); +#ifdef DIAGNOSTIC + MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx); + KASSERT(pp->pr_size >= sizeof(*pi)); +#endif - pp->pr_nidle++; + addr = pool_allocator_alloc(pp, flags, &slowdown); + if (addr == NULL) + return (NULL); - /* - * Color this page. - */ - cp = (caddr_t)(cp + pp->pr_curcolor); - if ((pp->pr_curcolor += align) > pp->pr_maxcolor) - pp->pr_curcolor = 0; + if (slowdown && ISSET(flags, PR_WAITOK)) + yield(); - /* - * Adjust storage to apply alignment to `pr_itemoffset' in each item. - */ - if (ioff != 0) - cp = (caddr_t)(cp + (align - ioff)); - ph->ph_colored = cp; + if (POOL_INPGHDR(pp)) + ph = (struct pool_page_header *)(addr + pp->pr_phoffset); + else { + ph = pool_get(&phpool, flags); + if (ph == NULL) { + pool_allocator_free(pp, addr); + return (NULL); + } + } - /* - * Insert remaining chunks on the bucket list. - */ - n = pp->pr_itemsperpage; - pp->pr_nitems += n; + XSIMPLEQ_INIT(&ph->ph_itemlist); + ph->ph_addr = addr; + ph->ph_nmissing = 0; + arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic)); + n = pp->pr_itemsperpg; while (n--) { - pi = (struct pool_item *)cp; + pi = (struct pool_item *)addr; + pi->pi_magic = (u_long)pi ^ ph->ph_magic; + XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list); + addr += pp->pr_size; + } - KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0); + return (ph); +} - /* Insert on page list */ - XSIMPLEQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list); +void +pool_p_free(struct pool *pp, struct pool_page_header *ph) +{ + struct pool_item *pi; + u_long magic; #ifdef DIAGNOSTIC - pi->pi_magic = poison_value(pi); - if (ph->ph_magic) { - poison_mem(pi + 1, pp->pr_size - sizeof(*pi)); + MUTEX_ASSERT_UNLOCKED(&pp->pr_mtx); + KASSERT(ph->ph_nmissing == 0); +#endif + + XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) { + magic = (u_long)pi ^ ph->ph_magic; + if (pi->pi_magic != magic) { + panic("pool_p_free: %s: %p bad magic (0x%lx != 0x%lx)", + pp->pr_wchan, pi, pi->pi_magic, magic); } -#endif /* DIAGNOSTIC */ - cp = (caddr_t)(cp + pp->pr_size); } - /* - * If the pool was depleted, point at the new page. - */ - if (pp->pr_curpage == NULL) - pp->pr_curpage = ph; + pool_allocator_free(pp, ph->ph_addr); - if (++pp->pr_npages > pp->pr_hiwat) - pp->pr_hiwat = pp->pr_npages; + if (!POOL_INPGHDR(pp)) + pool_put(&phpool, ph); } -/* - * Used by pool_get() when nitems drops below the low water mark. This - * is used to catch up pr_nitems with the low water mark. - * - * Note we never wait for memory here, we let the caller decide what to do. - */ -int -pool_catchup(struct pool *pp) +void +pool_p_insert(struct pool *pp, struct pool_page_header *ph) { - struct pool_item_header *ph; - caddr_t cp; - int error = 0; - int slowdown; +#ifdef DIAGNOSTIC + MUTEX_ASSERT_LOCKED(&pp->pr_mtx); +#endif - while (POOL_NEEDS_CATCHUP(pp)) { - /* - * Call the page back-end allocator for more memory. 
- */ - cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown); - if (cp != NULL) - ph = pool_alloc_item_header(pp, cp, PR_NOWAIT); - if (cp == NULL || ph == NULL) { - if (cp != NULL) - pool_allocator_free(pp, cp); - error = ENOMEM; - break; - } - pool_prime_page(pp, cp, ph); - pp->pr_npagealloc++; - } + /* If the pool was depleted, point at the new page */ + if (pp->pr_curpage == NULL) + pp->pr_curpage = ph; + + TAILQ_INSERT_HEAD(&pp->pr_emptypages, ph, ph_entry); + if (!POOL_INPGHDR(pp)) + RB_INSERT(phtree, &pp->pr_phtree, ph); + + pp->pr_nitems += pp->pr_itemsperpg; + pp->pr_nidle++; - return (error); + pp->pr_npagealloc++; + if (++pp->pr_npages > pp->pr_hiwat) + pp->pr_hiwat = pp->pr_npages; } void -pool_update_curpage(struct pool *pp) +pool_p_remove(struct pool *pp, struct pool_page_header *ph) { +#ifdef DIAGNOSTIC + MUTEX_ASSERT_LOCKED(&pp->pr_mtx); +#endif - pp->pr_curpage = LIST_FIRST(&pp->pr_partpages); - if (pp->pr_curpage == NULL) { - pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages); - } + pp->pr_npagefree++; + pp->pr_npages--; + pp->pr_nidle--; + pp->pr_nitems -= pp->pr_itemsperpg; + + if (!POOL_INPGHDR(pp)) + RB_REMOVE(phtree, &pp->pr_phtree, ph); + TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry); + + pool_update_curpage(pp); } void -pool_swizzle_curpage(struct pool *pp) +pool_update_curpage(struct pool *pp) { - struct pool_item_header *ph, *next; - - if ((ph = pp->pr_curpage) == NULL) - return; - if (arc4random_uniform(16) != 0) - return; - next = LIST_FIRST(&pp->pr_partpages); - if (next == ph) - next = LIST_NEXT(next, ph_pagelist); - if (next == NULL) { - next = LIST_FIRST(&pp->pr_emptypages); - if (next == ph) - next = LIST_NEXT(next, ph_pagelist); - } - if (next != NULL) - pp->pr_curpage = next; + pp->pr_curpage = TAILQ_FIRST(&pp->pr_partpages); + if (pp->pr_curpage == NULL) + pp->pr_curpage = TAILQ_FIRST(&pp->pr_emptypages); } void pool_setlowat(struct pool *pp, int n) { + int prime; + mtx_enter(&pp->pr_mtx); pp->pr_minitems = n; pp->pr_minpages = (n == 0) ? 0 - : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; + : roundup(n, pp->pr_itemsperpg) / pp->pr_itemsperpg; - mtx_enter(&pp->pr_mtx); - /* Make sure we're caught up with the newly-set low water mark. */ - if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { - /* - * XXX: Should we log a warning? Should we set up a timeout - * to try again in a second or so? The latter could break - * a caller's assumptions about interrupt protection, etc. - */ - } + prime = pp->pr_nitems - n; mtx_leave(&pp->pr_mtx); + + if (prime > 0) + pool_prime(pp, prime); } void pool_sethiwat(struct pool *pp, int n) { - pp->pr_maxpages = (n == 0) ? 
0 - : roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; + : roundup(n, pp->pr_itemsperpg) / pp->pr_itemsperpg; } int pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap) { - int error = 0; - - if (n < pp->pr_nout) { - error = EINVAL; - goto done; - } - pp->pr_hardlimit = n; - pp->pr_hardlimit_warning = warnmsg; - pp->pr_hardlimit_ratecap.tv_sec = ratecap; - pp->pr_hardlimit_warning_last.tv_sec = 0; - pp->pr_hardlimit_warning_last.tv_usec = 0; - -done: - return (error); + return (0); } void @@ -1073,14 +809,12 @@ pool_set_constraints(struct pool *pp, co int pool_reclaim(struct pool *pp) { - struct pool_item_header *ph, *phnext; - struct pool_pagelist pq; - - LIST_INIT(&pq); + struct pool_page_header *ph, *phnext; + struct pool_page_list pl = TAILQ_HEAD_INITIALIZER(pl); mtx_enter(&pp->pr_mtx); - for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { - phnext = LIST_NEXT(ph, ph_pagelist); + for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { + phnext = TAILQ_NEXT(ph, ph_entry); /* Check our minimum page claim */ if (pp->pr_npages <= pp->pr_minpages) @@ -1092,22 +826,20 @@ pool_reclaim(struct pool *pp) * If freeing this page would put us below * the low water mark, stop now. */ - if ((pp->pr_nitems - pp->pr_itemsperpage) < + if ((pp->pr_nitems - pp->pr_itemsperpg) < pp->pr_minitems) break; - pr_rmpage(pp, ph, &pq); + pool_p_remove(pp, ph); } mtx_leave(&pp->pr_mtx); - if (LIST_EMPTY(&pq)) + if (TAILQ_EMPTY(&pl)) return (0); - while ((ph = LIST_FIRST(&pq)) != NULL) { - LIST_REMOVE(ph, ph_pagelist); - pool_allocator_free(pp, ph->ph_page); - if (pp->pr_roflags & PR_PHINPAGE) - continue; - pool_put(&phpool, ph); + + while ((ph = TAILQ_FIRST(&pl)) != NULL) { + TAILQ_REMOVE(&pl, ph, ph_entry); + pool_p_free(pp, ph); } return (1); @@ -1120,10 +852,10 @@ pool_reclaim(struct pool *pp) void pool_reclaim_all(void) { - struct pool *pp; + struct pool *pp; rw_enter_read(&pool_lock); - SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) + SIMPLEQ_FOREACH(pp, &pool_list, pr_poollist) pool_reclaim(pp); rw_exit_read(&pool_lock); } @@ -1133,6 +865,13 @@ pool_reclaim_all(void) #include #include +void +pool_print1(struct pool *pp, const char *modif, + int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); +void +pool_print_pagelist(struct pool_page_list *pl, + int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))); + /* * Diagnostic helpers. */ @@ -1144,25 +883,24 @@ pool_printit(struct pool *pp, const char } void -pool_print_pagelist(struct pool_pagelist *pl, +pool_print_pagelist(struct pool_page_list *pl, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) { - struct pool_item_header *ph; -#ifdef DIAGNOSTIC + struct pool_page_header *ph; struct pool_item *pi; -#endif + u_long magic; - LIST_FOREACH(ph, pl, ph_pagelist) { - (*pr)("\t\tpage %p, nmissing %d\n", - ph->ph_page, ph->ph_nmissing); -#ifdef DIAGNOSTIC + TAILQ_FOREACH(ph, pl, ph_entry) { + (*pr)("\t\tpage %p, nmissing %d, magic 0x%lx\n", + ph->ph_addr, ph->ph_nmissing, ph->ph_magic); XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) { - if (pi->pi_magic != poison_value(pi)) { - (*pr)("\t\t\titem %p, magic 0x%x\n", - pi, pi->pi_magic); + (*pr)("\t\t\titem %p, magic 0x%lx", pi, pi->pi_magic); + magic = (u_long)pi ^ ph->ph_magic; + if (pi->pi_magic != magic) { + (*pr)(", magic is bad (!=0x%lx)", magic); } + (*pr)("\n"); } -#endif } } @@ -1170,7 +908,7 @@ void pool_print1(struct pool *pp, const char *modif, int (*pr)(const char *, ...) 
__attribute__((__format__(__kprintf__,1,2)))) { - struct pool_item_header *ph; + struct pool_page_header *ph; int print_pagelist = 0; char c; @@ -1180,14 +918,13 @@ pool_print1(struct pool *pp, const char modif++; } - (*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n", - pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset, - pp->pr_roflags); + (*pr)("POOL %s: size %u, align %u\n", + pp->pr_wchan, pp->pr_size, pp->pr_align); (*pr)("\talloc %p\n", pp->pr_alloc); (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); - (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", - pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); + (*pr)("\titemsperpg %u, nitems %u, nout %u, hardlimit %u\n", + pp->pr_itemsperpg, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); (*pr)("\n\tnget %lu, nfail %lu, nput %lu\n", pp->pr_nget, pp->pr_nfail, pp->pr_nput); @@ -1197,20 +934,20 @@ pool_print1(struct pool *pp, const char if (print_pagelist == 0) return; - if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) + if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) (*pr)("\n\tempty page list:\n"); pool_print_pagelist(&pp->pr_emptypages, pr); - if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL) + if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL) (*pr)("\n\tfull page list:\n"); pool_print_pagelist(&pp->pr_fullpages, pr); - if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL) + if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL) (*pr)("\n\tpartial-page list:\n"); pool_print_pagelist(&pp->pr_partpages, pr); if (pp->pr_curpage == NULL) (*pr)("\tno current page\n"); else - (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); + (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_addr); } void @@ -1245,7 +982,7 @@ db_show_all_pools(db_expr_t expr, int ha db_printf("%-12s %18s %18s\n", "Name", "Address", "Allocator"); - SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { + SIMPLEQ_FOREACH(pp, &pool_list, pr_poollist) { if (mode == 'a') { db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp, pp->pr_alloc); @@ -1282,140 +1019,47 @@ db_show_all_pools(db_expr_t expr, int ha PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages); PRWORD(ovflw, " %*s", 6, 1, maxp); PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle); - - pool_chk(pp); } } #endif /* DDB */ -#if defined(POOL_DEBUG) || defined(DDB) -int -pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected) -{ - struct pool_item *pi; - caddr_t page; - int n; - const char *label = pp->pr_wchan; - - page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask); - if (page != ph->ph_page && - (pp->pr_roflags & PR_PHINPAGE) != 0) { - printf("%s: ", label); - printf("pool(%p:%s): page inconsistency: page %p; " - "at page head addr %p (p %p)\n", - pp, pp->pr_wchan, ph->ph_page, ph, page); - return 1; - } - - for (pi = XSIMPLEQ_FIRST(&ph->ph_itemlist), n = 0; - pi != NULL; - pi = XSIMPLEQ_NEXT(&ph->ph_itemlist, pi, pi_list), n++) { - -#ifdef DIAGNOSTIC - if (pi->pi_magic != poison_value(pi)) { - printf("%s: ", label); - printf("pool(%s): free list modified: " - "page %p; item ordinal %d; addr %p " - "(p %p); offset 0x%x=0x%x\n", - pp->pr_wchan, ph->ph_page, n, pi, page, - 0, pi->pi_magic); - } - if (pool_debug && ph->ph_magic) { - size_t pidx; - uint32_t pval; - if (poison_check(pi + 1, pp->pr_size - sizeof(*pi), - &pidx, &pval)) { - int *ip = (int *)(pi + 1); - printf("pool(%s): free list modified: " - "page %p; item ordinal %d; addr %p " - "(p %p); offset 0x%zx=0x%x\n", - pp->pr_wchan, ph->ph_page, n, pi, 
- page, pidx * sizeof(int), ip[pidx]); - } - } -#endif /* DIAGNOSTIC */ - page = - (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask); - if (page == ph->ph_page) - continue; - - printf("%s: ", label); - printf("pool(%p:%s): page inconsistency: page %p;" - " item ordinal %d; addr %p (p %p)\n", pp, - pp->pr_wchan, ph->ph_page, n, pi, page); - return 1; - } - if (n + ph->ph_nmissing != pp->pr_itemsperpage) { - printf("pool(%p:%s): page inconsistency: page %p;" - " %d on list, %d missing, %d items per page\n", pp, - pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing, - pp->pr_itemsperpage); - return 1; - } - if (expected >= 0 && n != expected) { - printf("pool(%p:%s): page inconsistency: page %p;" - " %d on list, %d missing, %d expected\n", pp, - pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing, - expected); - return 1; - } - return 0; -} - -int -pool_chk(struct pool *pp) -{ - struct pool_item_header *ph; - int r = 0; - - LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) - r += pool_chk_page(pp, ph, pp->pr_itemsperpage); - LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) - r += pool_chk_page(pp, ph, 0); - LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) - r += pool_chk_page(pp, ph, -1); - - return (r); -} -#endif /* defined(POOL_DEBUG) || defined(DDB) */ - #ifdef DDB void pool_walk(struct pool *pp, int full, int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))), void (*func)(void *, int, int (*)(const char *, ...) - __attribute__((__format__(__kprintf__,1,2))))) + __attribute__((__format__(__kprintf__,1,2))))) { - struct pool_item_header *ph; + struct pool_page_header *ph; struct pool_item *pi; - caddr_t cp; + caddr_t addr; int n; - LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { - cp = ph->ph_colored; + TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry) { + addr = ph->ph_addr; n = ph->ph_nmissing; while (n--) { - func(cp, full, pr); - cp += pp->pr_size; + func(addr, full, pr); + addr += pp->pr_size; } } - LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { - cp = ph->ph_colored; + TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry) { + addr = ph->ph_addr; n = ph->ph_nmissing; do { XSIMPLEQ_FOREACH(pi, &ph->ph_itemlist, pi_list) { - if (cp == (caddr_t)pi) + if (addr == (caddr_t)pi) break; } - if (cp != (caddr_t)pi) { - func(cp, full, pr); + if (addr != (caddr_t)pi) { + func(addr, full, pr); n--; } - cp += pp->pr_size; + addr += pp->pr_size; } while (n > 0); } } @@ -1443,20 +1087,19 @@ sysctl_dopool(int *name, u_int namelen, case KERN_POOL_NAME: case KERN_POOL_POOL: break; + default: - return (EINVAL); + return (EOPNOTSUPP); } if (namelen != 2) return (ENOTDIR); rw_enter_read(&pool_lock); - - SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) { + SIMPLEQ_FOREACH(pp, &pool_list, pr_poollist) { if (name[1] == pp->pr_serial) break; } - if (pp == NULL) goto done; @@ -1469,8 +1112,8 @@ sysctl_dopool(int *name, u_int namelen, mtx_enter(&pp->pr_mtx); pi.pr_size = pp->pr_size; - pi.pr_pgsize = pp->pr_alloc->pa_pagesz; - pi.pr_itemsperpage = pp->pr_itemsperpage; + pi.pr_pgsize = pp->pr_pgsize; + pi.pr_itemsperpage = pp->pr_itemsperpg; pi.pr_npages = pp->pr_npages; pi.pr_minpages = pp->pr_minpages; pi.pr_maxpages = pp->pr_maxpages; @@ -1498,46 +1141,27 @@ done: /* * Pool backend allocators. 
- * - * Each pool has a backend allocator that handles allocation, deallocation - */ -void *pool_page_alloc(struct pool *, int, int *); -void pool_page_free(struct pool *, void *); - -/* - * safe for interrupts, name preserved for compat this is the default - * allocator - */ -struct pool_allocator pool_allocator_nointr = { - pool_page_alloc, pool_page_free, 0, -}; - -/* - * XXX - we have at least three different resources for the same allocation - * and each resource can be depleted. First we have the ready elements in - * the pool. Then we have the resource (typically a vm_map) for this - * allocator, then we have physical memory. Waiting for any of these can - * be unnecessary when any other is freed, but the kernel doesn't support - * sleeping on multiple addresses, so we have to fake. The caller sleeps on - * the pool (so that we can be awakened when an item is returned to the pool), - * but we set PA_WANT on the allocator. When a page is returned to - * the allocator and PA_WANT is set pool_allocator_free will wakeup all - * sleeping pools belonging to this allocator. (XXX - thundering herd). - * We also wake up the allocator in case someone without a pool (malloc) - * is sleeping waiting for this allocator. */ void * pool_allocator_alloc(struct pool *pp, int flags, int *slowdown) { - int waitok = flags & PR_WAITOK; void *v; - if (waitok) - mtx_leave(&pp->pr_mtx); - v = pp->pr_alloc->pa_alloc(pp, flags, slowdown); - if (waitok) - mtx_enter(&pp->pr_mtx); + KERNEL_LOCK(); + v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown); + KERNEL_UNLOCK(); + +#ifdef DIAGNOSTIC + if (v != NULL && POOL_INPGHDR(pp)) { + vaddr_t addr = (vaddr_t)v; + if ((addr & pp->pr_pgmask) != addr) { + panic("pool_allocator_alloc: %s" + " page address %p isnt aligned to %u", + pp->pr_wchan, v, pp->pr_pgsize); + } + } +#endif return (v); } @@ -1545,9 +1169,9 @@ pool_allocator_alloc(struct pool *pp, in void pool_allocator_free(struct pool *pp, void *v) { - struct pool_allocator *pa = pp->pr_alloc; - - (*pa->pa_free)(pp, v); + KERNEL_LOCK(); + (*pp->pr_alloc->pa_free)(pp, v); + KERNEL_UNLOCK(); } void * @@ -1555,31 +1179,34 @@ pool_page_alloc(struct pool *pp, int fla { struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; - kd.kd_waitok = (flags & PR_WAITOK); + kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; - return (km_alloc(PAGE_SIZE, &kv_page, pp->pr_crange, &kd)); + return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd)); } void pool_page_free(struct pool *pp, void *v) { - km_free(v, PAGE_SIZE, &kv_page, pp->pr_crange); + km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange); } void * pool_large_alloc(struct pool *pp, int flags, int *slowdown) { + struct kmem_va_mode kv = kv_intrsafe; struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; void *v; int s; - kd.kd_waitok = (flags & PR_WAITOK); + if (POOL_INPGHDR(pp)) + kv.kv_align = pp->pr_pgsize; + + kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; s = splvm(); - v = km_alloc(pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange, - &kd); + v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd); splx(s); return (v); @@ -1588,26 +1215,39 @@ pool_large_alloc(struct pool *pp, int fl void pool_large_free(struct pool *pp, void *v) { + struct kmem_va_mode kv = kv_intrsafe; int s; + if (POOL_INPGHDR(pp)) + kv.kv_align = pp->pr_pgsize; + s = splvm(); - km_free(v, pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange); + km_free(v, pp->pr_pgsize, &kv, pp->pr_crange); splx(s); } void * -pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown) 
+pool_large_ni_alloc(struct pool *pp, int flags, int *slowdown) { + struct kmem_va_mode kv = kv_any; struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; - kd.kd_waitok = (flags & PR_WAITOK); + if (POOL_INPGHDR(pp)) + kv.kv_align = pp->pr_pgsize; + + kd.kd_waitok = ISSET(flags, PR_WAITOK); kd.kd_slowdown = slowdown; - return (km_alloc(pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange, &kd)); + return (km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd)); } void -pool_large_free_ni(struct pool *pp, void *v) +pool_large_ni_free(struct pool *pp, void *v) { - km_free(v, pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange); + struct kmem_va_mode kv = kv_any; + + if (POOL_INPGHDR(pp)) + kv.kv_align = pp->pr_pgsize; + + km_free(v, pp->pr_pgsize, &kv, pp->pr_crange); }
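
For context, a rough sketch of how a consumer could use the pool_request interface this diff introduces. The driver, softc and frob_* names (including the frob_rx_refill() helper and sc_buf_pool) are made up for illustration; only pool_get(9), pool_request_init(), pool_request() and the PR_NOWAIT flag come from the diff itself. The idea is that a caller which cannot sleep queues a request instead of blocking, and pool_runqueue() later calls the handler with the cookie and the item once pool_put() makes memory available again.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>

struct frob_softc {
	struct pool		sc_buf_pool;	/* hypothetical per-driver pool */
	struct pool_request	sc_buf_req;	/* storage for one async request */
};

void	frob_rx_refill(struct frob_softc *, void *);	/* assumed helper */
void	frob_buf_avail(void *, void *);
void	frob_try_refill(struct frob_softc *);

/* handler run by pool_runqueue() once an item can be handed out */
void
frob_buf_avail(void *cookie, void *item)
{
	struct frob_softc *sc = cookie;

	/* item is a pr_size byte allocation from sc_buf_pool */
	frob_rx_refill(sc, item);
}

void
frob_try_refill(struct frob_softc *sc)
{
	void *buf;

	/* fast path: try to take an item without sleeping */
	buf = pool_get(&sc->sc_buf_pool, PR_NOWAIT);
	if (buf != NULL) {
		frob_rx_refill(sc, buf);
		return;
	}

	/*
	 * slow path: queue a request instead of sleeping.  the request
	 * structure must stay valid (and must not be queued a second time)
	 * until frob_buf_avail() has been called with the item.
	 */
	pool_request_init(&sc->sc_buf_req, frob_buf_avail, sc);
	pool_request(&sc->sc_buf_pool, &sc->sc_buf_req);
}

This is the same pattern pool_get() itself falls back to when it has to wait: it builds a pool_request on the stack with pool_get_done() as the handler and msleep()s on a local pool_get_memory structure until the handler fills it in.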
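
And a small userland illustration of the free-list check the diff moves to. In the kernel, ph_magic is filled with arc4random_buf() in pool_p_alloc() and each free item stores pi_magic = (u_long)pi ^ ph_magic, so pool_do_get() and pool_p_free() can recompute the expected value and panic on a mismatch; the constant below only stands in for the random per-page value.

#include <stdio.h>

struct pool_item {
	unsigned long	pi_magic;
	/* the XSIMPLEQ free-list linkage follows in the real structure */
};

int
main(void)
{
	unsigned long ph_magic = 0x7f4a7c15UL;	/* stand-in for arc4random */
	struct pool_item storage;
	struct pool_item *pi = &storage;

	/* pool_p_alloc()/pool_put(): stamp the item as it joins the free list */
	pi->pi_magic = (unsigned long)pi ^ ph_magic;

	/* a use-after-free that scribbles on pi would change pi_magic here */

	/* pool_do_get(): recompute and compare before handing the item out */
	if (pi->pi_magic != ((unsigned long)pi ^ ph_magic))
		printf("free list modified: %p\n", (void *)pi);	/* panic in-kernel */
	else
		printf("magic ok for %p\n", (void *)pi);

	return (0);
}

Because the magic is chosen per page and XORed with the item's own address, the stored value is tied to both the page and the item, so a stray write of a constant pattern over the free-list head is caught rather than silently followed.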