Date: Mon, 9 Feb 2015 07:04:02 +0100 (CET)
Message-Id: <201502090604.t19642cY007529@glazunov.sibelius.xs4all.nl>
From: Mark Kettenis
To: hackers@openbsd.org
Subject: Unlocking the reaper

The diff below makes the reaper code drop the kernel lock during map
teardown.  In order to do so, it also makes the amd64 pmap (largely)
mpsafe.  The intention is for pmap_remove() to be completely safe;
there might still be some issues with pmap_enter().

This is probably not going to be committed soon.  For one thing, I'll
need to make other pmaps safe as well.  But I'd be interested in some
benchmarks, as this would remove one of the major sources of kernel
lock contention.
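The subtlest part of the diff is the pmap_page_remove() loop: the new
per-page pv_mtx cannot be held across the PTE teardown, so each
iteration pops an entry under the mutex, takes a reference on its pmap
so it cannot be destroyed underneath us, drops the mutex for the heavy
work, and relocks for the next entry.  The following is a minimal
userland sketch of that pattern, with pthreads and GCC/clang __atomic
builtins standing in for the kernel's mtx_enter()/mtx_leave() and
atomic_*_int, and the structures reduced to just the fields the
pattern needs; it is an illustration, not kernel code.

/*
 * Sketch of the pmap_page_remove() locking pattern: pop a pv entry
 * under the per-page mutex, pin its pmap with a reference, drop the
 * mutex while doing the heavy work, then relock for the next entry.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pmap {
        int pm_refs;                    /* freed when this drops to zero */
};

struct pv_entry {
        struct pv_entry *pv_next;
        struct pmap *pv_pmap;
};

struct vm_page_md {
        pthread_mutex_t pv_mtx;         /* protects pv_list */
        struct pv_entry *pv_list;
};

static void
pmap_reference(struct pmap *pmap)
{
        __atomic_add_fetch(&pmap->pm_refs, 1, __ATOMIC_SEQ_CST);
}

static void
pmap_destroy(struct pmap *pmap)
{
        if (__atomic_sub_fetch(&pmap->pm_refs, 1, __ATOMIC_SEQ_CST) == 0)
                free(pmap);
}

static void
pmap_page_remove(struct vm_page_md *md)
{
        struct pv_entry *pve;

        pthread_mutex_lock(&md->pv_mtx);
        while ((pve = md->pv_list) != NULL) {
                md->pv_list = pve->pv_next;
                pmap_reference(pve->pv_pmap);   /* pin the pmap */
                pthread_mutex_unlock(&md->pv_mtx);

                /*
                 * The PTE removal and TLB shootdown happen here with
                 * the pv mutex dropped, so other operations on the
                 * same page can make progress.
                 */
                printf("unmapping page from pmap %p\n", (void *)pve->pv_pmap);

                pmap_destroy(pve->pv_pmap);     /* drop our reference */
                free(pve);
                pthread_mutex_lock(&md->pv_mtx);
        }
        pthread_mutex_unlock(&md->pv_mtx);
}

int
main(void)
{
        struct vm_page_md md = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct pmap *pmap = calloc(1, sizeof(*pmap));
        struct pv_entry *pve = calloc(1, sizeof(*pve));

        pmap->pm_refs = 1;              /* creator's reference */
        pve->pv_pmap = pmap;
        md.pv_list = pve;

        pmap_page_remove(&md);
        pmap_destroy(pmap);             /* creator drops its reference */
        return (0);
}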
Index: arch/amd64/include/pmap.h
===================================================================
RCS file: /home/cvs/src/sys/arch/amd64/include/pmap.h,v
retrieving revision 1.52
diff -u -p -r1.52 pmap.h
--- arch/amd64/include/pmap.h	7 Feb 2015 01:46:27 -0000	1.52
+++ arch/amd64/include/pmap.h	9 Feb 2015 03:19:59 -0000
@@ -74,6 +74,7 @@
 #include <machine/cpufunc.h>
 #include <machine/segments.h>
 #endif /* _KERNEL */
+#include <sys/mutex.h>
 #include <uvm/uvm_object.h>
 #include <machine/pte.h>
 #endif
@@ -284,15 +285,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pm
  *
  * note that the pm_obj contains the reference count,
  * page list, and number of PTPs within the pmap.
- *
- * pm_lock is the same as the spinlock for vm object 0.  Changes to
- * the other objects may only be made if that lock has been taken
- * (the other object locks are only used when uvm_pagealloc is called)
  */

 struct pmap {
+	struct mutex pm_mtx;
 	struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
-#define	pm_lock		pm_obj[0].vmobjlock
 #define pm_obj_l1	pm_obj[0]
 #define pm_obj_l2	pm_obj[1]
 #define pm_obj_l3	pm_obj[2]
@@ -531,10 +528,12 @@ kvtopte(vaddr_t va)
 #ifndef _LOCORE
 struct pv_entry;
 struct vm_page_md {
+	struct mutex pv_mtx;
 	struct pv_entry *pv_list;
 };

 #define VM_MDPAGE_INIT(pg) do {		\
+	mtx_init(&(pg)->mdpage.pv_mtx, IPL_VM);	\
 	(pg)->mdpage.pv_list = NULL;	\
 } while (0)
 #endif	/* !_LOCORE */
Index: arch/amd64/amd64/pmap.c
===================================================================
RCS file: /home/cvs/src/sys/arch/amd64/amd64/pmap.c,v
retrieving revision 1.88
diff -u -p -r1.88 pmap.c
--- arch/amd64/amd64/pmap.c	7 Feb 2015 01:46:27 -0000	1.88
+++ arch/amd64/amd64/pmap.c	9 Feb 2015 03:19:59 -0000
@@ -385,6 +385,9 @@ pmap_map_ptes(struct pmap *pmap, pt_entr
 		lcr3(pmap->pm_pdirpa);
 	}

+	if (pmap != pmap_kernel())
+		mtx_enter(&pmap->pm_mtx);
+
 	*ptepp = PTE_BASE;
 	*pdeppp = normal_pdes;
 	return;
@@ -393,6 +396,9 @@ pmap_map_ptes(struct pmap *pmap, pt_entr
 void
 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
 {
+	if (pmap != pmap_kernel())
+		mtx_leave(&pmap->pm_mtx);
+
 	if (save_cr3 != 0) {
 		x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
 		lcr3(save_cr3);
@@ -680,6 +686,7 @@ pmap_bootstrap(paddr_t first_avail, padd
 	pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, 0, 0, "pvpl",
 	    &pool_allocator_nointr);
 	pool_sethiwat(&pmap_pv_pool, 32 * 1024);
+	pool_setipl(&pmap_pv_pool, IPL_VM);

 	/*
 	 * initialize the PDE pool.
@@ -764,8 +771,10 @@ pmap_enter_pv(struct vm_page *pg, struct
 	pve->pv_pmap = pmap;
 	pve->pv_va = va;
 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
+	mtx_enter(&pg->mdpage.pv_mtx);
 	pve->pv_next = pg->mdpage.pv_list;	/* add to ... */
 	pg->mdpage.pv_list = pve;		/* ... list */
+	mtx_leave(&pg->mdpage.pv_mtx);
 }

 /*
@@ -780,6 +789,7 @@ pmap_remove_pv(struct vm_page *pg, struc
 {
 	struct pv_entry *pve, **prevptr;

+	mtx_enter(&pg->mdpage.pv_mtx);
 	prevptr = &pg->mdpage.pv_list;
 	while ((pve = *prevptr) != NULL) {
 		if (pve->pv_pmap == pmap && pve->pv_va == va) {	/* match? */
@@ -788,6 +798,7 @@ pmap_remove_pv(struct vm_page *pg, struc
 		}
 		prevptr = &pve->pv_next;		/* previous pointer */
 	}
+	mtx_leave(&pg->mdpage.pv_mtx);
 	return(pve);				/* return removed pve */
 }

@@ -1007,6 +1018,8 @@ pmap_create(void)

 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);

+	mtx_init(&pmap->pm_mtx, IPL_VM);
+
 	/* init uvm_object */
 	for (i = 0; i < PTP_LEVELS - 1; i++) {
 		uvm_objinit(&pmap->pm_obj[i], NULL, 1);
@@ -1049,7 +1062,7 @@ pmap_destroy(struct pmap *pmap)
 	 * drop reference count
 	 */

-	refs = --pmap->pm_obj[0].uo_refs;
+	refs = atomic_sub_int_nv(&pmap->pm_obj[0].uo_refs, 1);
 	if (refs > 0) {
 		return;
 	}
@@ -1095,7 +1108,7 @@ pmap_destroy(struct pmap *pmap)
 void
 pmap_reference(struct pmap *pmap)
 {
-	pmap->pm_obj[0].uo_refs++;
+	atomic_inc_int(&pmap->pm_obj[0].uo_refs);
 }

 /*
@@ -1422,7 +1435,6 @@ pmap_do_remove(struct pmap *pmap, vaddr_
 	pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 	shootself = (scr3 == 0);

-
 	/*
 	 * removing one page? take shortcut function.
 	 */
@@ -1563,8 +1575,11 @@ pmap_page_remove(struct vm_page *pg)

 	TAILQ_INIT(&empty_ptps);

+	mtx_enter(&pg->mdpage.pv_mtx);
 	while ((pve = pg->mdpage.pv_list) != NULL) {
 		pg->mdpage.pv_list = pve->pv_next;
+		pmap_reference(pve->pv_pmap);
+		mtx_leave(&pg->mdpage.pv_mtx);

 		/* XXX use direct map? */
 		pmap_map_ptes(pve->pv_pmap, &ptes, &pdes, &scr3);
@@ -1604,8 +1619,11 @@ pmap_page_remove(struct vm_page *pg)
 			}
 		}
 		pmap_unmap_ptes(pve->pv_pmap, scr3);
+		pmap_destroy(pve->pv_pmap);
 		pool_put(&pmap_pv_pool, pve);
+		mtx_enter(&pg->mdpage.pv_mtx);
 	}
+	mtx_leave(&pg->mdpage.pv_mtx);

 	pmap_tlb_shootwait();

@@ -1640,12 +1658,14 @@ pmap_test_attrs(struct vm_page *pg, unsi
 		return (TRUE);

 	mybits = 0;
+	mtx_enter(&pg->mdpage.pv_mtx);
 	for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
 	    pve = pve->pv_next) {
 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
 		    &offs);
 		mybits |= (ptes[offs] & testbits);
 	}
+	mtx_leave(&pg->mdpage.pv_mtx);

 	if (mybits == 0)
 		return (FALSE);
@@ -1675,6 +1695,7 @@ pmap_clear_attrs(struct vm_page *pg, uns
 	if (result)
 		atomic_clearbits_int(&pg->pg_flags, clearflags);

+	mtx_enter(&pg->mdpage.pv_mtx);
 	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
 		level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes,
 		    &offs);
@@ -1686,6 +1707,7 @@ pmap_clear_attrs(struct vm_page *pg, uns
 			    pmap_is_curpmap(pve->pv_pmap));
 		}
 	}
+	mtx_leave(&pg->mdpage.pv_mtx);

 	pmap_tlb_shootwait();

@@ -1785,7 +1807,6 @@ pmap_write_protect(struct pmap *pmap, va
 	pmap_tlb_shootrange(pmap, sva, eva, shootself);

 	pmap_unmap_ptes(pmap, scr3);
-
 	pmap_tlb_shootwait();
 }

@@ -1874,7 +1895,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va
 	pt_entry_t *ptes, opte, npte;
 	pd_entry_t **pdes;
 	struct vm_page *ptp, *pg = NULL;
-	struct pv_entry *pve = NULL;
+	struct pv_entry *pve, *opve = NULL;
 	int ptpdelta, wireddelta, resdelta;
 	boolean_t wired = (flags & PMAP_WIRED) != 0;
 	boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
@@ -1896,6 +1917,15 @@ pmap_enter(struct pmap *pmap, vaddr_t va

 #endif

+	pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
+	if (pve == NULL) {
+		if (flags & PMAP_CANFAIL) {
+			error = ENOMEM;
+			goto out;
+		}
+		panic("%s: no pv entries available", __func__);
+	}
+
 	/*
 	 * map in ptes and get a pointer to our PTP (unless we are the kernel)
 	 */
@@ -1908,6 +1938,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va
 		ptp = pmap_get_ptp(pmap, va, pdes);
 		if (ptp == NULL) {
 			if (flags & PMAP_CANFAIL) {
+				pmap_unmap_ptes(pmap, scr3);
 				error = ENOMEM;
 				goto out;
 			}
@@ -1983,11 +2014,10 @@ pmap_enter(struct pmap *pmap, vaddr_t va
 				    __func__, pa, atop(pa));
 #endif
 			pmap_sync_flags_pte(pg, opte);
-			pve = pmap_remove_pv(pg, pmap, va);
+			opve = pmap_remove_pv(pg, pmap, va);
 			pg = NULL; /* This is not the page we are looking for */
 		}
 	} else {	/* opte not valid */
-		pve = NULL;
 		resdelta = 1;
 		if (wired)
 			wireddelta = 1;
@@ -2010,21 +2040,8 @@ pmap_enter(struct pmap *pmap, vaddr_t va
 	pg = PHYS_TO_VM_PAGE(pa);

 	if (pg != NULL) {
-		if (pve == NULL) {
-			pve = pool_get(&pmap_pv_pool, PR_NOWAIT);
-			if (pve == NULL) {
-				if (flags & PMAP_CANFAIL) {
-					error = ENOMEM;
-					goto out;
-				}
-				panic("%s: no pv entries available", __func__);
-			}
-		}
 		pmap_enter_pv(pg, pve, pmap, va, ptp);
-	} else {
-		/* new mapping is not PG_PVLIST.  free pve if we've got one */
-		if (pve)
-			pool_put(&pmap_pv_pool, pve);
+		pve = NULL;
 	}

 enter_now:
@@ -2074,13 +2091,18 @@ enter_now:
 		if (nocache && (opte & PG_N) == 0)
 			wbinvd();
 		pmap_tlb_shootpage(pmap, va, shootself);
-		pmap_tlb_shootwait();
 	}

+	pmap_unmap_ptes(pmap, scr3);
+	pmap_tlb_shootwait();
+
 	error = 0;

out:
-	pmap_unmap_ptes(pmap, scr3);
+	if (pve)
+		pool_put(&pmap_pv_pool, pve);
+	if (opve)
+		pool_put(&pmap_pv_pool, opve);
 	return error;
 }
Index: uvm/uvm_addr.c
===================================================================
RCS file: /home/cvs/src/sys/uvm/uvm_addr.c,v
retrieving revision 1.11
diff -u -p -r1.11 uvm_addr.c
--- uvm/uvm_addr.c	23 Dec 2014 02:01:57 -0000	1.11
+++ uvm/uvm_addr.c	9 Feb 2015 03:43:13 -0000
@@ -281,14 +281,19 @@ uvm_addr_init(void)
 {
 	pool_init(&uaddr_pool, sizeof(struct uvm_addr_state),
 	    0, 0, PR_WAITOK, "uaddr", NULL);
+	pool_setipl(&uaddr_pool, IPL_VM);
 	pool_init(&uaddr_hint_pool, sizeof(struct uaddr_hint_state),
 	    0, 0, PR_WAITOK, "uaddrhint", NULL);
+	pool_setipl(&uaddr_hint_pool, IPL_VM);
 	pool_init(&uaddr_bestfit_pool, sizeof(struct uaddr_bestfit_state),
 	    0, 0, PR_WAITOK, "uaddrbest", NULL);
+	pool_setipl(&uaddr_bestfit_pool, IPL_VM);
 	pool_init(&uaddr_pivot_pool, sizeof(struct uaddr_pivot_state),
 	    0, 0, PR_WAITOK, "uaddrpivot", NULL);
+	pool_setipl(&uaddr_pivot_pool, IPL_VM);
 	pool_init(&uaddr_rnd_pool, sizeof(struct uaddr_rnd_state),
 	    0, 0, PR_WAITOK, "uaddrrnd", NULL);
+	pool_setipl(&uaddr_rnd_pool, IPL_VM);

 	uaddr_kbootstrap.uaddr_minaddr = PAGE_SIZE;
 	uaddr_kbootstrap.uaddr_maxaddr = -(vaddr_t)PAGE_SIZE;
Index: uvm/uvm_glue.c
===================================================================
RCS file: /home/cvs/src/sys/uvm/uvm_glue.c,v
retrieving revision 1.69
diff -u -p -r1.69 uvm_glue.c
--- uvm/uvm_glue.c	15 Dec 2014 20:38:22 -0000	1.69
+++ uvm/uvm_glue.c	9 Feb 2015 03:43:22 -0000
@@ -464,11 +464,15 @@ uvm_atopg(vaddr_t kva)
 	return (pg);
 }

+#ifndef MULTIPROCESSOR
+#define _kernel_lock_held()	1
+#endif
+
 void
 uvm_pause(void)
 {
 	static unsigned int toggle;
-	if (toggle++ > 128) {
+	if (toggle++ > 128 && _kernel_lock_held()) {
 		toggle = 0;
 		KERNEL_UNLOCK();
 		KERNEL_LOCK();
Index: uvm/uvm_map.c
===================================================================
RCS file: /home/cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.184
diff -u -p -r1.184 uvm_map.c
--- uvm/uvm_map.c	6 Feb 2015 11:41:55 -0000	1.184
+++ uvm/uvm_map.c	9 Feb 2015 03:43:28 -0000
@@ -1844,8 +1844,10 @@ uvm_unmap_kill_entry(struct vm_map *map,
 {
 	/* Unwire removed map entry. */
 	if (VM_MAPENT_ISWIRED(entry)) {
+		KERNEL_LOCK();
 		entry->wired_count = 0;
 		uvm_fault_unwire_locked(map, entry->start, entry->end);
+		KERNEL_UNLOCK();
 	}

 	/* Entry-type specific code. */
@@ -2424,18 +2426,20 @@ void
 uvm_map_teardown(struct vm_map *map)
 {
 	struct uvm_map_deadq	 dead_entries;
-	int			 i, waitok = 0;
 	struct vm_map_entry	*entry, *tmp;
 #ifdef VMMAP_DEBUG
 	size_t			 numq, numt;
 #endif
+	int			 i;

-	if ((map->flags & VM_MAP_INTRSAFE) == 0)
-		waitok = 1;
-	if (waitok) {
-		if (rw_enter(&map->lock, RW_NOSLEEP | RW_WRITE) != 0)
-			panic("uvm_map_teardown: rw_enter failed on free map");
-	}
+	KERNEL_ASSERT_LOCKED();
+	KERNEL_UNLOCK();
+	KERNEL_ASSERT_UNLOCKED();
+
+	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
+
+	if (rw_enter(&map->lock, RW_NOSLEEP | RW_WRITE) != 0)
+		panic("uvm_map_teardown: rw_enter failed on free map");

 	/* Remove address selectors. */
 	uvm_addr_destroy(map->uaddr_exe);
@@ -2468,8 +2472,7 @@ uvm_map_teardown(struct vm_map *map)
 	if ((entry = RB_ROOT(&map->addr)) != NULL)
 		DEAD_ENTRY_PUSH(&dead_entries, entry);
 	while (entry != NULL) {
-		if (waitok)
-			uvm_pause();
+		uvm_pause();
 		uvm_unmap_kill_entry(map, entry);
 		if ((tmp = RB_LEFT(entry, daddrs.addr_entry)) != NULL)
 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
@@ -2479,8 +2482,7 @@ uvm_map_teardown(struct vm_map *map)
 		entry = TAILQ_NEXT(entry, dfree.deadq);
 	}

-	if (waitok)
-		rw_exit(&map->lock);
+	rw_exit(&map->lock);

 #ifdef VMMAP_DEBUG
 	numt = numq = 0;
@@ -2490,7 +2492,10 @@ uvm_map_teardown(struct vm_map *map)
 		numq++;
 	KASSERT(numt == numq);
 #endif
-	uvm_unmap_detach(&dead_entries, waitok ? UVM_PLA_WAITOK : 0);
+	uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
+
+	KERNEL_LOCK();
+	pmap_destroy(map->pmap);
+	map->pmap = NULL;
 }
@@ -3187,6 +3192,8 @@ void
 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
     boolean_t pageable, boolean_t remove_holes)
 {
+	KASSERT(pmap == NULL || pmap == pmap_kernel());
+
 	if (pmap)
 		pmap_reference(pmap);
 	else