From 4905f7fbaa60f75df063305c9532fb63b77deab9 Mon Sep 17 00:00:00 2001 From: George Dunlap Date: Thu, 10 Oct 2019 17:57:50 +0100 Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to begin with Validation and de-validation of pagetable trees may take arbitrarily large amounts of time, and so must be preemptible. This is indicated by setting the PGT_partial bit in the type_info, and setting nr_validated_entries and partial_flags appropriately. Specifically, if the entry at [nr_validated_entries] is partially validated, partial_flags should have the PGT_partial_set bit set, and the entry should hold a general reference count. During de-validation, put_page_type() is called on partially validated entries. Unfortunately, there are a number of issues with the current algorithm. First, doing a "normal" put_page_type() is not safe when no type ref is held: there is nothing to stop another vcpu from coming along and picking up validation again: at which point the put_page_type may drop the only page ref on an in-use page. Some examples are listed in the appendix. The core issue is that put_page_type() is being called both to clean up PGT_partial, and to drop a type count; and has no way of knowing which is which; and so if in between, PGT_partial is cleared, put_page_type() will drop the type ref erroneously. What is needed is to distinguish between two states: - Dropping a type ref which is held - Cleaning up a page which has been partially de/validated Fix this by telling put_page_type() which of the two activities you intend. When cleaning up a partial de/validation, take no action unless you find a page partially validated. If put_page_type() is called without PTF_partial_set, and finds the page in a PGT_partial state anyway, then there's certainly been a misaccounting somewhere, and carrying on would almost certainly cause a security issue, so crash the host instead. In put_page_from_lNe, pass partial_flags on to _put_page_type(). old_guest_table may be set either with a fully validated page (when using the "deferred put" pattern), or with a partially validated page (when a normal "de-validation" is interrupted, or when a validation fails part-way through due to invalid entries). Add a flag, old_guest_table_partial, to indicate which of these it is, and use that to pass the appropriate flag to _put_page_type(). While here, delete stray trailing whitespace. This is part of XSA-299. Reported-by: George Dunlap Signed-off-by: George Dunlap Reviewed-by: Jan Beulich ----- Appendix: Suppose page A, when interpreted as an l3 pagetable, contains all valid entries; and suppose A[x] points to page B, which when interpreted as an l2 pagetable, contains all valid entries. P1: PIN_L3_TABLE A -> PGT_l3_table | 1 | valid B -> PGT_l2_table | 1 | valid P1: UNPIN_TABLE > Arrange to interrupt after B has been de-validated B: type_info -> PGT_l2_table | 0 A: type_info -> PGT_l3_table | 1 | partial nr_validated_enties -> (less than x) P2: mod_l4_entry to point to A > Arrange for this to be interrupted while B is being validated B: type_info -> PGT_l2_table | 1 | partial (nr_validated_entires &c set as appropriate) A: type_info -> PGT_l3_table | 1 | partial nr_validated_entries -> x partial_pte = 1 P3: mod_l3_entry some other unrelated l3 to point to B: B: type_info -> PGT_l2_table | 1 P1: Restart UNPIN_TABLE At this point, since A.nr_validate_entries == x and A.partial_pte != 0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping its type count to 0 while it's still being pointed to by some other l3 A similar issue arises with old_guest_table. Consider the following scenario: Suppose A is a page which, when interpreted as an l2, has valid entries until entry x, which is invalid. V1: PIN_L2_TABLE(A) A -> PGT_l2_table | 1 | PGT_partial V1 -> old_guest_table = A V2: PIN_L2_TABLE(A) A -> PGT_l2_table | 1 | PGT_partial V2 -> old_guest_table = A put_old_guest_table() _put_page_type(A) A -> PGT_l2_table | 0 V1: put_old_guest_table() _put_page_type(A) # UNDERFLOW Indeed, it is possible to engineer for old_guest_table for every vcpu a guest has to point to the same page. --- xen/arch/x86/domain.c | 6 +++ xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++----- xen/include/asm-x86/domain.h | 4 +- 3 files changed, 95 insertions(+), 14 deletions(-) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 897124f05f..6074fa5947 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -1075,9 +1075,15 @@ int arch_set_info_guest( rc = -ERESTART; /* Fallthrough */ case -ERESTART: + /* + * NB that we're putting the kernel-mode table + * here, which we've already successfully + * validated above; hence partial = false; + */ v->arch.old_guest_ptpg = NULL; v->arch.old_guest_table = pagetable_get_page(v->arch.guest_table); + v->arch.old_guest_table_partial = false; v->arch.guest_table = pagetable_null(); break; default: diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 2ea32463a8..9ae71d864a 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1384,10 +1384,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn, { current->arch.old_guest_ptpg = ptpg; current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; } else { - rc = _put_page_type(pg, PTF_preemptible, ptpg); + rc = _put_page_type(pg, flags | PTF_preemptible, ptpg); if ( likely(!rc) ) put_page(pg); } @@ -1410,6 +1411,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, unsigned long mfn = l3e_get_pfn(l3e); int writeable = l3e_get_flags(l3e) & _PAGE_RW; + ASSERT(!(flags & PTF_partial_set)); ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); do { put_data_page(mfn_to_page(_mfn(mfn)), writeable); @@ -1422,12 +1424,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, if ( flags & PTF_defer ) { + ASSERT(!(flags & PTF_partial_set)); current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; return 0; } - rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn))); if ( likely(!rc) ) put_page(pg); @@ -1446,12 +1450,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, if ( flags & PTF_defer ) { + ASSERT(!(flags & PTF_partial_set)); current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn)); current->arch.old_guest_table = pg; + current->arch.old_guest_table_partial = false; return 0; } - rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn))); + rc = _put_page_type(pg, flags | PTF_preemptible, + mfn_to_page(_mfn(pfn))); if ( likely(!rc) ) put_page(pg); } @@ -1556,6 +1563,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) pl2e = map_domain_page(_mfn(pfn)); + /* + * NB that alloc_l2_table will never set partial_pte on an l2; but + * free_l2_table might if a linear_pagetable entry is interrupted + * partway through de-validation. In that circumstance, + * get_page_from_l2e() will always return -EINVAL; and we must + * retain the type ref by doing the normal partial_flags tracking. + */ + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++, partial_flags = 0 ) { @@ -1610,6 +1625,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type) page->partial_flags = partial_flags; current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } } if ( rc < 0 ) @@ -1712,12 +1728,16 @@ static int alloc_l3_table(struct page_info *page) * builds. */ if ( current->arch.old_guest_table == l3e_get_page(l3e) ) + { + ASSERT(current->arch.old_guest_table_partial); page->partial_flags = PTF_partial_set; + } else ASSERT_UNREACHABLE(); } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } while ( i-- > 0 ) pl3e[i] = unadjust_guest_l3e(pl3e[i], d); @@ -1891,12 +1911,16 @@ static int alloc_l4_table(struct page_info *page) * builds. */ if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) ) + { + ASSERT(current->arch.old_guest_table_partial); page->partial_flags = PTF_partial_set; + } else ASSERT_UNREACHABLE(); } current->arch.old_guest_ptpg = NULL; current->arch.old_guest_table = page; + current->arch.old_guest_table_partial = true; } } } @@ -2760,6 +2784,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags, x = y; nx = x - 1; + /* + * Is this expected to do a full reference drop, or only + * cleanup partial validation / devalidation? + * + * If the former, the caller must hold a "full" type ref; + * which means the page must be validated. If the page is + * *not* fully validated, continuing would almost certainly + * open up a security hole. An exception to this is during + * domain destruction, where PGT_validated can be dropped + * without dropping a type ref. + * + * If the latter, do nothing unless type PGT_partial is set. + * If it is set, the type count must be 1. + */ + if ( !(flags & PTF_partial_set) ) + BUG_ON((x & PGT_partial) || + !((x & PGT_validated) || page_get_owner(page)->is_dying)); + else if ( !(x & PGT_partial) ) + return 0; + else + BUG_ON((x & PGT_count_mask) != 1); + ASSERT((x & PGT_count_mask) != 0); if ( unlikely((nx & PGT_count_mask) == 0) ) @@ -3012,17 +3058,34 @@ int put_old_guest_table(struct vcpu *v) if ( !v->arch.old_guest_table ) return 0; - switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible, - v->arch.old_guest_ptpg) ) + rc = _put_page_type(v->arch.old_guest_table, + PTF_preemptible | + ( v->arch.old_guest_table_partial ? + PTF_partial_set : 0 ), + v->arch.old_guest_ptpg); + + if ( rc == -ERESTART || rc == -EINTR ) { - case -EINTR: - case -ERESTART: + v->arch.old_guest_table_partial = (rc == -ERESTART); return -ERESTART; - case 0: - put_page(v->arch.old_guest_table); } + /* + * It shouldn't be possible for _put_page_type() to return + * anything else at the moment; but if it does happen in + * production, leaking the type ref is probably the best thing to + * do. Either way, drop the general ref held by old_guest_table. + */ + ASSERT(rc == 0); + + put_page(v->arch.old_guest_table); v->arch.old_guest_table = NULL; + v->arch.old_guest_ptpg = NULL; + /* + * Safest default if someone sets old_guest_table without + * explicitly setting old_guest_table_partial. + */ + v->arch.old_guest_table_partial = true; return rc; } @@ -3175,11 +3238,11 @@ int new_guest_cr3(mfn_t mfn) switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: - rc = -ERESTART; - /* fallthrough */ case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = -ERESTART; break; default: BUG_ON(rc); @@ -3448,6 +3511,7 @@ long do_mmuext_op( { curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + curr->arch.old_guest_table_partial = false; } } } @@ -3482,6 +3546,11 @@ long do_mmuext_op( case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + /* + * EINTR means we still hold the type ref; ERESTART + * means PGT_partial holds the type ref + */ + curr->arch.old_guest_table_partial = (rc == -ERESTART); rc = 0; break; default: @@ -3550,11 +3619,15 @@ long do_mmuext_op( switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: - rc = -ERESTART; - /* fallthrough */ case -ERESTART: curr->arch.old_guest_ptpg = NULL; curr->arch.old_guest_table = page; + /* + * EINTR means we still hold the type ref; + * ERESTART means PGT_partial holds the ref + */ + curr->arch.old_guest_table_partial = (rc == -ERESTART); + rc = -ERESTART; break; default: BUG_ON(rc); diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index aec65630d9..5afaf6b9de 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -311,7 +311,7 @@ struct arch_domain struct paging_domain paging; struct p2m_domain *p2m; - /* To enforce lock ordering in the pod code wrt the + /* To enforce lock ordering in the pod code wrt the * page_alloc lock */ int page_alloc_unlock_level; @@ -550,6 +550,8 @@ struct arch_vcpu struct page_info *old_guest_table; /* partially destructed pagetable */ struct page_info *old_guest_ptpg; /* containing page table of the */ /* former, if any */ + bool old_guest_table_partial; /* Are we dropping a type ref, or just + * finishing up a partial de-validation? */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ -- 2.23.0