diff options
Diffstat (limited to 'arch/x86/kernel/cpu/sgx')
-rw-r--r-- | arch/x86/kernel/cpu/sgx/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/arch.h | 338 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/driver.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encl.c | 310 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encl.h | 14 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/encls.h | 64 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/ioctl.c | 43 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/main.c | 442 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/sgx.h | 48 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sgx/virt.c | 432 |
10 files changed, 1148 insertions, 561 deletions
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h deleted file mode 100644 index dd7602c44c72..000000000000 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ /dev/null @@ -1,338 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/** - * Copyright(c) 2016-20 Intel Corporation. - * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. - */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H - -#include <linux/bits.h> -#include <linux/types.h> - -/* The SGX specific CPUID function. */ -#define SGX_CPUID 0x12 -/* EPC enumeration. */ -#define SGX_CPUID_EPC 2 -/* An invalid EPC section, i.e. the end marker. */ -#define SGX_CPUID_EPC_INVALID 0x0 -/* A valid EPC section. */ -#define SGX_CPUID_EPC_SECTION 0x1 -/* The bitmask for the EPC section type. */ -#define SGX_CPUID_EPC_MASK GENMASK(3, 0) - -/** - * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not - * been completed yet. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's - * public key does not match IA32_SGXLEPUBKEYHASH. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received - */ -enum sgx_return_code { - SGX_NOT_TRACKED = 11, - SGX_INVALID_EINITTOKEN = 16, - SGX_UNMASKED_EVENT = 128, -}; - -/* The modulus size for 3072-bit RSA keys. */ -#define SGX_MODULUS_SIZE 384 - -/** - * enum sgx_miscselect - additional information to an SSA frame - * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. - * - * Save State Area (SSA) is a stack inside the enclave used to store processor - * state when an exception or interrupt occurs. This enum defines additional - * information stored to an SSA frame. - */ -enum sgx_miscselect { - SGX_MISC_EXINFO = BIT(0), -}; - -#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) - -#define SGX_SSA_GPRS_SIZE 184 -#define SGX_SSA_MISC_EXINFO_SIZE 16 - -/** - * enum sgx_attributes - the attributes field in &struct sgx_secs - * %SGX_ATTR_INIT: Enclave can be entered (is initialized). - * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). - * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. - * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote - * attestation. - * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). - * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to - * sign cryptographic tokens that can be passed to - * EINIT as an authorization to run an enclave. - */ -enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), -}; - -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) - -/** - * struct sgx_secs - SGX Enclave Control Structure (SECS) - * @size: size of the address space - * @base: base address of the address space - * @ssa_frame_size: size of an SSA frame - * @miscselect: additional information stored to an SSA frame - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @mrenclave: SHA256-hash of the enclave contents - * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT - * @config_id: a user-defined value that is used in key derivation - * @isv_prod_id: a user-defined value that is used in key derivation - * @isv_svn: a user-defined value that is used in key derivation - * @config_svn: a user-defined value that is used in key derivation - * - * SGX Enclave Control Structure (SECS) is a special enclave page that is not - * visible in the address space. In fact, this structure defines the address - * range and other global attributes for the enclave and it is the first EPC - * page created for any enclave. It is moved from a temporary buffer to an EPC - * by the means of ENCLS[ECREATE] function. - */ -struct sgx_secs { - u64 size; - u64 base; - u32 ssa_frame_size; - u32 miscselect; - u8 reserved1[24]; - u64 attributes; - u64 xfrm; - u32 mrenclave[8]; - u8 reserved2[32]; - u32 mrsigner[8]; - u8 reserved3[32]; - u32 config_id[16]; - u16 isv_prod_id; - u16 isv_svn; - u16 config_svn; - u8 reserved4[3834]; -} __packed; - -/** - * enum sgx_tcs_flags - execution flags for TCS - * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints - * inside an enclave. It is cleared by EADD but can - * be set later with EDBGWR. - */ -enum sgx_tcs_flags { - SGX_TCS_DBGOPTIN = 0x01, -}; - -#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) -#define SGX_TCS_RESERVED_SIZE 4024 - -/** - * struct sgx_tcs - Thread Control Structure (TCS) - * @state: used to mark an entered TCS - * @flags: execution flags (cleared by EADD) - * @ssa_offset: SSA stack offset relative to the enclave base - * @ssa_index: the current SSA frame index (cleard by EADD) - * @nr_ssa_frames: the number of frame in the SSA stack - * @entry_offset: entry point offset relative to the enclave base - * @exit_addr: address outside the enclave to exit on an exception or - * interrupt - * @fs_offset: offset relative to the enclave base to become FS - * segment inside the enclave - * @gs_offset: offset relative to the enclave base to become GS - * segment inside the enclave - * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) - * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) - * - * Thread Control Structure (TCS) is an enclave page visible in its address - * space that defines an entry point inside the enclave. A thread enters inside - * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered - * by only one thread at a time. - */ -struct sgx_tcs { - u64 state; - u64 flags; - u64 ssa_offset; - u32 ssa_index; - u32 nr_ssa_frames; - u64 entry_offset; - u64 exit_addr; - u64 fs_offset; - u64 gs_offset; - u32 fs_limit; - u32 gs_limit; - u8 reserved[SGX_TCS_RESERVED_SIZE]; -} __packed; - -/** - * struct sgx_pageinfo - an enclave page descriptor - * @addr: address of the enclave page - * @contents: pointer to the page contents - * @metadata: pointer either to a SECINFO or PCMD instance - * @secs: address of the SECS page - */ -struct sgx_pageinfo { - u64 addr; - u64 contents; - u64 metadata; - u64 secs; -} __packed __aligned(32); - - -/** - * enum sgx_page_type - bits in the SECINFO flags defining the page type - * %SGX_PAGE_TYPE_SECS: a SECS page - * %SGX_PAGE_TYPE_TCS: a TCS page - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state - */ -enum sgx_page_type { - SGX_PAGE_TYPE_SECS, - SGX_PAGE_TYPE_TCS, - SGX_PAGE_TYPE_REG, - SGX_PAGE_TYPE_VA, - SGX_PAGE_TYPE_TRIM, -}; - -#define SGX_NR_PAGE_TYPES 5 -#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) - -/** - * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo - * %SGX_SECINFO_R: allow read - * %SGX_SECINFO_W: allow write - * %SGX_SECINFO_X: allow execution - * %SGX_SECINFO_SECS: a SECS page - * %SGX_SECINFO_TCS: a TCS page - * %SGX_SECINFO_REG: a regular page - * %SGX_SECINFO_VA: a VA page - * %SGX_SECINFO_TRIM: a page in trimmed state - */ -enum sgx_secinfo_flags { - SGX_SECINFO_R = BIT(0), - SGX_SECINFO_W = BIT(1), - SGX_SECINFO_X = BIT(2), - SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), - SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), - SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), - SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), - SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), -}; - -#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) -#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) -#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ - SGX_SECINFO_PAGE_TYPE_MASK) - -/** - * struct sgx_secinfo - describes attributes of an EPC page - * @flags: permissions and type - * - * Used together with ENCLS leaves that add or modify an EPC page to an - * enclave to define page permissions and type. - */ -struct sgx_secinfo { - u64 flags; - u8 reserved[56]; -} __packed __aligned(64); - -#define SGX_PCMD_RESERVED_SIZE 40 - -/** - * struct sgx_pcmd - Paging Crypto Metadata (PCMD) - * @enclave_id: enclave identifier - * @mac: MAC over PCMD, page contents and isvsvn - * - * PCMD is stored for every swapped page to the regular memory. When ELDU loads - * the page back it recalculates the MAC by using a isvsvn number stored in a - * VA page. Together these two structures bring integrity and rollback - * protection. - */ -struct sgx_pcmd { - struct sgx_secinfo secinfo; - u64 enclave_id; - u8 reserved[SGX_PCMD_RESERVED_SIZE]; - u8 mac[16]; -} __packed __aligned(128); - -#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 -#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 -#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 -#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 - -/** - * struct sgx_sigstruct_header - defines author of the enclave - * @header1: constant byte string - * @vendor: must be either 0x0000 or 0x8086 - * @date: YYYYMMDD in BCD - * @header2: costant byte string - * @swdefined: software defined value - */ -struct sgx_sigstruct_header { - u64 header1[2]; - u32 vendor; - u32 date; - u64 header2[2]; - u32 swdefined; - u8 reserved1[84]; -} __packed; - -/** - * struct sgx_sigstruct_body - defines contents of the enclave - * @miscselect: additional information stored to an SSA frame - * @misc_mask: required miscselect in SECS - * @attributes: attributes for enclave - * @xfrm: XSave-Feature Request Mask (subset of XCR0) - * @attributes_mask: required attributes in SECS - * @xfrm_mask: required XFRM in SECS - * @mrenclave: SHA256-hash of the enclave contents - * @isvprodid: a user-defined value that is used in key derivation - * @isvsvn: a user-defined value that is used in key derivation - */ -struct sgx_sigstruct_body { - u32 miscselect; - u32 misc_mask; - u8 reserved2[20]; - u64 attributes; - u64 xfrm; - u64 attributes_mask; - u64 xfrm_mask; - u8 mrenclave[32]; - u8 reserved3[32]; - u16 isvprodid; - u16 isvsvn; -} __packed; - -/** - * struct sgx_sigstruct - an enclave signature - * @header: defines author of the enclave - * @modulus: the modulus of the public key - * @exponent: the exponent of the public key - * @signature: the signature calculated over the fields except modulus, - * @body: defines contents of the enclave - * @q1: a value used in RSA signature verification - * @q2: a value used in RSA signature verification - * - * Header and body are the parts that are actual signed. The remaining fields - * define the signature of the enclave. - */ -struct sgx_sigstruct { - struct sgx_sigstruct_header header; - u8 modulus[SGX_MODULUS_SIZE]; - u32 exponent; - u8 signature[SGX_MODULUS_SIZE]; - struct sgx_sigstruct_body body; - u8 reserved4[12]; - u8 q1[SGX_MODULUS_SIZE]; - u8 q2[SGX_MODULUS_SIZE]; -} __packed; - -#define SGX_LAUNCH_TOKEN_SIZE 304 - -#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7449ef33f081..19876ebfb504 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,11 +7,121 @@ #include <linux/shmem_fs.h> #include <linux/suspend.h> #include <linux/sched/mm.h> -#include "arch.h" +#include <asm/sgx.h> #include "encl.h" #include "encls.h" #include "sgx.h" +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + /* * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC * Pages" in the SDM. @@ -22,9 +132,12 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, { unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; - pgoff_t page_index; + bool pcmd_page_empty; + u8 *pcmd_page; int ret; if (secs_page) @@ -32,14 +145,21 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); - ret = sgx_encl_get_backing(encl, page_index, &b); + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + + ret = sgx_encl_lookup_backing(encl, page_index, &b); if (ret) return ret; pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + - b.pcmd_offset; + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); @@ -55,10 +175,32 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = -EFAULT; } - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); + + /* + * The area for the PCMD in the page was zeroed above. Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + get_page(b.pcmd); + sgx_encl_put_backing(&b); + + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); return ret; } @@ -78,7 +220,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -383,7 +525,7 @@ const struct vm_operations_struct sgx_vm_ops = { /** * sgx_encl_release - Destroy an enclave instance - * @kref: address of a kref inside &sgx_encl + * @ref: address of a kref inside &sgx_encl * * Used together with kref_put(). Frees all the resources associated with the * enclave and the instance itself. @@ -404,18 +546,20 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); } xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +567,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -574,10 +718,10 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); struct page *contents; struct page *pcmd; @@ -585,7 +729,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, if (IS_ERR(contents)) return PTR_ERR(contents); - pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); if (IS_ERR(pcmd)) { put_page(contents); return PTR_ERR(pcmd); @@ -594,25 +738,118 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; - backing->pcmd_offset = - (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * - sizeof(struct sgx_pcmd); + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); return 0; } +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. + */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + /** - * sgx_encl_put_backing() - Unpin the backing storage + * sgx_encl_alloc_backing() - allocate a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. + * + * Return: + * 0 on success, + * -errno otherwise. */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return sgx_encl_get_backing(encl, page_index, backing); +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + */ +void sgx_encl_put_backing(struct sgx_backing *backing) +{ put_page(backing->pcmd); put_page(backing->contents); } @@ -686,7 +923,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +972,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..332ef3568267 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; @@ -103,11 +103,14 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags); +bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); @@ -115,5 +118,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..fa04a73daf9c 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,41 +11,8 @@ #include <asm/traps.h> #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - -/** - * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr - * - * ENCLS has its own (positive value) error codes and also generates - * ENCLS specific #GP and #PF faults. And the ENCLS values get munged - * with system error codes as everything percolates back up the stack. - * Unfortunately (for us), we need to precisely identify each unique - * error code, e.g. the action taken if EWB fails varies based on the - * type of fault and on the exact SGX error code, i.e. we can't simply - * convert all faults to -EFAULT. - * - * To make all three error types coexist, we set bit 30 to identify an - * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate - * between positive (faults and SGX error codes) and negative (system - * error codes) values. - */ -#define ENCLS_FAULT_FLAG 0x40000000 - /* Retrieve the encoded trapnr from the specified return code. */ -#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) +#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG) /* Issue a WARN() about an ENCLS function. */ #define ENCLS_WARN(r, name) { \ @@ -55,6 +22,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & SGX_ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +45,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; @@ -90,11 +70,7 @@ static inline bool encls_failed(int ret) asm volatile( \ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret) \ : "a"(rax), inputs \ : "memory", "cc"); \ @@ -129,7 +105,7 @@ static inline bool encls_failed(int ret) * * Return: * 0 on success, - * trapnr with ENCLS_FAULT_FLAG set on fault + * trapnr with SGX_ENCLS_FAULT_FLAG set on fault */ #define __encls_N(rax, rbx_out, inputs...) \ ({ \ @@ -138,11 +114,7 @@ static inline bool encls_failed(int ret) "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ " xor %%eax,%%eax;\n" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret), "=b"(rbx_out) \ : "a"(rax), inputs \ : "memory"); \ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 90a5caf76939..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include <asm/mman.h> +#include <asm/sgx.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ err_out_unlock: mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } @@ -665,24 +667,11 @@ out: static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8df81a3ed945..a78652d43e61 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/miscdevice.h> +#include <linux/node.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <linux/sysfs.h> +#include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -17,48 +22,82 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + +/* + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. + */ +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + continue; + } ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -152,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -195,10 +236,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) /* * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread - * has reset the count for threads inside the enlave by using ETRACK, and + * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed the succeed. @@ -269,19 +310,20 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { - ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), &secs_backing); if (ret) goto out; sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + sgx_encl_put_backing(&secs_backing); } out: @@ -305,7 +347,6 @@ static void sgx_reclaim_pages(void) { struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; struct sgx_backing backing[SGX_NR_TO_SCAN]; - struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; pgoff_t page_index; @@ -341,11 +382,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); - ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) - goto skip; mutex_lock(&encl_page->encl->lock); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); + if (ret) { + mutex_unlock(&encl_page->encl->lock); + goto skip; + } + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -373,56 +417,33 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; - section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + sgx_free_epc_page(epc_page); } } -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; - - return cnt; -} - static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && + return atomic_long_read(&sgx_nr_free_pages) < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); - - /* Should never happen. */ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -454,45 +475,62 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +bool current_is_ksgxd(void) { - struct sgx_epc_page *page; + return current == ksgxd_tsk; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) +{ + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + page->flags = 0; + + spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. */ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } - page = __sgx_alloc_epc_page_from_section(section); + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; + + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -598,23 +636,27 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; - int ret; + struct sgx_numa_node *node = section->node; - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + spin_lock(&node->lock); - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -635,21 +677,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + section->pages[i].poison = 0; + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the @@ -661,12 +784,59 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) ((high & GENMASK_ULL(19, 0)) << 32); } +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -689,6 +859,27 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); + node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; + sgx_nr_epc_sections++; } @@ -700,6 +891,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -716,12 +968,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..0f17def9fe6f 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,11 +8,15 @@ #include <linux/rwsem.h> #include <linux/types.h> #include <asm/asm.h> -#include "arch.h" +#include <asm/sgx.h> #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -22,36 +26,39 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; /* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; + spinlock_t lock; +}; + +/* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. */ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; @@ -83,4 +90,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6a77a14eee38 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) +{ + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + return 0; +} + +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + xa_destroy(&vepc->page_array); + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). + */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); |