From 4406671f9ec830ed2d41bffe28b801daab0bff3b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 2 Nov 2017 16:16:07 +0100 Subject: UPSTREAM: netfilter: conntrack: use power efficient workqueue conntrack uses the bounded system_long_wq workqueue for its works that don't have to run on the cpu they have been queued. Using bounded workqueue prevents the scheduler to make smart decision about the best place to schedule the work. This patch replaces system_long_wq with system_power_efficient_wq. the work stays bounded to a cpu by default unless the CONFIG_WQ_POWER_EFFICIENT is enable. In the latter case, the work can be scheduled on the best cpu from a power or a performance point of view. Change-Id: I12f862e7476b8e77c50e989ce16bd49b52063c1a (cherry picked from commit 0984d427c1d3cb2aa882c9ad9e787ba973cf2915) Signed-off-by: Vincent Guittot Signed-off-by: Pablo Neira Ayuso Signed-off-by: Viresh Kumar --- net/netfilter/nf_conntrack_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 750b8bf13e60..8da893c397da 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1023,7 +1023,7 @@ static void gc_worker(struct work_struct *work) next_run = gc_work->next_gc_run; gc_work->last_bucket = i; - queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); + queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) @@ -1922,7 +1922,7 @@ int nf_conntrack_init_start(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); conntrack_gc_work_init(&conntrack_gc_work); - queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); + queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); return 0; -- cgit v1.2.3 From 11537d5ac91a51571027818b98a99f43caaacb1b Mon Sep 17 00:00:00 2001 From: Hyojun Kim Date: Thu, 21 Dec 2017 09:57:41 -0800 Subject: blkdev: Refactoring block io latency histogram codes The current io_latency_state structure includes entries for read and write requests. There are special types of write commands such as sync and discard (trim) commands, and the current implementation is not general enough if we want to separate latency histogram for such special commands. This change makes io_latency_state structure request-type neutral. It also changes to print the latency average. Signed-off-by: Hyojun Kim --- block/blk-core.c | 93 +++++++++++++++-------------------------------- drivers/mmc/core/core.c | 22 +++++++---- drivers/scsi/ufs/ufshcd.c | 23 ++++++++---- drivers/scsi/ufs/ufshcd.h | 3 +- include/linux/blkdev.h | 39 ++++++-------------- include/linux/mmc/host.h | 3 +- 6 files changed, 75 insertions(+), 108 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4c090a0d8812..b2e57642dbb9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3583,76 +3583,43 @@ int __init blk_dev_init(void) * TODO : If necessary, we can make the histograms per-cpu and aggregate * them when printing them out. */ -void -blk_zero_latency_hist(struct io_latency_state *s) -{ - memset(s->latency_y_axis_read, 0, - sizeof(s->latency_y_axis_read)); - memset(s->latency_y_axis_write, 0, - sizeof(s->latency_y_axis_write)); - s->latency_reads_elems = 0; - s->latency_writes_elems = 0; -} -EXPORT_SYMBOL(blk_zero_latency_hist); - ssize_t -blk_latency_hist_show(struct io_latency_state *s, char *buf) +blk_latency_hist_show(char* name, struct io_latency_state *s, char *buf, + int buf_size) { int i; int bytes_written = 0; u_int64_t num_elem, elem; int pct; - - num_elem = s->latency_reads_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Read Latency Histogram (n = %llu):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_read[i]; - pct = div64_u64(elem * 100, num_elem); - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_read[i]; - pct = div64_u64(elem * 100, num_elem); - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); - } - num_elem = s->latency_writes_elems; - if (num_elem > 0) { - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "IO svc_time Write Latency Histogram (n = %llu):\n", - num_elem); - for (i = 0; - i < ARRAY_SIZE(latency_x_axis_us); - i++) { - elem = s->latency_y_axis_write[i]; - pct = div64_u64(elem * 100, num_elem); - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t< %5lluus%15llu%15d%%\n", - latency_x_axis_us[i], - elem, pct); - } - /* Last element in y-axis table is overflow */ - elem = s->latency_y_axis_write[i]; - pct = div64_u64(elem * 100, num_elem); - bytes_written += scnprintf(buf + bytes_written, - PAGE_SIZE - bytes_written, - "\t> %5dms%15llu%15d%%\n", 10, - elem, pct); + u_int64_t average; + + num_elem = s->latency_elems; + if (num_elem > 0) { + average = div64_u64(s->latency_sum, s->latency_elems); + bytes_written += scnprintf(buf + bytes_written, + buf_size - bytes_written, + "IO svc_time %s Latency Histogram (n = %llu," + " average = %llu):\n", name, num_elem, average); + for (i = 0; + i < ARRAY_SIZE(latency_x_axis_us); + i++) { + elem = s->latency_y_axis[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t< %6lluus%15llu%15d%%\n", + latency_x_axis_us[i], + elem, pct); + } + /* Last element in y-axis table is overflow */ + elem = s->latency_y_axis[i]; + pct = div64_u64(elem * 100, num_elem); + bytes_written += scnprintf(buf + bytes_written, + PAGE_SIZE - bytes_written, + "\t>=%6lluus%15llu%15d%%\n", + latency_x_axis_us[i - 1], elem, pct); } + return bytes_written; } EXPORT_SYMBOL(blk_latency_hist_show); diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 40ddc3e69a4d..00b9d7ef392f 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -209,9 +209,10 @@ void mmc_request_done(struct mmc_host *host, struct mmc_request *mrq) completion = ktime_get(); delta_us = ktime_us_delta(completion, mrq->io_start); - blk_update_latency_hist(&host->io_lat_s, - (mrq->data->flags & MMC_DATA_READ), - delta_us); + blk_update_latency_hist( + (mrq->data->flags & MMC_DATA_READ) ? + &host->io_lat_read : + &host->io_lat_write, delta_us); } #endif } @@ -3100,8 +3101,14 @@ static ssize_t latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mmc_host *host = cls_dev_to_mmc_host(dev); + size_t written_bytes; - return blk_latency_hist_show(&host->io_lat_s, buf); + written_bytes = blk_latency_hist_show("Read", &host->io_lat_read, + buf, PAGE_SIZE); + written_bytes += blk_latency_hist_show("Write", &host->io_lat_write, + buf + written_bytes, PAGE_SIZE - written_bytes); + + return written_bytes; } /* @@ -3119,9 +3126,10 @@ latency_hist_store(struct device *dev, struct device_attribute *attr, if (kstrtol(buf, 0, &value)) return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&host->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || + if (value == BLK_IO_LAT_HIST_ZERO) { + memset(&host->io_lat_read, 0, sizeof(host->io_lat_read)); + memset(&host->io_lat_write, 0, sizeof(host->io_lat_write)); + } else if (value == BLK_IO_LAT_HIST_ENABLE || value == BLK_IO_LAT_HIST_DISABLE) host->latency_hist_enabled = value; return count; diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index f0a015aca608..fe893f23d693 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -3622,10 +3622,10 @@ static void __ufshcd_transfer_req_compl(struct ufs_hba *hba, completion = ktime_get(); delta_us = ktime_us_delta(completion, req->lat_hist_io_start); - /* rq_data_dir() => true if WRITE */ - blk_update_latency_hist(&hba->io_lat_s, - (rq_data_dir(req) == READ), - delta_us); + blk_update_latency_hist( + (rq_data_dir(req) == READ) ? + &hba->io_lat_read : + &hba->io_lat_write, delta_us); } } /* Do not touch lrbp after scsi done */ @@ -6371,9 +6371,10 @@ latency_hist_store(struct device *dev, struct device_attribute *attr, if (kstrtol(buf, 0, &value)) return -EINVAL; - if (value == BLK_IO_LAT_HIST_ZERO) - blk_zero_latency_hist(&hba->io_lat_s); - else if (value == BLK_IO_LAT_HIST_ENABLE || + if (value == BLK_IO_LAT_HIST_ZERO) { + memset(&hba->io_lat_read, 0, sizeof(hba->io_lat_read)); + memset(&hba->io_lat_write, 0, sizeof(hba->io_lat_write)); + } else if (value == BLK_IO_LAT_HIST_ENABLE || value == BLK_IO_LAT_HIST_DISABLE) hba->latency_hist_enabled = value; return count; @@ -6384,8 +6385,14 @@ latency_hist_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ufs_hba *hba = dev_get_drvdata(dev); + size_t written_bytes; - return blk_latency_hist_show(&hba->io_lat_s, buf); + written_bytes = blk_latency_hist_show("Read", &hba->io_lat_read, + buf, PAGE_SIZE); + written_bytes += blk_latency_hist_show("Write", &hba->io_lat_write, + buf + written_bytes, PAGE_SIZE - written_bytes); + + return written_bytes; } static DEVICE_ATTR(latency_hist, S_IRUGO | S_IWUSR, diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index dbe68b27c6cb..b35a5b999412 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -564,7 +564,8 @@ struct ufs_hba { enum bkops_status urgent_bkops_lvl; bool is_urgent_bkops_lvl_checked; int latency_hist_enabled; - struct io_latency_state io_lat_s; + struct io_latency_state io_lat_read; + struct io_latency_state io_lat_write; }; /* Returns true if clocks can be gated. Otherwise false */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e47a7f7025a0..fe8dd2797b1f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1738,43 +1738,26 @@ static const u_int64_t latency_x_axis_us[] = { #define BLK_IO_LAT_HIST_ZERO 2 struct io_latency_state { - u_int64_t latency_y_axis_read[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_reads_elems; - u_int64_t latency_y_axis_write[ARRAY_SIZE(latency_x_axis_us) + 1]; - u_int64_t latency_writes_elems; + u_int64_t latency_y_axis[ARRAY_SIZE(latency_x_axis_us) + 1]; + u_int64_t latency_elems; + u_int64_t latency_sum; }; static inline void -blk_update_latency_hist(struct io_latency_state *s, - int read, - u_int64_t delta_us) +blk_update_latency_hist(struct io_latency_state *s, u_int64_t delta_us) { int i; - for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) { - if (delta_us < (u_int64_t)latency_x_axis_us[i]) { - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; + for (i = 0; i < ARRAY_SIZE(latency_x_axis_us); i++) + if (delta_us < (u_int64_t)latency_x_axis_us[i]) break; - } - } - if (i == ARRAY_SIZE(latency_x_axis_us)) { - /* Overflowed the histogram */ - if (read) - s->latency_y_axis_read[i]++; - else - s->latency_y_axis_write[i]++; - } - if (read) - s->latency_reads_elems++; - else - s->latency_writes_elems++; + s->latency_y_axis[i]++; + s->latency_elems++; + s->latency_sum += delta_us; } -void blk_zero_latency_hist(struct io_latency_state *s); -ssize_t blk_latency_hist_show(struct io_latency_state *s, char *buf); +ssize_t blk_latency_hist_show(char* name, struct io_latency_state *s, + char *buf, int buf_size); #else /* CONFIG_BLOCK */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index fac3b5c27f4f..1808b7895dac 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -409,7 +409,8 @@ struct mmc_host { #ifdef CONFIG_BLOCK int latency_hist_enabled; - struct io_latency_state io_lat_s; + struct io_latency_state io_lat_read; + struct io_latency_state io_lat_write; #endif unsigned long private[0] ____cacheline_aligned; -- cgit v1.2.3 From f18c44dc552e2ed0655ed5ec49b578da4dd30588 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 20 Dec 2017 16:59:11 -0800 Subject: ANDROID: sdcardfs: notify lower file of opens fsnotify_open is not called within dentry_open, so we need to call it ourselves. Change-Id: Ia7f323b3d615e6ca5574e114e8a5d7973fb4c119 Signed-off-by: Daniel Rosenberg Bug: 70706497 --- fs/sdcardfs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c index 5ac0b0bbb0ec..dd76ecf33cf3 100644 --- a/fs/sdcardfs/file.c +++ b/fs/sdcardfs/file.c @@ -18,6 +18,7 @@ * General Public License. */ +#include #include "sdcardfs.h" #ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE #include @@ -259,6 +260,7 @@ static int sdcardfs_open(struct inode *inode, struct file *file) fput(lower_file); /* fput calls dput for lower_dentry */ } } else { + fsnotify_open(lower_file); sdcardfs_set_lower_file(file, lower_file); } -- cgit v1.2.3 From 173c52eae9283363f730c69bfc8694f331571fd3 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Tue, 2 Jan 2018 14:44:49 -0800 Subject: ANDROID: sdcardfs: Add default_normal option The default_normal option causes mounts with the gid set to AID_SDCARD_RW to have user specific gids, as in the normal case. Signed-off-by: Daniel Rosenberg Change-Id: I9619b8ac55f41415df943484dc8db1ea986cef6f Bug: 64672411 --- fs/sdcardfs/main.c | 6 ++++++ fs/sdcardfs/sdcardfs.h | 3 ++- fs/sdcardfs/super.c | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 0a2b5167e9a2..3d1023a7ff7c 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -33,6 +33,7 @@ enum { Opt_userid, Opt_reserved_mb, Opt_gid_derivation, + Opt_default_normal, Opt_err, }; @@ -45,6 +46,7 @@ static const match_table_t sdcardfs_tokens = { {Opt_userid, "userid=%d"}, {Opt_multiuser, "multiuser"}, {Opt_gid_derivation, "derive_gid"}, + {Opt_default_normal, "default_normal"}, {Opt_reserved_mb, "reserved_mb=%u"}, {Opt_err, NULL} }; @@ -68,6 +70,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, opts->reserved_mb = 0; /* by default, gid derivation is off */ opts->gid_derivation = false; + vfsopts->default_normal = false; *debug = 0; @@ -122,6 +125,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, case Opt_gid_derivation: opts->gid_derivation = true; break; + case Opt_default_normal: + vfsopts->default_normal = true; /* unknown option */ default: if (!silent) @@ -175,6 +180,7 @@ int parse_options_remount(struct super_block *sb, char *options, int silent, return 0; vfsopts->mask = option; break; + case Opt_default_normal: case Opt_multiuser: case Opt_userid: case Opt_fsuid: diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index d1d8bab00fe5..365b302149c5 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -226,6 +226,7 @@ struct sdcardfs_mount_options { struct sdcardfs_vfsmount_options { gid_t gid; mode_t mask; + bool default_normal; }; extern int parse_options_remount(struct super_block *sb, char *options, int silent, @@ -417,7 +418,7 @@ static inline int get_gid(struct vfsmount *mnt, { struct sdcardfs_vfsmount_options *opts = mnt->data; - if (opts->gid == AID_SDCARD_RW) + if (opts->gid == AID_SDCARD_RW && !opts->default_normal) /* As an optimization, certain trusted system components only run * as owner but operate across all users. Since we're now handing * out the sdcard_rw GID only to trusted apps, we're okay relaxing diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index b89947d878e3..a28b40f5adc8 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -304,6 +304,8 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, seq_printf(m, ",userid=%u", opts->fs_user_id); if (opts->gid_derivation) seq_puts(m, ",derive_gid"); + if (vfsopts->default_normal) + seq_puts(m, ",default_normal"); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); -- cgit v1.2.3 From 61c51da2b4bd8abb2920f4dec0861a6fe5b8b486 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:31 -0500 Subject: tcp_bbr: reset full pipe detection on loss recovery undo commit 2f6c498e4f15d27852c04ed46d804a39137ba364 upstream. Fix BBR so that upon notification of a loss recovery undo BBR resets the full pipe detection (STARTUP exit) state machine. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this could previously cause BBR to spuriously estimate that the pipe is full. Since spurious loss recovery means that our overall sending will have slowed down spuriously, this commit gives a flow more time to probe robustly for bandwidth and decide the pipe is really full. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_bbr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 97f9cac98348..64ff4966aad6 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -843,6 +843,10 @@ static u32 bbr_sndbuf_expand(struct sock *sk) */ static u32 bbr_undo_cwnd(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ + bbr->full_bw_cnt = 0; return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From 8824b2d7abfb113e33eae268ea22b11eb15b30b4 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 7 Dec 2017 12:43:32 -0500 Subject: tcp_bbr: reset long-term bandwidth sampling on loss recovery undo commit 600647d467c6d04b3954b41a6ee1795b5ae00550 upstream. Fix BBR so that upon notification of a loss recovery undo BBR resets long-term bandwidth sampling. Under high reordering, reordering events can be interpreted as loss. If the reordering and spurious loss estimates are high enough, this can cause BBR to spuriously estimate that we are seeing loss rates high enough to trigger long-term bandwidth estimation. To avoid that problem, this commit resets long-term bandwidth sampling on loss recovery undo events. Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_bbr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 64ff4966aad6..e86a34fd5484 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -847,6 +847,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ bbr->full_bw_cnt = 0; + bbr_reset_lt_bw_sampling(sk); return tcp_sk(sk)->snd_cwnd; } -- cgit v1.2.3 From b5fd58e997cf6294d2c027d585f7dcbd1e096bce Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:33 -0500 Subject: x86/boot: Add early cmdline parsing for options with arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e505371dd83963caae1a37ead9524e8d997341be upstream. Add a cmdline_find_option() function to look for cmdline options that take arguments. The argument is returned in a supplied buffer and the argument length (regardless of whether it fits in the supplied buffer) is returned, with -1 indicating not found. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cmdline.h | 2 + arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 5cc78bf57232..3261abb21ef4 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, return 0; /* Buffer overrun */ } +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. + */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + int cmdline_find_option_bool(const char *cmdline, const char *option) { return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); } + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} -- cgit v1.2.3 From 13be4483bb487176c48732b887780630a141ae96 Mon Sep 17 00:00:00 2001 From: Richard Fellner Date: Thu, 4 May 2017 14:26:50 +0200 Subject: KAISER: Kernel Address Isolation This patch introduces our implementation of KAISER (Kernel Address Isolation to have Side-channels Efficiently Removed), a kernel isolation technique to close hardware side channels on kernel address information. More information about the patch can be found on: https://github.com/IAIK/KAISER From: Richard Fellner From: Daniel Gruss Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Date: Thu, 4 May 2017 14:26:50 +0200 Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 To: To: Cc: Cc: Cc: Michael Schwarz Cc: Richard Fellner Cc: Ingo Molnar Cc: Cc: After several recent works [1,2,3] KASLR on x86_64 was basically considered dead by many researchers. We have been working on an efficient but effective fix for this problem and found that not mapping the kernel space when running in user mode is the solution to this problem [4] (the corresponding paper [5] will be presented at ESSoS17). With this RFC patch we allow anybody to configure their kernel with the flag CONFIG_KAISER to add our defense mechanism. If there are any questions we would love to answer them. We also appreciate any comments! Cheers, Daniel (+ the KAISER team from Graz University of Technology) [1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf [2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf [3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf [4] https://github.com/IAIK/KAISER [5] https://gruss.cc/files/kaiser.pdf [patch based also on https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch] Signed-off-by: Richard Fellner Signed-off-by: Moritz Lipp Signed-off-by: Daniel Gruss Signed-off-by: Michael Schwarz Acked-by: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 17 ++++ arch/x86/entry/entry_64_compat.S | 7 +- arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 4 + arch/x86/include/asm/pgtable_64.h | 21 +++++ arch/x86/include/asm/pgtable_types.h | 12 ++- arch/x86/include/asm/processor.h | 7 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/espfix_64.c | 6 ++ arch/x86/kernel/head_64.S | 16 +++- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/mm/Makefile | 2 +- arch/x86/mm/kaiser.c | 160 +++++++++++++++++++++++++++++++++++ arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pgtable.c | 26 ++++++ include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/percpu-defs.h | 30 +++++++ init/main.c | 6 ++ kernel/fork.c | 8 ++ security/Kconfig | 7 ++ 22 files changed, 449 insertions(+), 16 deletions(-) create mode 100644 arch/x86/include/asm/kaiser.h create mode 100644 arch/x86/mm/kaiser.c diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7b0e7ff4c58..9467a2c4bc60 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath: movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 @@ -323,10 +326,12 @@ return_from_SYSCALL_64: syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret END(entry_SYSCALL_64) @@ -424,6 +429,7 @@ ENTRY(ret_from_fork) movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret @@ -478,6 +484,7 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS + SWITCH_KERNEL_CR3 /* * We need to tell lockdep that IRQs are off. We can't do this until @@ -535,6 +542,7 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret @@ -612,6 +620,7 @@ native_irq_return_ldt: pushq %rdi /* Stash user RDI */ SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -638,6 +647,7 @@ native_irq_return_ldt: * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax + SWITCH_USER_CR3 SWAPGS movq %rax, %rsp @@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry) testl %edx, %edx js 1f /* negative -> in kernel */ SWAPGS + SWITCH_KERNEL_CR3 xorl %ebx, %ebx 1: ret END(paranoid_entry) @@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK jmp paranoid_exit_restore paranoid_exit_no_swapgs: @@ -1084,6 +1096,7 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + SWITCH_KERNEL_CR3 .Lerror_entry_from_usermode_after_swapgs: /* @@ -1135,6 +1148,7 @@ ENTRY(error_entry) * Switch to kernel gsbase: */ SWAPGS + SWITCH_KERNEL_CR3 /* * Pretend that the exception came from user mode: set up pt_regs @@ -1235,6 +1249,7 @@ ENTRY(nmi) */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1275,6 +1290,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. Fortunately, * do_nmi doesn't modify pt_regs. */ + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1486,6 +1502,7 @@ end_repeat_nmi: testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..f0e384ee8fc6 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat) ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d @@ -259,6 +262,7 @@ sysret32_from_system_call: xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 + SWITCH_USER_CR3 movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl @@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat) PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS - + SWITCH_KERNEL_CR3_NO_STACK /* * User tracing code (ptrace or signal handlers) might assume that * the saved RAX contains a 32-bit number when we're invoking a 32-bit @@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON + SWITCH_USER_CR3_NO_STACK SWAPGS jmp restore_regs_and_iret END(entry_INT80_compat) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b90e1053049b..0817d63bce41 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -178,7 +178,7 @@ extern char irq_entries_start[]; #define VECTOR_RETRIGGERED ((void *)~0UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..63ee8309b35b --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +/* This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. + * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, + * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. + * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions + * of the user space, or the stacks. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~0x1000), \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg +movq %cr3, \reg +orq $(0x1000), \reg +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + + +.macro SWITCH_USER_CR3_NO_STACK + +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_USER_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg +.endm +.macro SWITCH_USER_CR3_NO_STACK +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ +#else /* __ASSEMBLY__ */ + + +#ifdef CONFIG_KAISER +// Upon kernel/user mode switch, it may happen that +// the address space has to be switched before the registers have been stored. +// To change the address space, another register is needed. +// A register therefore has to be stored/restored. +// +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +#endif /* CONFIG_KAISER */ + +/** + * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * the mapping is done on a global scope, so no bigger synchronization has to be done. + * the pages have to be manually unmapped again when they are not needed any longer. + */ +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + +/** + * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * shadowmem_initialize_mapping - Initalize the shadow mapping + * + * most parts of the shadow mapping can be mapped upon boot time. + * only the thread stacks have to be mapped on runtime. + * the mapped regions are not unmapped at all. + */ +extern void kaiser_init(void); + +#endif + + + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..4b479c9b064f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + // clone the shadow pgd part as well + memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..e6ea39fa6d0b 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_KAISER + // We know that a pgd is page aligned. + // Therefore the lower indices have to be mapped to user space. + // These pages are mapped to the shadow mapping. + if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + + pgdp->pgd = pgd.pgd & ~_PAGE_USER; +#else /* CONFIG_KAISER */ *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8b4de22d6429..00fecbb153ac 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -45,7 +45,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -119,7 +123,11 @@ #define _PAGE_DEVMAP (_AT(pteval_t, 0)) #endif -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#ifdef CONFIG_KAISER +#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +#else +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 83db0eae9979..3d4784e2f862 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -308,7 +308,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); @@ -335,6 +335,11 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; + + struct { + char irq_stack_pointer[64]; + char unused[IRQ_STACK_SIZE - 64]; + }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 91588be529b9..3efde13eaed0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = { static const struct cpu_dev *this_cpu = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -1365,7 +1365,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..9ff875a1aa24 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -126,6 +127,11 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +#ifdef CONFIG_KAISER + // add the esp stack pud to the shadow mapping here. + // This can be done directly, because the fixup stack has its own pud + set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +#endif /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..9e849b520780 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -405,6 +405,14 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_KAISER +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -414,7 +422,7 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -424,10 +432,10 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) - .fill 512,8,0 +NEXT_PGD_PAGE(init_level4_pgt) + .fill 2*512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..f480b38a03c3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -51,7 +51,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8e10e72bf6ee..a55b32007785 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -41,7 +41,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { .x86_tss = { .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..682c162333ba 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o - +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..cf1bb922d467 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,160 @@ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_KAISER + +__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/** + * Get the real ppn from a address in kernel mapping. + * @param address The virtual adrress + * @return the physical address + */ +static inline unsigned long get_pa_from_mapping (unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + + if (pud_large(*pud)) { + return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + } + + pmd = pmd_offset(pud, address); + BUG_ON(pmd_none(*pmd)); + + if (pmd_large(*pmd)) { + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + } + + pte = pte_offset_kernel(pmd, address); + BUG_ON(pte_none(*pte)); + + return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); +} + +void _kaiser_copy (unsigned long start_addr, unsigned long size, + unsigned long flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long address; + unsigned long end_addr = start_addr + size; + unsigned long target_address; + + for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); + address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + + pgd = native_get_shadow_pgd(pgd_offset_k(address)); + + BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) { + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); + } + BUG_ON(pud_large(*pud)); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); + } + BUG_ON(pmd_large(*pmd)); + + pte = pte_offset_kernel(pmd, address); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + BUG_ON(__pa(pte_page(*pte)) != target_address); + } + } +} + +// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +static inline void __init _kaiser_init(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); + } +} + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +spinlock_t shadow_table_lock; +void __init kaiser_init(void) +{ + int cpu; + spin_lock_init(&shadow_table_lock); + + spin_lock(&shadow_table_lock); + + _kaiser_init(); + + for_each_possible_cpu(cpu) { + // map the per cpu user variables + _kaiser_copy( + (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), + (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, + __PAGE_KERNEL); + } + + // map the entry/exit text section, which is responsible to switch between user- and kernel mode + _kaiser_copy( + (unsigned long) __entry_text_start, + (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, + __PAGE_KERNEL_RX); + + // the fixed map address of the idt_table + _kaiser_copy( + (unsigned long) idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + + spin_unlock(&shadow_table_lock); +} + +// add a mapping to the shadow-mapping, and synchronize the mappings +void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + spin_lock(&shadow_table_lock); + _kaiser_copy(addr, size, flags); + spin_unlock(&shadow_table_lock); +} + +extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); + spin_lock(&shadow_table_lock); + do { + unmap_pud_range(pgd, start, start + size); + } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); + spin_unlock(&shadow_table_lock); +} +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..c17412f92d77 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(pgd, start); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..27d218b38538 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd) #else static inline pgd_t *_pgd_alloc(void) { +#ifdef CONFIG_KAISER + // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory + // block. Therefore, we have to allocate at least 3 pages. However, the + // __get_free_pages returns us 4 pages. Hence, we store the base pointer at + // the beginning of the page of our 8kb-aligned memory block in order to + // correctly free it afterwars. + + unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); + + if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) + { + *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; + return (pgd_t *) pages; + } + else + { + *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; + return (pgd_t *) (pages + PAGE_SIZE); + } +#else return (pgd_t *)__get_free_page(PGALLOC_GFP); +#endif } static inline void _pgd_free(pgd_t *pgd) { +#ifdef CONFIG_KAISER + unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); + free_pages(pages, get_order(4*PAGE_SIZE)); +#else free_page((unsigned long)pgd); +#endif } #endif /* CONFIG_X86_PAE */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index dc81e5287ebf..d6ab14414776 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -778,7 +778,16 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ - *(.data..percpu..first) \ + \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ + *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ + \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f16299ca068..8ea945f63a05 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -35,6 +35,12 @@ #endif +#ifdef CONFIG_KAISER +#define USER_MAPPED_SECTION "..user_mapped" +#else +#define USER_MAPPED_SECTION "" +#endif + /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the @@ -115,6 +121,12 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + /* * Declaration/definition used for per-CPU variables that must come first in * the set of variables. @@ -144,6 +156,14 @@ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + #define DECLARE_PER_CPU_ALIGNED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ ____cacheline_aligned @@ -162,6 +182,16 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ __aligned(PAGE_SIZE) +/* + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) /* * Declaration/definition used for per-CPU variables that must be read mostly. diff --git a/init/main.c b/init/main.c index 25bac88bc66e..2c009f77e655 100644 --- a/init/main.c +++ b/init/main.c @@ -86,6 +86,9 @@ #include #include #include +#ifdef CONFIG_KAISER +#include +#endif static int kernel_init(void *); @@ -473,6 +476,9 @@ static void __init mm_init(void) pgtable_init(); vmalloc_init(); ioremap_huge_init(); +#ifdef CONFIG_KAISER + kaiser_init(); +#endif } asmlinkage __visible void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index 9321b1ad3335..4014be1dd2b6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) #endif } +extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size); static inline void free_thread_stack(struct task_struct *tsk) { +#ifdef CONFIG_KAISER + kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE); +#endif #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { unsigned long flags; @@ -468,6 +472,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; @@ -495,6 +500,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * functions again. */ tsk->stack = stack; +#ifdef CONFIG_KAISER + kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); +#endif #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif diff --git a/security/Kconfig b/security/Kconfig index 118f4549404e..f515ac302257 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -30,6 +30,13 @@ config SECURITY model will be used. If you are unsure how to answer this question, answer N. +config KAISER + bool "Remove the kernel mapping in user mode" + depends on X86_64 + depends on !PARAVIRT + help + This enforces a strict kernel and user space isolation in order to close + hardware side channels on kernel address information. config SECURITYFS bool "Enable the securityfs filesystem" -- cgit v1.2.3 From 8f0baadf2bea3861217763734b57e1dd2db703dd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: merged update Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging). Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 105 ++++++++++-- arch/x86/include/asm/kaiser.h | 43 +++-- arch/x86/include/asm/pgtable.h | 18 +- arch/x86/include/asm/pgtable_64.h | 48 +++++- arch/x86/include/asm/pgtable_types.h | 6 +- arch/x86/kernel/espfix_64.c | 13 +- arch/x86/kernel/head_64.S | 19 ++- arch/x86/kernel/ldt.c | 27 ++- arch/x86/kernel/tracepoint.c | 2 + arch/x86/mm/kaiser.c | 313 +++++++++++++++++++++++++---------- arch/x86/mm/pageattr.c | 63 +++++-- arch/x86/mm/pgtable.c | 40 ++--- include/linux/kaiser.h | 26 +++ kernel/fork.c | 9 +- security/Kconfig | 5 + 15 files changed, 549 insertions(+), 188 deletions(-) create mode 100644 include/linux/kaiser.h diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9467a2c4bc60..2869fc178658 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath: movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 @@ -326,11 +333,25 @@ return_from_SYSCALL_64: syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1087,6 +1108,13 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + /* + * error_entry() always returns with a kernel gsbase and + * CR3. We must also have a kernel CR3/gsbase before + * calling TRACE_IRQS_*. Just unconditionally switch to + * the kernel CR3 here. + */ + SWITCH_KERNEL_CR3 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1096,7 +1124,6 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS - SWITCH_KERNEL_CR3 .Lerror_entry_from_usermode_after_swapgs: /* @@ -1148,7 +1175,6 @@ ENTRY(error_entry) * Switch to kernel gsbase: */ SWAPGS - SWITCH_KERNEL_CR3 /* * Pretend that the exception came from user mode: set up pt_regs @@ -1249,7 +1275,10 @@ ENTRY(nmi) */ SWAPGS_UNSAFE_STACK - SWITCH_KERNEL_CR3_NO_STACK + /* + * percpu variables are mapped with user CR3, so no need + * to switch CR3 here. + */ cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1283,14 +1312,33 @@ ENTRY(nmi) movq %rsp, %rdi movq $-1, %rsi +#ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), %rax +#endif + movq %rax, %cr3 +#endif call do_nmi + /* + * Unconditionally restore CR3. I know we return to + * kernel code that needs user CR3, but do we ever return + * to "user mode" where we need the kernel CR3? + */ +#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 +#endif /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. Do not + * switch to user CR3: we might be going back to kernel code + * that had a user CR3 set. */ - SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1486,23 +1534,54 @@ end_repeat_nmi: ALLOC_PT_GPREGS_ON_STACK /* - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit - * as we should not be calling schedule in NMI context. - * Even with normal interrupts enabled. An NMI should not be - * setting NEED_RESCHED or anything that normal interrupts and - * exceptions might do. + * Use the same approach as paranoid_entry to handle SWAPGS, but + * without CR3 handling since we do that differently in NMIs. No + * need to use paranoid_exit as we should not be calling schedule + * in NMI context. Even with normal interrupts enabled. An NMI + * should not be setting NEED_RESCHED or anything that normal + * interrupts and exceptions might do. */ - call paranoid_entry + cld + SAVE_C_REGS + SAVE_EXTRA_REGS + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: +#ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), %rax +#endif + movq %rax, %cr3 +#endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi + addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ movq $-1, %rsi call do_nmi + /* + * Unconditionally restore CR3. We might be returning to + * kernel code that needs user CR3, like just just before + * a sysret. + */ +#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 +#endif testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: - SWITCH_USER_CR3_NO_STACK + /* We fixed up CR3 above, so no need to switch it here */ SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 63ee8309b35b..0703f48777f3 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -16,13 +16,17 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~0x1000), \reg +#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH orq $(0x1000), \reg +#endif movq \reg, %cr3 .endm @@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm #endif /* CONFIG_KAISER */ + #else /* __ASSEMBLY__ */ #ifdef CONFIG_KAISER -// Upon kernel/user mode switch, it may happen that -// the address space has to be switched before the registers have been stored. -// To change the address space, another register is needed. -// A register therefore has to be stored/restored. -// -DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ -#endif /* CONFIG_KAISER */ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** - * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range * @size: the size of the range * @flags: The mapping flags of the pages * - * the mapping is done on a global scope, so no bigger synchronization has to be done. - * the pages have to be manually unmapped again when they are not needed any longer. + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. */ -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); /** - * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range * @size: the size of the range */ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * shadowmem_initialize_mapping - Initalize the shadow mapping + * kaiser_initialize_mapping - Initalize the shadow mapping * - * most parts of the shadow mapping can be mapped upon boot time. - * only the thread stacks have to be mapped on runtime. - * the mapped regions are not unmapped at all. + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and nevertunmapped. */ extern void kaiser_init(void); -#endif +#endif /* CONFIG_KAISER */ + +#endif /* __ASSEMBLY */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4b479c9b064f..1cee98e182b7 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (IS_ENABLED(CONFIG_KAISER)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - // clone the shadow pgd part as well - memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e6ea39fa6d0b..000265c8b74e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); } +#else +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ + BUILD_BUG_ON(1); + return NULL; +} +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ + return pgdp; +} #endif /* CONFIG_KAISER */ +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { #ifdef CONFIG_KAISER - // We know that a pgd is page aligned. - // Therefore the lower indices have to be mapped to user space. - // These pages are mapped to the shadow mapping. - if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + pteval_t extra_kern_pgd_flags = 0; + /* Do we need to also populate the shadow pgd? */ + if (is_userspace_pgd(pgdp)) { native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + extra_kern_pgd_flags = _PAGE_NX; } - - pgdp->pgd = pgd.pgd & ~_PAGE_USER; + pgdp->pgd = pgd.pgd; + pgdp->pgd |= extra_kern_pgd_flags; #else /* CONFIG_KAISER */ *pgdp = pgd; #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 00fecbb153ac..8bc8d02fb4b1 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -48,7 +48,7 @@ #ifdef CONFIG_KAISER #define _PAGE_GLOBAL (_AT(pteval_t, 0)) #else -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) @@ -123,11 +123,7 @@ #define _PAGE_DEVMAP (_AT(pteval_t, 0)) #endif -#ifdef CONFIG_KAISER -#define _PAGE_PROTNONE (_AT(pteval_t, 0)) -#else #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9ff875a1aa24..560c2fd73b28 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); -#ifdef CONFIG_KAISER - // add the esp stack pud to the shadow mapping here. - // This can be done directly, because the fixup stack has its own pud - set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); -#endif + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (IS_ENABLED(CONFIG_KAISER)) + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9e849b520780..5775379237f7 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag) GLOBAL(name) #ifdef CONFIG_KAISER +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. + * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ #define NEXT_PGD_PAGE(name) \ .balign 2 * PAGE_SIZE; \ GLOBAL(name) #else #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 #endif /* Automate the creation of 1 to 1 mapping pmd entries */ @@ -425,6 +438,7 @@ GLOBAL(name) NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts) #ifndef CONFIG_XEN NEXT_PGD_PAGE(init_level4_pgt) - .fill 2*512,8,0 + .fill 512,8,0 + .fill KAISER_USER_PGD_FILL,8,0 #else NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt) .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 5f70014ca602..dd31f9f11f01 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret = 0; if (size > LDT_ENTRIES) return NULL; @@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } + // FIXME: make kaiser_add_mapping() return an error code + // when it fails + kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } new_ldt->size = size; return new_ldt; } @@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - free_page((unsigned long)ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..2bb5ee464df3 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -9,10 +9,12 @@ #include atomic_t trace_idt_ctr = ATOMIC_INIT(0); +__aligned(PAGE_SIZE) struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) trace_idt_table }; /* No need to be aligned, but done to keep all IDTs defined the same way. */ +__aligned(PAGE_SIZE) gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; static int trace_irq_vector_refcount; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index cf1bb922d467..7270a293175d 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -1,160 +1,305 @@ - - +#include #include #include #include #include #include #include +#include #include #include - #include + +#include #include #include #include #ifdef CONFIG_KAISER __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); -/** - * Get the real ppn from a address in kernel mapping. - * @param address The virtual adrress - * @return the physical address +/* + * Returns -1 on error. */ -static inline unsigned long get_pa_from_mapping (unsigned long address) +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); - - pud = pud_offset(pgd, address); - BUG_ON(pud_none(*pud)); + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); - if (pud_large(*pud)) { - return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; } - pmd = pmd_offset(pud, address); - BUG_ON(pmd_none(*pmd)); + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); - if (pmd_large(*pmd)) { - return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; } - pte = pte_offset_kernel(pmd, address); - BUG_ON(pte_none(*pte)); + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); - return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); } -void _kaiser_copy (unsigned long start_addr, unsigned long size, - unsigned long flags) +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; - pte_t *pte; - unsigned long address; - unsigned long end_addr = start_addr + size; - unsigned long target_address; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); - address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { - target_address = get_pa_from_mapping(address); + might_sleep(); + if (is_atomic) { + gfp &= ~GFP_KERNEL; + gfp |= __GFP_HIGH | __GFP_ATOMIC; + } - pgd = native_get_shadow_pgd(pgd_offset_k(address)); + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); - BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); - BUG_ON(pgd_large(*pgd)); + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } - pud = pud_offset(pgd, address); - if (pud_none(*pud)) { - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); - } - BUG_ON(pud_large(*pud)); + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); - } - BUG_ON(pmd_large(*pmd)); + return pte_offset_kernel(pmd, address); +} - pte = pte_offset_kernel(pmd, address); +int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + for (;address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address, false); if (pte_none(*pte)) { set_pte(pte, __pte(flags | target_address)); } else { - BUG_ON(__pa(pte_page(*pte)) != target_address); + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); } } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); } -// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping -static inline void __init _kaiser_init(void) +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. + */ +static void __init kaiser_init_all_pgds(void) { pgd_t *pgd; int i = 0; pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { - set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. + */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); } } +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -spinlock_t shadow_table_lock; +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ void __init kaiser_init(void) { int cpu; - spin_lock_init(&shadow_table_lock); - - spin_lock(&shadow_table_lock); - _kaiser_init(); + kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { - // map the per cpu user variables - _kaiser_copy( - (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), - (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, - __PAGE_KERNEL); + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); } - // map the entry/exit text section, which is responsible to switch between user- and kernel mode - _kaiser_copy( - (unsigned long) __entry_text_start, - (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, - __PAGE_KERNEL_RX); + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. + */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); - // the fixed map address of the idt_table - _kaiser_copy( - (unsigned long) idt_descr.address, - sizeof(gate_desc) * NR_VECTORS, - __PAGE_KERNEL_RO); - - spin_unlock(&shadow_table_lock); +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); +#ifdef CONFIG_TRACING + kaiser_add_user_map_early(&trace_idt_descr, + sizeof(trace_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&trace_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); +#endif + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); } +extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); // add a mapping to the shadow-mapping, and synchronize the mappings -void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { - spin_lock(&shadow_table_lock); - _kaiser_copy(addr, size, flags); - spin_unlock(&shadow_table_lock); + return kaiser_add_user_map((const void *)addr, size, flags); } -extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); void kaiser_remove_mapping(unsigned long start, unsigned long size) { - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); - spin_lock(&shadow_table_lock); - do { - unmap_pud_range(pgd, start, start + size); - } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); - spin_unlock(&shadow_table_lock); + unsigned long end = start + size; + unsigned long addr; + + for (addr = start; addr < end; addr += PGDIR_SIZE) { + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); + /* + * unmap_p4d_range() handles > P4D_SIZE unmaps, + * so no need to trim 'end'. + */ + unmap_pud_range_nofree(pgd, addr, end); + } } #endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c17412f92d77..73dcb0e18c1b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_FREE_PAGETABLES 8 #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } -static bool try_to_free_pte_page(pte_t *pte) +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; @@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) return true; } -static bool try_to_free_pmd_page(pmd_t *pmd) +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; @@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) return true; } -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, + unsigned long start, + unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); @@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) pte++; } - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { - if (unmap_pte_range(pmd, start, end)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (unmap_pte_range(cpa, pmd, start, end)) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, + unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); @@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - __unmap_pmd_range(pud, pmd, start, pre_end); + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); start = pre_end; pmd++; @@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) if (pmd_large(*pmd)) pmd_clear(pmd); else - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + __unmap_pmd_range(cpa, pud, pmd, + start, start + PMD_SIZE); start += PMD_SIZE; pmd++; @@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * 4K leftovers? */ if (start < end) - return __unmap_pmd_range(pud, pmd, start, end); + return __unmap_pmd_range(cpa, pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, + unsigned long start, + unsigned long end) { pud_t *pud = pud_offset(pgd, start); @@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - unmap_pmd_range(pud, start, pre_end); + unmap_pmd_range(cpa, pud, start, pre_end); start = pre_end; pud++; @@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) if (pud_large(*pud)) pud_clear(pud); else - unmap_pmd_range(pud, start, start + PUD_SIZE); + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; @@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) * 2M leftovers? */ if (start < end) - unmap_pmd_range(pud, start, end); + unmap_pmd_range(cpa, pud, start, end); /* * No need to try to free the PUD page because we'll free it in @@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) */ } +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = CPA_FREE_PAGETABLES, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = 0, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 27d218b38538..352fd015bc63 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else -static inline pgd_t *_pgd_alloc(void) -{ + #ifdef CONFIG_KAISER - // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory - // block. Therefore, we have to allocate at least 3 pages. However, the - // __get_free_pages returns us 4 pages. Hence, we store the base pointer at - // the beginning of the page of our 8kb-aligned memory block in order to - // correctly free it afterwars. - - unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); - - if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) - { - *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; - return (pgd_t *) pages; - } - else - { - *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; - return (pgd_t *) (pages + PAGE_SIZE); - } +/* + * Instead of one pmd, we aquire two pmds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 #else - return (pgd_t *)__get_free_page(PGALLOC_GFP); +#define PGD_ALLOCATION_ORDER 0 #endif + +static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { -#ifdef CONFIG_KAISER - unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); - free_pages(pages, get_order(4*PAGE_SIZE)); -#else - free_page((unsigned long)pgd); -#endif + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h new file mode 100644 index 000000000000..9db5433c2284 --- /dev/null +++ b/include/linux/kaiser.h @@ -0,0 +1,26 @@ +#ifndef _INCLUDE_KAISER_H +#define _INCLUDE_KAISER_H + +#ifdef CONFIG_KAISER +#include +#else + +/* + * These stubs are used whenever CONFIG_KAISER is off, which + * includes architectures that support KAISER, but have it + * disabled. + */ + +static inline void kaiser_init(void) +{ +} +static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ +} +static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + return 0; +} + +#endif /* !CONFIG_KAISER */ +#endif /* _INCLUDE_KAISER_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4014be1dd2b6..0b9688d91eaa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; @@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * functions again. */ tsk->stack = stack; -#ifdef CONFIG_KAISER - kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); -#endif + + err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); + if (err) + goto free_stack; #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif diff --git a/security/Kconfig b/security/Kconfig index f515ac302257..334d2e85fa7c 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -32,12 +32,17 @@ config SECURITY If you are unsure how to answer this question, answer N. config KAISER bool "Remove the kernel mapping in user mode" + default y depends on X86_64 depends on !PARAVIRT help This enforces a strict kernel and user space isolation in order to close hardware side channels on kernel address information. +config KAISER_REAL_SWITCH + bool "KAISER: actually switch page tables" + default y + config SECURITYFS bool "Enable the securityfs filesystem" help -- cgit v1.2.3 From ac2f1018ac210cfedcfab82dbafbda4e2db7ed08 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 5 Sep 2017 12:05:01 -0700 Subject: kaiser: do not set _PAGE_NX on pgd_none native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry: usually that just generated a warning on exit, but sometimes more mysterious and damaging failures (our production machines could not complete booting). The original fix to this just avoided adding _PAGE_NX to an empty entry; but eventually more problems surfaced with kexec, and EFI mapping expected to be a problem too. So now instead change native_set_pgd() to update shadow only if _PAGE_USER: A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure) use set_pgd() to set up a temporary internal virtual address space, with physical pages remapped at what Kaiser regards as userspace addresses: Kaiser then assumes a shadow pgd follows, which it will try to corrupt. This appears to be responsible for the recent kexec and kdump failures; though it's unclear how those did not manifest as a problem before. Ah, the shadow pgd will only be assumed to "follow" if the requested pgd is on an even-numbered page: so I suppose it was going wrong 50% of the time all along. What we need is a flag to set_pgd(), to tell it we're dealing with userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying? Add a test for that. But we cannot do the same for pgd_clear() (which may be called to clear corrupted entries - set aside the question of "corrupt in which pgd?" until later), so there just rely on pgd_clear() not being called in the problematic cases - with a WARN_ON_ONCE() which should fire half the time if it is. But this is getting too big for an inline function: move it into arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); and de-void and de-space native_get_shadow/normal_pgd() while here. Also make an unnecessary change to KASLR's init_trampoline(): it was using set_pgd() to assign a pgd-value to a global variable (not in a pg directory page), which was rather scary given Kaiser's previous set_pgd() implementation: not a problem now, but too scary to leave as was, it could easily blow up if we have to change set_pgd() again. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.h | 1 + arch/x86/include/asm/pgtable_64.h | 51 ++++++++++----------------------------- arch/x86/mm/kaiser.c | 42 ++++++++++++++++++++++++++++++++ arch/x86/mm/kaslr.c | 4 +-- 4 files changed, 58 insertions(+), 40 deletions(-) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 766a5211f827..80e2cce110b9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,6 +9,7 @@ */ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_KAISER #undef CONFIG_KASAN #include diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 000265c8b74e..177caf3f7ab1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); } #else -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { BUILD_BUG_ON(1); return NULL; } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { return pgdp; } #endif /* CONFIG_KAISER */ -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * This returns true for user pages that need to get copied into - * both the user and kernel copies of the page tables, and false - * for kernel pages that should only be in the kernel copy. - */ -static inline bool is_userspace_pgd(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -} - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_KAISER - pteval_t extra_kern_pgd_flags = 0; - /* Do we need to also populate the shadow pgd? */ - if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; - /* - * Even if the entry is *mapping* userspace, ensure - * that userspace can not use it. This way, if we - * get out to userspace running on the kernel CR3, - * userspace will crash instead of running. - */ - extra_kern_pgd_flags = _PAGE_NX; - } - pgdp->pgd = pgd.pgd; - pgdp->pgd |= extra_kern_pgd_flags; -#else /* CONFIG_KAISER */ - *pgdp = pgd; -#endif + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 7270a293175d..8d6061c2bc79 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -302,4 +302,46 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) unmap_pud_range_nofree(pgd, addr, end); } } + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). + */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} #endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aed206475aa7..9284ec1e8aa9 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) *pud_tramp = *pud; } - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } -- cgit v1.2.3 From 0994a2cf8fe4e884bad4810681117a7d0096c8e7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:57:03 -0700 Subject: kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE Kaiser only needs to map one page of the stack; and kernel/fork.c did not build on powerpc (no __PAGE_KERNEL). It's all cleaner if linux/kaiser.h provides kaiser_map_thread_stack() and kaiser_unmap_thread_stack() wrappers around asm/kaiser.h's kaiser_add_mapping() and kaiser_remove_mapping(). And use linux/kaiser.h in init/main.c to avoid the #ifdefs there. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- include/linux/kaiser.h | 40 +++++++++++++++++++++++++++++++++------- init/main.c | 6 +----- kernel/fork.c | 7 ++----- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h index 9db5433c2284..4a4d6d911a14 100644 --- a/include/linux/kaiser.h +++ b/include/linux/kaiser.h @@ -1,26 +1,52 @@ -#ifndef _INCLUDE_KAISER_H -#define _INCLUDE_KAISER_H +#ifndef _LINUX_KAISER_H +#define _LINUX_KAISER_H #ifdef CONFIG_KAISER #include + +static inline int kaiser_map_thread_stack(void *stack) +{ + /* + * Map that page of kernel stack on which we enter from user context. + */ + return kaiser_add_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); +} + +static inline void kaiser_unmap_thread_stack(void *stack) +{ + /* + * Note: may be called even when kaiser_map_thread_stack() failed. + */ + kaiser_remove_mapping((unsigned long)stack + + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); +} #else /* * These stubs are used whenever CONFIG_KAISER is off, which - * includes architectures that support KAISER, but have it - * disabled. + * includes architectures that support KAISER, but have it disabled. */ static inline void kaiser_init(void) { } -static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) +static inline int kaiser_add_mapping(unsigned long addr, + unsigned long size, unsigned long flags) +{ + return 0; +} +static inline void kaiser_remove_mapping(unsigned long start, + unsigned long size) { } -static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +static inline int kaiser_map_thread_stack(void *stack) { return 0; } +static inline void kaiser_unmap_thread_stack(void *stack) +{ +} #endif /* !CONFIG_KAISER */ -#endif /* _INCLUDE_KAISER_H */ +#endif /* _LINUX_KAISER_H */ diff --git a/init/main.c b/init/main.c index 2c009f77e655..99f026565608 100644 --- a/init/main.c +++ b/init/main.c @@ -80,15 +80,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_KAISER -#include -#endif static int kernel_init(void *); @@ -476,9 +474,7 @@ static void __init mm_init(void) pgtable_init(); vmalloc_init(); ioremap_huge_init(); -#ifdef CONFIG_KAISER kaiser_init(); -#endif } asmlinkage __visible void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index 0b9688d91eaa..70e10cb49be0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -212,12 +212,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) #endif } -extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size); static inline void free_thread_stack(struct task_struct *tsk) { -#ifdef CONFIG_KAISER - kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE); -#endif + kaiser_unmap_thread_stack(tsk->stack); #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { unsigned long flags; @@ -501,7 +498,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) */ tsk->stack = stack; - err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); + err= kaiser_map_thread_stack(tsk->stack); if (err) goto free_stack; #ifdef CONFIG_VMAP_STACK -- cgit v1.2.3 From 7a92e20d157f02d0259e2799dea43c9fa1a4541a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 17:09:44 -0700 Subject: kaiser: fix build and FIXME in alloc_ldt_struct() Include linux/kaiser.h instead of asm/kaiser.h to build ldt.c without CONFIG_KAISER. kaiser_add_mapping() does already return an error code, so fix the FIXME. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/ldt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index dd31f9f11f01..567bdbd7d917 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -16,9 +16,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -49,7 +49,7 @@ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; - int ret = 0; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -77,10 +77,8 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } - // FIXME: make kaiser_add_mapping() return an error code - // when it fails - kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, - __PAGE_KERNEL); + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); if (ret) { __free_ldt_struct(new_ldt); return NULL; -- cgit v1.2.3 From 639c005daeebab077596b034fecd6b8902a88024 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 13 Sep 2017 14:03:10 -0700 Subject: kaiser: KAISER depends on SMP It is absurd that KAISER should depend on SMP, but apparently nobody has tried a UP build before: which breaks on implicit declaration of function 'per_cpu_offset' in arch/x86/mm/kaiser.c. Now, you would expect that to be trivially fixed up; but looking at the System.map when that block is #ifdef'ed out of kaiser_init(), I see that in a UP build __per_cpu_user_mapped_end is precisely at __per_cpu_user_mapped_start, and the items carefully gathered into that section for user-mapping on SMP, dispersed elsewhere on UP. So, some other kind of section assignment will be needed on UP, but implementing that is not a priority: just make KAISER depend on SMP for now. Also inserted a blank line before the option, tidied up the brief Kconfig help message, and added an "If unsure, Y". Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- security/Kconfig | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/security/Kconfig b/security/Kconfig index 334d2e85fa7c..dc78671a5a70 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -30,14 +30,16 @@ config SECURITY model will be used. If you are unsure how to answer this question, answer N. + config KAISER bool "Remove the kernel mapping in user mode" default y - depends on X86_64 - depends on !PARAVIRT + depends on X86_64 && SMP && !PARAVIRT help - This enforces a strict kernel and user space isolation in order to close - hardware side channels on kernel address information. + This enforces a strict kernel and user space isolation, in order + to close hardware side channels on kernel address information. + + If you are unsure how to answer this question, answer Y. config KAISER_REAL_SWITCH bool "KAISER: actually switch page tables" -- cgit v1.2.3 From 19377944317f7d1f706d7da2c7cea5c4bcb7440b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 21 Sep 2017 20:39:56 -0700 Subject: kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER pjt has observed that nmi's second (nmi_from_kernel) call to do_nmi() adjusted the %rdi regs arg, rightly when CONFIG_KAISER, but wrongly when not CONFIG_KAISER. Although the minimal change is to add an #ifdef CONFIG_KAISER around the addq line, that looks cluttered, and I prefer how the first call to do_nmi() handled it: prepare args in %rdi and %rsi before getting into the CONFIG_KAISER block, since it does not touch them at all. And while we're here, place the "#ifdef CONFIG_KAISER" that follows each, to enclose the "Unconditionally restore CR3" comment: matching how the "Unconditionally use kernel CR3" comment above is enclosed. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2869fc178658..756dcbb41ccf 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1323,12 +1323,13 @@ ENTRY(nmi) movq %rax, %cr3 #endif call do_nmi + +#ifdef CONFIG_KAISER /* * Unconditionally restore CR3. I know we return to * kernel code that needs user CR3, but do we ever return * to "user mode" where we need the kernel CR3? */ -#ifdef CONFIG_KAISER popq %rax mov %rax, %cr3 #endif @@ -1552,6 +1553,8 @@ end_repeat_nmi: SWAPGS xorl %ebx, %ebx 1: + movq %rsp, %rdi + movq $-1, %rsi #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ @@ -1564,16 +1567,14 @@ end_repeat_nmi: #endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp, %rdi - addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ - movq $-1, %rsi call do_nmi + +#ifdef CONFIG_KAISER /* * Unconditionally restore CR3. We might be returning to * kernel code that needs user CR3, like just just before * a sysret. */ -#ifdef CONFIG_KAISER popq %rax mov %rax, %cr3 #endif -- cgit v1.2.3 From f881e626849c62641a91a13cb5344908c6613d11 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Aug 2017 14:21:14 -0700 Subject: kaiser: fix perf crashes Avoid perf crashes: place debug_store in the user-mapped per-cpu area instead of allocating, and use page allocator plus kaiser_add_mapping() to keep the BTS and PEBS buffers user-mapped (that is, present in the user mapping, though visible only to kernel and hardware). The PEBS fixup buffer does not need this treatment. The need for a user-mapped struct debug_store showed up before doing any conscious perf testing: in a couple of kernel paging oopses on Westmere, implicating the debug_store offset of the per-cpu area. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/intel/ds.c | 57 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9dfeeeca0ea8..51170f69682d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2,11 +2,15 @@ #include #include +#include #include #include #include "../perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_KAISER + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_KAISER + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree(buffer, x86_pmu.pebs_buffer_size); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, + x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; } @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) -- cgit v1.2.3 From f43f386f0bf0380fad5483be3ac678eeee934e95 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:48:02 -0700 Subject: kaiser: ENOMEM if kaiser_pagetable_walk() NULL kaiser_add_user_map() took no notice when kaiser_pagetable_walk() failed. And avoid its might_sleep() when atomic (though atomic at present unused). Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8d6061c2bc79..ba6fc2c88c82 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -98,11 +98,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - might_sleep(); if (is_atomic) { gfp &= ~GFP_KERNEL; gfp |= __GFP_HIGH | __GFP_ATOMIC; - } + } else + might_sleep(); if (pgd_none(*pgd)) { WARN_ONCE(1, "All shadow pgds should have been populated"); @@ -159,13 +159,17 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long target_address; - for (;address < end_addr; address += PAGE_SIZE) { + for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); if (target_address == -1) { ret = -EIO; break; } pte = kaiser_pagetable_walk(address, false); + if (!pte) { + ret = -ENOMEM; + break; + } if (pte_none(*pte)) { set_pte(pte, __pte(flags | target_address)); } else { -- cgit v1.2.3 From 67fab0d4acb3556134127ac285b625f715b12ca1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:18:07 -0700 Subject: kaiser: tidied up asm/kaiser.h somewhat Mainly deleting a surfeit of blank lines, and reflowing header comment. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 0703f48777f3..7394ba9f9951 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,15 +1,17 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H - -/* This file includes the definitions for the KAISER feature. - * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. - * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, - * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, - * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. - * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. * - * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions - * of the user space, or the stacks. + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. */ #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER @@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm - .macro SWITCH_USER_CR3_NO_STACK - movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) _SWITCH_TO_USER_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax - .endm #else /* CONFIG_KAISER */ @@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax #else /* __ASSEMBLY__ */ - #ifdef CONFIG_KAISER /* * Upon kernel/user mode switch, it may happen that the address @@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax * stored. To change the address space, another register is * needed. A register therefore has to be stored/restored. */ - DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** @@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); */ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - /** * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range @@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned l extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * kaiser_initialize_mapping - Initalize the shadow mapping + * kaiser_init - Initialize the shadow mapping * * Most parts of the shadow mapping can be mapped upon boot * time. Only per-process things like the thread stacks * or a new LDT have to be mapped at runtime. These boot- - * time mappings are permanent and nevertunmapped. + * time mappings are permanent and never unmapped. */ extern void kaiser_init(void); @@ -117,6 +113,4 @@ extern void kaiser_init(void); #endif /* __ASSEMBLY */ - - #endif /* _ASM_X86_KAISER_H */ -- cgit v1.2.3 From be6bf01f4caa523433030a2267df57d6afefa53d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:23:08 -0700 Subject: kaiser: tidied up kaiser_add/remove_mapping slightly Yes, unmap_pud_range_nofree()'s declaration ought to be in a header file really, but I'm not sure we want to use it anyway: so for now just declare it inside kaiser_remove_mapping(). And there doesn't seem to be such a thing as unmap_p4d_range(), even in a 5-level paging tree. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index ba6fc2c88c82..7a7e850b8381 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -285,8 +285,7 @@ void __init kaiser_init(void) __PAGE_KERNEL); } -extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); -// add a mapping to the shadow-mapping, and synchronize the mappings +/* Add a mapping to the shadow mapping, and synchronize the mappings */ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { return kaiser_add_user_map((const void *)addr, size, flags); @@ -294,15 +293,13 @@ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long fla void kaiser_remove_mapping(unsigned long start, unsigned long size) { + extern void unmap_pud_range_nofree(pgd_t *pgd, + unsigned long start, unsigned long end); unsigned long end = start + size; unsigned long addr; for (addr = start; addr < end; addr += PGDIR_SIZE) { pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); - /* - * unmap_p4d_range() handles > P4D_SIZE unmaps, - * so no need to trim 'end'. - */ unmap_pud_range_nofree(pgd, addr, end); } } -- cgit v1.2.3 From 604db49610853bffcc65029886688ef151469fa8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:51:10 -0700 Subject: kaiser: align addition to x86/mm/Makefile Use tab not space so they line up properly, kaslr.o also. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 682c162333ba..c505569e3835 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -obj-$(CONFIG_KAISER) += kaiser.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_KAISER) += kaiser.o -- cgit v1.2.3 From 61b7a404fa132ee0ffad04b0018b8d01ba5cf026 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Aug 2017 20:11:43 -0700 Subject: kaiser: cleanups while trying for gold link While trying to get our gold link to work, four cleanups: matched the gdt_page declaration to its definition; in fiddling unsuccessfully with PERCPU_INPUT(), lined up backslashes; lined up the backslashes according to convention in percpu-defs.h; deleted the unused irq_stack_pointer addition to irq_stack_union. Sad to report that aligning backslashes does not appear to help gold align to 8192: but while these did not help, they are worth keeping. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/processor.h | 5 ----- include/asm-generic/vmlinux.lds.h | 18 ++++++++---------- include/linux/percpu-defs.h | 22 +++++++++++----------- 4 files changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 12080d87da3b..2ed5a2b3f8f7 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -43,7 +43,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3d4784e2f862..8cb52ee3ade6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -335,11 +335,6 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; - - struct { - char irq_stack_pointer[64]; - char unused[IRQ_STACK_SIZE - 64]; - }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index d6ab14414776..2e6000a4eb2c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -778,16 +778,14 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ - \ - VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ - *(.data..percpu..first) \ - . = ALIGN(cacheline); \ - *(.data..percpu..user_mapped) \ - *(.data..percpu..user_mapped..shared_aligned) \ - . = ALIGN(PAGE_SIZE); \ - *(.data..percpu..user_mapped..page_aligned) \ - VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ - \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ + *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8ea945f63a05..cfe13cb4ec63 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -121,10 +121,10 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "") -#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) -#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) /* @@ -156,11 +156,11 @@ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp -#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp -#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp @@ -185,18 +185,18 @@ /* * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. */ -#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ - __aligned(PAGE_SIZE) +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) -#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ - __aligned(PAGE_SIZE) +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) /* * Declaration/definition used for per-CPU variables that must be read mostly. */ -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ -- cgit v1.2.3 From c27cdea56c546d0a6aa9114d9f5166404f3f14dd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 17:31:18 -0700 Subject: kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET There's a 0x1000 in various places, which looks better with a name. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/kaiser.h | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 756dcbb41ccf..299375379b05 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1318,7 +1318,7 @@ ENTRY(nmi) movq %cr3, %rax pushq %rax #ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), %rax + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax #endif movq %rax, %cr3 #endif @@ -1561,7 +1561,7 @@ end_repeat_nmi: movq %cr3, %rax pushq %rax #ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), %rax + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax #endif movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 7394ba9f9951..051acf678cda 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -13,13 +13,16 @@ * A minimalistic kernel mapping holds the parts needed to be mapped in user * mode, such as the entry/exit functions of the user space, or the stacks. */ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -andq $(~0x1000), \reg +andq $(~KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm @@ -27,7 +30,7 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -orq $(0x1000), \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm -- cgit v1.2.3 From 1ce27de4011e57460d454f9bedd301e5b7757e68 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:30:43 -0700 Subject: kaiser: delete KAISER_REAL_SWITCH option We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be left over from early development, and now just obscures tricky parts of the code. Delete it before adding PCIDs, or nokaiser boot option. (Or if there is some good reason to keep the option, then it needs a help text - and a "depends on KAISER", so that all those without KAISER are not asked the question. But we'd much rather delete it.) Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 4 ---- arch/x86/include/asm/kaiser.h | 4 ---- security/Kconfig | 4 ---- 3 files changed, 12 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 299375379b05..4214ac1f1c0f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1317,9 +1317,7 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), %rax -#endif movq %rax, %cr3 #endif call do_nmi @@ -1560,9 +1558,7 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), %rax -#endif movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 051acf678cda..e0fc45e77aee 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -21,17 +21,13 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH orq $(KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm diff --git a/security/Kconfig b/security/Kconfig index dc78671a5a70..d8ae933471ad 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -41,10 +41,6 @@ config KAISER If you are unsure how to answer this question, answer Y. -config KAISER_REAL_SWITCH - bool "KAISER: actually switch page tables" - default y - config SECURITYFS bool "Enable the securityfs filesystem" help -- cgit v1.2.3 From 1972bb9d92066fcf7deb0d798e02c88caa45e035 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 21:27:32 -0700 Subject: kaiser: vmstat show NR_KAISERTABLE as nr_overhead The kaiser update made an interesting choice, never to free any shadow page tables. Contention on global spinlock was worrying, particularly with it held across page table scans when freeing. Something had to be done: I was going to add refcounting; but simply never to free them is an appealing choice, minimizing contention without complicating the code (the more a page table is found already, the less the spinlock is used). But leaking pages in this way is also a worry: can we get away with it? At the very least, we need a count to show how bad it actually gets: in principle, one might end up wasting about 1/256 of memory that way (1/512 for when direct-mapped pages have to be user-mapped, plus 1/512 for when they are user-mapped from the vmalloc area on another occasion (but we don't have vmalloc'ed stacks, so only large ldts are vmalloc'ed). Add per-cpu stat NR_KAISERTABLE: including 256 at startup for the shared pgd entries, and 1 for each intermediate page table added thereafter for user-mapping - but leave out the 1 per mm, for its shadow pgd, because that distracts from the monotonic increase. Shown in /proc/vmstat as nr_overhead (0 if kaiser not enabled). In practice, it doesn't look so bad so far: more like 1/12000 after nine hours of gtests below; and movable pageblock segregation should tend to cluster the kaiser tables into a subset of the address space (if not, they will be bad for compaction too). But production may tell a different story: keep an eye on this number, and bring back lighter freeing if it gets out of control (maybe a shrinker). ["nr_overhead" should of course say "nr_kaisertable", if it needs to stay; but for the moment we are being coy, preferring that when Joe Blow notices a new line in his /proc/vmstat, he does not get too curious about what this "kaiser" stuff might be.] Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 16 +++++++++++----- include/linux/mmzone.h | 3 ++- mm/vmstat.c | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 7a7e850b8381..bd22ef51aa2d 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -121,9 +121,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pmd_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pud_none(*pud)) + if (pud_none(*pud)) { set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else free_page(new_pmd_page); spin_unlock(&shadow_table_allocation_lock); } @@ -139,9 +141,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pte_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else free_page(new_pte_page); spin_unlock(&shadow_table_allocation_lock); } @@ -205,11 +209,13 @@ static void __init kaiser_init_all_pgds(void) pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { pgd_t new_pgd; - pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); if (!pud) { WARN_ON(1); break; } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); /* * Make sure not to stomp on some other pgd entry. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fff21a82780c..490f5a83f947 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -124,8 +124,9 @@ enum zone_stat_item { NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ - /* Second 128 byte cacheline */ + NR_KAISERTABLE, NR_BOUNCE, + /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 604f26a4f696..6a088df04b29 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -932,6 +932,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", + "nr_overhead", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", -- cgit v1.2.3 From 2684b12a169ee244ffc05d34234b0a3dec238c40 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: enhanced by kernel and user PCIDs Merged performance improvements to Kaiser, using distinct kernel and user Process Context Identifiers to minimize the TLB flushing. [This work actually all from Dave Hansen 2017-08-30: still omitting trackswitch mods, and KAISER_REAL_SWITCH deleted.] Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 10 ++++-- arch/x86/entry/entry_64_compat.S | 1 + arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/kaiser.h | 15 +++++++-- arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++++ arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++----- arch/x86/include/uapi/asm/processor-flags.h | 3 +- arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++++ arch/x86/kvm/x86.c | 3 +- arch/x86/mm/kaiser.c | 7 ++++ arch/x86/mm/tlb.c | 46 +++++++++++++++++++++++-- 11 files changed, 181 insertions(+), 17 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4214ac1f1c0f..01ff191830f4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1317,7 +1317,10 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + /* Add back kernel PCID and "no flush" bit */ + orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif call do_nmi @@ -1558,7 +1561,10 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + /* Add back kernel PCID and "no flush" bit */ + orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index f0e384ee8fc6..0eb5801947ff 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ed10b5bf9b93..dc508830d0a1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -189,6 +189,7 @@ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index e0fc45e77aee..360ff3bc44a9 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,5 +1,8 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H + +#include /* For PCID constants */ + /* * This file includes the definitions for the KAISER feature. * KAISER is a counter measure against x86_64 side channel attacks on @@ -21,13 +24,21 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -andq $(~KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +/* + * This can obviously be one instruction by putting the + * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. + * But, just leave it now for simplicity. + */ +orq X86_CR3_PCID_USER_VAR, \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg movq \reg, %cr3 .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 8bc8d02fb4b1..ada77fdc283f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -141,6 +141,32 @@ _PAGE_SOFT_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7d2ea6b1f7d9..5d0c0b504729 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,7 +13,6 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; - /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -134,14 +133,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); + native_write_cr3(native_read_cr3()); + preempt_enable(); + return; + } /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * We are no longer using globals with KAISER, so a + * "nonglobals" flush would work too. But, this is more + * conservative. + * + * Note, this works with CR4.PCIDE=0 or 1. */ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -163,6 +173,8 @@ static inline void __native_flush_tlb_global(void) /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -182,7 +194,31 @@ static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invpcid in the case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 567de50a4c2a..6768d1321016 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -77,7 +77,8 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3efde13eaed0..b4c0ae5495f3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -324,11 +324,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these + * instead. + * + * This is also handy because systems that do not support + * PCIDs just end up or'ing a 0 into their CR3, which does + * no harm. + */ +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; + static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { cr4_set_bits(X86_CR4_PCIDE); + /* + * These variables are used by the entry/exit + * code to change PCIDs. + */ +#ifdef CONFIG_KAISER + X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; + X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; +#endif + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. + * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above cr4_set_bits(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e28e6c877d9..73304b1a03cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || + !is_long_mode(vcpu)) return 1; } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index bd22ef51aa2d..f5c75f73bb71 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -239,6 +239,8 @@ static void __init kaiser_init_all_pgds(void) } while (0) extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern unsigned long X86_CR3_PCID_KERN_VAR; +extern unsigned long X86_CR3_PCID_USER_VAR; /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -289,6 +291,11 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); + + kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, + __PAGE_KERNEL); + kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, + __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 53b72fb4e781..43ce5d316eae 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -34,6 +34,46 @@ struct flush_tlb_info { unsigned long flush_end; }; +static void load_new_mm_cr3(pgd_t *pgdir) +{ + unsigned long new_mm_cr3 = __pa(pgdir); + + /* + * KAISER, plus PCIDs needs some extra work here. But, + * if either of features is not present, we need no + * PCIDs here and just do a normal, full TLB flush with + * the write_cr3() + */ + if (!IS_ENABLED(CONFIG_KAISER) || + !cpu_feature_enabled(X86_FEATURE_PCID)) + goto out_set_cr3; + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entires for the PCID out when we change + * tasks. + */ + new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); + + /* + * The flush from load_cr3() may leave old TLB entries + * for userspace in place. We must flush that context + * separately. We can theoretically delay doing this + * until we actually load up the userspace CR3, but + * that's a bit tricky. We have to have the "need to + * flush userspace PCID" bit per-cpu and check it in the + * exit-to-userspace paths. + */ + invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); + +out_set_cr3: + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -45,7 +85,7 @@ void leave_mm(int cpu) BUG(); if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); /* * This gets called in the idle path where RCU * functions differently. Tracing normally @@ -120,7 +160,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * ordering guarantee we need. * */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); @@ -167,7 +207,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * As above, load_cr3() is serializing and orders TLB * fills with respect to the mm_cpumask write. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); load_mm_cr4(next); load_mm_ldt(next); -- cgit v1.2.3 From 0b5ca9d99599087971a3cd7634a0b61d4e2653e3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 17 Aug 2017 15:00:37 -0700 Subject: kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user We have many machines (Westmere, Sandybridge, Ivybridge) supporting PCID but not INVPCID: on these load_new_mm_cr3() simply crashed. Flushing user context inside load_new_mm_cr3() without the use of invpcid is difficult: momentarily switch from kernel to user context and back to do so? I'm not sure whether that can be safely done at all, and would risk polluting user context with kernel internals, and kernel context with stale user externals. Instead, follow the hint in the comment that was there: change X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3() can leave a note in it, for SWITCH_USER_CR3 on return to userspace to flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH. Which works well enough that there's no need to do it this way only when invpcid is unsupported: it's a good alternative to invpcid here. But there's a couple of inlines in asm/tlbflush.h that need to do the same trick, so it's best to localize all this per-cpu business in mm/kaiser.c: moving that part of the initialization from setup_pcid() to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit. I did try to make the feature tests in asm/tlbflush.h more consistent with each other: there seem to be far too many ways of performing such tests, and I don't have a good grasp of their differences. At first I converted them all to be static_cpu_has(): but that proved to be a mistake, as the comment in __native_flush_tlb_single() hints; so then I reversed and made them all this_cpu_has(). Probably all gratuitous change, but that's the way it's working at present. I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR gets re-initialized by each cpu (before and after these changes): no problem when (as usual) all cpus on a machine have the same features, but in principle incorrect. However, my experiment to per-cpu-ify that one did not end well... Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 18 +++++++------ arch/x86/include/asm/tlbflush.h | 56 ++++++++++++++++++++++++++++------------- arch/x86/kernel/cpu/common.c | 22 +--------------- arch/x86/mm/kaiser.c | 50 +++++++++++++++++++++++++++++++----- arch/x86/mm/tlb.c | 46 +++++++++++++-------------------- 5 files changed, 113 insertions(+), 79 deletions(-) diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 360ff3bc44a9..009bca514c20 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -32,13 +32,12 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -/* - * This can obviously be one instruction by putting the - * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. - * But, just leave it now for simplicity. - */ -orq X86_CR3_PCID_USER_VAR, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +js 9f +// FLUSH this time, reset to NOFLUSH for next time +// But if nopcid? Consider using 0x80 for user pcid? +movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +9: movq \reg, %cr3 .endm @@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +extern unsigned long X86_CR3_PCID_KERN_VAR; +DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5d0c0b504729..01584d488754 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,6 +13,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; + /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -131,27 +132,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. + */ +#ifdef CONFIG_KAISER +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { - if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * Note, this works with CR4.PCIDE=0 or 1. */ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all_nonglobals(); return; } + /* - * We are no longer using globals with KAISER, so a - * "nonglobals" flush would work too. But, this is more - * conservative. - * - * Note, this works with CR4.PCIDE=0 or 1. + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: */ - invpcid_flush_all(); + preempt_disable(); + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -167,9 +183,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void) static inline void __native_flush_tlb_global(void) { +#ifdef CONFIG_KAISER + /* Globals are not used at all */ + __native_flush_tlb(); +#else unsigned long flags; - if (static_cpu_has(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. @@ -186,10 +206,9 @@ static inline void __native_flush_tlb_global(void) * be called from deep inside debugging code.) */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); - raw_local_irq_restore(flags); +#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -200,9 +219,12 @@ static inline void __native_flush_tlb_single(unsigned long addr) * * The ASIDs used below are hard-coded. But, we must not * call invpcid(type=1/2) before CR4.PCIDE=1. Just call - * invpcid in the case we are called early. + * invlpg in the case we are called early. */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b4c0ae5495f3..e6be5f3b1259 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -324,32 +324,11 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } -/* - * These can have bit 63 set, so we can not just use a plain "or" - * instruction to get their value or'd into CR3. It would take - * another register. So, we use a memory reference to these - * instead. - * - * This is also handy because systems that do not support - * PCIDs just end up or'ing a 0 into their CR3, which does - * no harm. - */ -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; - static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { cr4_set_bits(X86_CR4_PCIDE); - /* - * These variables are used by the entry/exit - * code to change PCIDs. - */ -#ifdef CONFIG_KAISER - X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; - X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; -#endif /* * INVPCID has two "groups" of types: * 1/2: Invalidate an individual address @@ -375,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PCID); } } + kaiser_setup_pcid(); } /* diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index f5c75f73bb71..7056840e2739 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -11,12 +11,26 @@ #include #include +#include /* to verify its kaiser declarations */ #include #include #include + #ifdef CONFIG_KAISER +__visible +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. + * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. + */ +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; +DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); -__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /* * At runtime, the only things we map are some things for CPU * hotplug, and stacks for new processes. No two CPUs will ever @@ -238,9 +252,6 @@ static void __init kaiser_init_all_pgds(void) WARN_ON(__ret); \ } while (0) -extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -extern unsigned long X86_CR3_PCID_KERN_VAR; -extern unsigned long X86_CR3_PCID_USER_VAR; /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -294,8 +305,6 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, __PAGE_KERNEL); - kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, - __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ @@ -358,4 +367,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) } return pgd; } + +void kaiser_setup_pcid(void) +{ + unsigned long kern_cr3 = 0; + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) { + kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + } + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + X86_CR3_PCID_KERN_VAR = kern_cr3; + this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: + * if cpu does not, then the NOFLUSH bit will never have been set. + */ +void kaiser_flush_tlb_on_return_to_user(void) +{ + this_cpu_write(X86_CR3_PCID_USER_VAR, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); +} +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); #endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 43ce5d316eae..4c841c9c4d9c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,13 +6,14 @@ #include #include #include +#include #include #include #include #include #include -#include +#include /* * TLB flushing, formerly SMP-only @@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); - /* - * KAISER, plus PCIDs needs some extra work here. But, - * if either of features is not present, we need no - * PCIDs here and just do a normal, full TLB flush with - * the write_cr3() - */ - if (!IS_ENABLED(CONFIG_KAISER) || - !cpu_feature_enabled(X86_FEATURE_PCID)) - goto out_set_cr3; - /* - * We reuse the same PCID for different tasks, so we must - * flush all the entires for the PCID out when we change - * tasks. - */ - new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); - - /* - * The flush from load_cr3() may leave old TLB entries - * for userspace in place. We must flush that context - * separately. We can theoretically delay doing this - * until we actually load up the userspace CR3, but - * that's a bit tricky. We have to have the "need to - * flush userspace PCID" bit per-cpu and check it in the - * exit-to-userspace paths. - */ - invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); +#ifdef CONFIG_KAISER + if (this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. + * Flush KERN below, flush USER when returning to userspace in + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. + * + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + */ + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +#endif /* CONFIG_KAISER */ -out_set_cr3: /* * Caution: many callers of this function expect * that load_cr3() is serializing and orders TLB -- cgit v1.2.3 From 6a2b4117614c5d4239d1120b08591b2d296f4f53 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 8 Sep 2017 19:26:30 -0700 Subject: kaiser: PCID 0 for kernel and 128 for user Why was 4 chosen for kernel PCID and 6 for user PCID? No good reason in a backport where PCIDs are only used for Kaiser. If we continue with those, then we shall need to add Andy Lutomirski's 4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()"), which deals with the problem of read_cr3() callers finding stray bits in the cr3 that they expected to be page-aligned; and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64: Mask off CR3's PCID bits in the saved CR3"). But if 0 is used for kernel PCID, then there's no need to add in those commits - whenever the kernel looks, it sees 0 in the lower bits; and 0 for kernel seems an obvious choice. And I naughtily propose 128 for user PCID. Because there's a place in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH, but needs to reset that to NOFLUSH for the next occasion. Currently it does so with a "movb $(0x80)" into the high byte of the per-cpu quadword, but that will cause a machine without PCID support to crash. Now, if %al just happened to have 0x80 in it at that point, on a machine with PCID support, but 0 on a machine without PCID support... (That will go badly wrong once the pgd can be at a physical address above 2^56, but even with 5-level paging, physical goes up to 2^52.) Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 19 ++++++++++++------- arch/x86/include/asm/pgtable_types.h | 7 ++++--- arch/x86/mm/tlb.c | 3 +++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 009bca514c20..110a73e0572d 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -29,14 +29,19 @@ orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm -.macro _SWITCH_TO_USER_CR3 reg +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. + */ movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg js 9f -// FLUSH this time, reset to NOFLUSH for next time -// But if nopcid? Consider using 0x80 for user pcid? -movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) 9: movq \reg, %cr3 .endm @@ -49,7 +54,7 @@ popq %rax .macro SWITCH_USER_CR3 pushq %rax -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al popq %rax .endm @@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_USER_CR3_NO_STACK movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm @@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3 reg .endm -.macro SWITCH_USER_CR3 reg +.macro SWITCH_USER_CR3 reg regb .endm .macro SWITCH_USER_CR3_NO_STACK .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ada77fdc283f..7cf2883d7bc4 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -146,16 +146,17 @@ /* Mask for all the PCID-related bits in CR3: */ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + #if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) -#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) -#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) #define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) #define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) #else -#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) #define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) /* * PCIDs are unsupported on 32-bit and none of these bits can be diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4c841c9c4d9c..7759b650d9da 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -50,6 +50,9 @@ static void load_new_mm_cr3(pgd_t *pgdir) * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could * do it here, but can only be used if X86_FEATURE_INVPCID is * available - and many machines support pcid without invpcid. + * + * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; + * but keep that line in there in case something changes. */ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; kaiser_flush_tlb_on_return_to_user(); -- cgit v1.2.3 From d0142ceb7926b81738e1fc122b109f67128d2762 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Aug 2017 16:24:27 -0700 Subject: kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user Mostly this commit is just unshouting X86_CR3_PCID_KERN_VAR and X86_CR3_PCID_USER_VAR: we usually name variables in lower-case. But why does x86_cr3_pcid_noflush need to be __aligned(PAGE_SIZE)? Ah, it's a leftover from when kaiser_add_user_map() once complained about mapping the same page twice. Make it __read_mostly instead. (I'm a little uneasy about all the unrelated data which shares its page getting user-mapped too, but that was so before, and not a big deal: though we call it user-mapped, it's not mapped with _PAGE_USER.) And there is a little change around the two calls to do_nmi(). Previously they set the NOFLUSH bit (if PCID supported) when forcing to kernel context before do_nmi(); now they also have the NOFLUSH bit set (if PCID supported) when restoring context after: nothing done in do_nmi() should require a TLB to be flushed here. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/kaiser.h | 11 +++++------ arch/x86/mm/kaiser.c | 13 +++++++------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 01ff191830f4..c0e142c6c16c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1316,11 +1316,11 @@ ENTRY(nmi) /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - /* Add back kernel PCID and "no flush" bit */ - orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif call do_nmi @@ -1560,11 +1560,11 @@ end_repeat_nmi: /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - /* Add back kernel PCID and "no flush" bit */ - orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 110a73e0572d..48d8d70dd8c7 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,7 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq X86_CR3_PCID_KERN_VAR, \reg +orq x86_cr3_pcid_noflush, \reg movq \reg, %cr3 .endm @@ -37,11 +37,10 @@ movq \reg, %cr3 * not enabled): so that the one register can update both memory and cr3. */ movq %cr3, \reg -andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f /* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ -movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 .endm @@ -94,8 +93,8 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long X86_CR3_PCID_KERN_VAR; -DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); +extern unsigned long x86_cr3_pcid_noflush; +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 7056840e2739..fa1cb0990551 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -28,8 +28,8 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); * This is also handy because systems that do not support PCIDs * just end up or'ing a 0 into their CR3, which does no harm. */ -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; -DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); +unsigned long x86_cr3_pcid_noflush __read_mostly; +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); /* * At runtime, the only things we map are some things for CPU @@ -303,7 +303,8 @@ void __init kaiser_init(void) sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); - kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, + kaiser_add_user_map_early(&x86_cr3_pcid_noflush, + sizeof(x86_cr3_pcid_noflush), __PAGE_KERNEL); } @@ -381,8 +382,8 @@ void kaiser_setup_pcid(void) * These variables are used by the entry/exit * code to change PCID and pgd and TLB flushing. */ - X86_CR3_PCID_KERN_VAR = kern_cr3; - this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); + x86_cr3_pcid_noflush = kern_cr3; + this_cpu_write(x86_cr3_pcid_user, user_cr3); } /* @@ -392,7 +393,7 @@ void kaiser_setup_pcid(void) */ void kaiser_flush_tlb_on_return_to_user(void) { - this_cpu_write(X86_CR3_PCID_USER_VAR, + this_cpu_write(x86_cr3_pcid_user, X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); -- cgit v1.2.3 From 05ddad146d02b2cb05f8c06b2c3b3c365372a66c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 26 Sep 2017 18:43:07 -0700 Subject: kaiser: paranoid_entry pass cr3 need to paranoid_exit Neel Natu points out that paranoid_entry() was wrong to assume that an entry that did not need swapgs would not need SWITCH_KERNEL_CR3: paranoid_entry (used for debug breakpoint, int3, double fault or MCE; though I think it's only the MCE case that is cause for concern here) can break in at an awkward time, between cr3 switch and swapgs, but its handling always needs kernel gs and kernel cr3. Easy to fix in itself, but paranoid_entry() also needs to convey to paranoid_exit() (and my reading of macro idtentry says paranoid_entry and paranoid_exit are always paired) how to restore the prior state. The swapgs state is already conveyed by %ebx (0 or 1), so extend that also to convey when SWITCH_USER_CR3 will be needed (2 or 3). (Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other way round: and a convention shared with error_entry() and error_exit(), which I don't want to touch. Perhaps I should have inverted the bit for switch cr3 too, but did not.) paranoid_exit() would be straightforward, except for TRACE_IRQS: it did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG when not: which is it supposed to use when SWITCH_USER_CR3 is split apart from that? As best as I can determine, commit 5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep") missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG there too (the discrepancy has nothing to do with the liberal use of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has just been used in all cases); discrepancy lovingly preserved across several paranoid_exit() cleanups, but I'm now removing it. Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in paranoid_exit() is now not only unnecessary but unsafe: might corrupt syscall entry's unsafe_stack_register_backup of %rax. Just use SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether, before we make the mistake of using it again. hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs part of the series, and ought to be moved earlier, if you decided to make a release of Kaiser-without-PCIDs. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 46 +++++++++++++++++++++++++++++++--------- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/kaiser.h | 8 ------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c0e142c6c16c..34a172facf17 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1053,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit */ ENTRY(paranoid_entry) cld @@ -1065,9 +1069,26 @@ ENTRY(paranoid_entry) testl %edx, %edx js 1f /* negative -> in kernel */ SWAPGS - SWITCH_KERNEL_CR3 xorl %ebx, %ebx -1: ret +1: +#ifdef CONFIG_KAISER + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ + movq %cr3, %rax + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + orq x86_cr3_pcid_noflush, %rax + movq %rax, %cr3 +2: +#endif + ret END(paranoid_entry) /* @@ -1080,20 +1101,25 @@ END(paranoid_entry) * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs */ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ + TRACE_IRQS_IRETQ_DEBUG +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +paranoid_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs - TRACE_IRQS_IRETQ - SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore paranoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 0eb5801947ff..d76a97653980 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -343,7 +343,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON - SWITCH_USER_CR3_NO_STACK + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret END(entry_INT80_compat) diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 48d8d70dd8c7..3dc5f4c39b3e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm -.macro SWITCH_USER_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax %al -movq PER_CPU_VAR(unsafe_stack_register_backup), %rax -.endm - #else /* CONFIG_KAISER */ .macro SWITCH_KERNEL_CR3 reg .endm .macro SWITCH_USER_CR3 reg regb .endm -.macro SWITCH_USER_CR3_NO_STACK -.endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm -- cgit v1.2.3 From 3df146178706bf39cfcb22d7fe1a378af6dc5896 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 2 Oct 2017 10:57:24 -0700 Subject: kaiser: kaiser_remove_mapping() move along the pgd When removing the bogus comment from kaiser_remove_mapping(), I really ought to have checked the extent of its bogosity: as Neel points out, there is nothing to stop unmap_pud_range_nofree() from continuing beyond the end of a pud (and starting in the wrong position on the next). Fix kaiser_remove_mapping() to constrain the extent and advance pgd pointer correctly: use pgd_addr_end() macro as used throughout base mm (but don't assume page-rounded start and size in this case). But this bug was very unlikely to trigger in this backport: since any buddy allocation is contained within a single pud extent, and we are not using vmapped stacks (and are only mapping one page of stack anyway): the only way to hit this bug here would be when freeing a large modified ldt. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index fa1cb0990551..cc0950f756d5 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -319,11 +319,13 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); unsigned long end = start + size; - unsigned long addr; + unsigned long addr, next; + pgd_t *pgd; - for (addr = start; addr < end; addr += PGDIR_SIZE) { - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); - unmap_pud_range_nofree(pgd, addr, end); + pgd = native_get_shadow_pgd(pgd_offset_k(start)); + for (addr = start; addr < end; pgd++, addr = next) { + next = pgd_addr_end(addr, end); + unmap_pud_range_nofree(pgd, addr, next); } } -- cgit v1.2.3 From cb7d8d7e6737cad77d5eaba9f9c95f3322706198 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 4 Dec 2017 20:13:35 -0800 Subject: kaiser: fix unlikely error in alloc_ldt_struct() An error from kaiser_add_mapping() here is not at all likely, but Eric Biggers rightly points out that __free_ldt_struct() relies on new_ldt->size being initialized: move that up. Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/ldt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 567bdbd7d917..8bc68cfc0d33 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -79,11 +79,11 @@ static struct ldt_struct *alloc_ldt_struct(int size) ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, __PAGE_KERNEL); + new_ldt->size = size; if (ret) { __free_ldt_struct(new_ldt); return NULL; } - new_ldt->size = size; return new_ldt; } -- cgit v1.2.3 From 23e09439aa460e4e194c8d2542a16977ee8ed142 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 24 Sep 2017 16:59:49 -0700 Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE Added "nokaiser" boot option: an early param like "noinvpcid". Most places now check int kaiser_enabled (#defined 0 when not CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S and entry_64_compat.S are using the ALTERNATIVE technique, which patches in the preferred instructions at runtime. That technique is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL won't get set in some obscure corner, or something add PGE into CR4. By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, all page table setup which uses pte_pfn() masks it out of the ptes. It's slightly shameful that the same declaration versus definition of kaiser_enabled appears in not one, not two, but in three header files (asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, than with #including any of those in any of the others; and did not feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes them all, so we shall hear about it if they get out of synch. Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER from kaiser.c; removed the unused native_get_normal_pgd(); removed the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some comments. But more interestingly, set CR4.PSE in secondary_startup_64: the manual is clear that it does not matter whether it's 0 or 1 when 4-level-pts are enabled, but I was distracted to find cr4 different on BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 2 ++ arch/x86/entry/entry_64.S | 15 ++++++------ arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++------ arch/x86/include/asm/pgtable.h | 20 +++++++++++----- arch/x86/include/asm/pgtable_64.h | 13 ++++------- arch/x86/include/asm/pgtable_types.h | 4 ---- arch/x86/include/asm/tlbflush.h | 39 ++++++++++++++++++++------------ arch/x86/kernel/cpu/common.c | 28 ++++++++++++++++++++++- arch/x86/kernel/espfix_64.c | 3 ++- arch/x86/kernel/head_64.S | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/mm/init_64.c | 10 ++++++++ arch/x86/mm/kaiser.c | 26 +++++++++++++++++---- arch/x86/mm/pgtable.c | 8 ++----- arch/x86/mm/tlb.c | 4 +--- tools/arch/x86/include/asm/cpufeatures.h | 3 +++ 17 files changed, 146 insertions(+), 65 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 152ec4e87b57..887f0eb7943d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nojitter [IA-64] Disables jitter checking for ITC timers. + nokaiser [X86-64] Disable KAISER isolation of kernel from user. + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 34a172facf17..4e5c41005dd0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry) * unconditionally, but we need to find out whether the reverse * should be done on return (conveyed to paranoid_exit in %ebx). */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER testl $KAISER_SHADOW_PGD_OFFSET, %eax jz 2f orl $2, %ebx @@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit) TRACE_IRQS_OFF_DEBUG TRACE_IRQS_IRETQ_DEBUG #ifdef CONFIG_KAISER + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ jz paranoid_exit_no_switch SWITCH_USER_CR3 @@ -1341,13 +1342,14 @@ ENTRY(nmi) #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif call do_nmi @@ -1357,8 +1359,7 @@ ENTRY(nmi) * kernel code that needs user CR3, but do we ever return * to "user mode" where we need the kernel CR3? */ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif /* @@ -1585,13 +1586,14 @@ end_repeat_nmi: #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ @@ -1603,8 +1605,7 @@ end_repeat_nmi: * kernel code that needs user CR3, like just just before * a sysret. */ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif testl %ebx, %ebx /* swapgs needed? */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dc508830d0a1..20271d6b0eb9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -198,6 +198,9 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 3dc5f4c39b3e..96643a9c194c 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -46,28 +46,33 @@ movq \reg, %cr3 .endm .macro SWITCH_KERNEL_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax popq %rax +8: .endm .macro SWITCH_USER_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_USER_CR3 %rax %al popq %rax +8: .endm .macro SWITCH_KERNEL_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +ALTERNATIVE "jmp 8f", \ + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ + X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +8: .endm #else /* CONFIG_KAISER */ -.macro SWITCH_KERNEL_CR3 reg +.macro SWITCH_KERNEL_CR3 .endm -.macro SWITCH_USER_CR3 reg regb +.macro SWITCH_USER_CR3 .endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm @@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif /* CONFIG_KAISER */ + +/* + * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. + */ + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range @@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); */ extern void kaiser_init(void); -#endif /* CONFIG_KAISER */ - #endif /* __ASSEMBLY */ #endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cee98e182b7..217e83a657e2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,6 +18,12 @@ #ifndef __ASSEMBLY__ #include +#ifdef CONFIG_KAISER +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd) * page table by accident; it will fault on the first * instruction it tries to run. See native_set_pgd(). */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - /* Clone the shadow pgd part as well */ - memcpy(native_get_shadow_pgd(dst), - native_get_shadow_pgd(src), - count * sizeof(pgd_t)); + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); + } #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 177caf3f7ab1..cf68b5c1cb74 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { +#ifdef CONFIG_DEBUG_VM + /* linux/mmdebug.h may not have been included at this point */ + BUG_ON(!kaiser_enabled); +#endif return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } - -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); -} #else static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { @@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return pgdp; -} #endif /* CONFIG_KAISER */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7cf2883d7bc4..f0d9a1aa3f68 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -45,11 +45,7 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#ifdef CONFIG_KAISER -#define _PAGE_GLOBAL (_AT(pteval_t, 0)) -#else #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 01584d488754..bdd69a3158dd 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -137,9 +137,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * to avoid the need for asm/kaiser.h in unexpected places. */ #ifdef CONFIG_KAISER +extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); #else +#define kaiser_enabled 0 static inline void kaiser_setup_pcid(void) { } @@ -164,7 +166,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -175,20 +177,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void) unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + if (cr4 & X86_CR4_PGE) { + /* clear PGE and flush TLB of all entries */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { + /* + * x86_64 microcode update comes this way when CR4.PGE is not + * enabled, and it's safer for all callers to allow this case. + */ + native_write_cr3(native_read_cr3()); + } } static inline void __native_flush_tlb_global(void) { -#ifdef CONFIG_KAISER - /* Globals are not used at all */ - __native_flush_tlb(); -#else unsigned long flags; + if (kaiser_enabled) { + /* Globals are not used at all */ + __native_flush_tlb(); + return; + } + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -208,7 +220,6 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_save(flags); __native_flush_tlb_global_irq_disabled(); raw_local_irq_restore(flags); -#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -223,7 +234,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; @@ -238,9 +249,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) * Make sure to do only a single invpcid when KAISER is * disabled and we have only a single ASID. */ - if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); - invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + if (kaiser_enabled) + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e6be5f3b1259..8b03874396b9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s) return 1; } __setup("nopcid", x86_pcid_setup); + +static int __init x86_nokaiser_setup(char *s) +{ + /* nokaiser doesn't accept parameters */ + if (s) + return -EINVAL; +#ifdef CONFIG_KAISER + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); + pr_info("nokaiser: KAISER feature disabled\n"); +#endif + return 0; +} +early_param("nokaiser", x86_nokaiser_setup); #endif static int __init x86_noinvpcid_setup(char *s) @@ -327,7 +341,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { - if (cpu_has(c, X86_FEATURE_PGE)) { + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { cr4_set_bits(X86_CR4_PCIDE); /* * INVPCID has two "groups" of types: @@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); +#ifdef CONFIG_KAISER + if (kaiser_enabled) + set_cpu_cap(c, X86_FEATURE_KAISER); +#endif } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1537,6 +1555,14 @@ void cpu_init(void) * try to read it. */ cr4_init_shadow(); + if (!kaiser_enabled) { + /* + * secondary_startup_64() deferred setting PGE in cr4: + * probe_page_size_mask() sets it on the boot cpu, + * but it needs to be set on each secondary cpu. + */ + cr4_set_bits(X86_CR4_PGE); + } /* * Load microcode on this cpu if a valid microcode is available. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 560c2fd73b28..e33b38541be3 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) * area to ensure it is mapped into the shadow user page * tables. */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) { set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + } /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5775379237f7..d04479b4a8ee 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 0381638168d1..1e779bca4f3e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (boot_cpu_has(X86_FEATURE_PGE)) { + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3e27ded6ac65..7df8e3a79dc0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -324,6 +324,16 @@ void __init cleanup_highmap(void) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); + else if (kaiser_enabled) { + /* + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: + * clear that now. This is not important, so long as + * CR4.PGE remains clear, but it removes an anomaly. + * Physical mapping setup below avoids _PAGE_GLOBAL + * by use of massage_pgprot() inside pfn_pte() etc. + */ + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); + } } } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index cc0950f756d5..11032dcf9af4 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -16,7 +16,9 @@ #include #include -#ifdef CONFIG_KAISER +int kaiser_enabled __read_mostly = 1; +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ + __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); @@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) return pte_offset_kernel(pmd, address); } -int kaiser_add_user_map(const void *__start_addr, unsigned long size, - unsigned long flags) +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) { int ret = 0; pte_t *pte; @@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long target_address; + /* + * It is convenient for callers to pass in __PAGE_KERNEL etc, + * and there is no actual harm from setting _PAGE_GLOBAL, so + * long as CR4.PGE is not set. But it is nonetheless troubling + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" + * requires that not to be #defined to 0): so mask it off here. + */ + flags &= ~_PAGE_GLOBAL; + for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); if (target_address == -1) { @@ -263,6 +274,8 @@ void __init kaiser_init(void) { int cpu; + if (!kaiser_enabled) + return; kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { @@ -311,6 +324,8 @@ void __init kaiser_init(void) /* Add a mapping to the shadow mapping, and synchronize the mappings */ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { + if (!kaiser_enabled) + return 0; return kaiser_add_user_map((const void *)addr, size, flags); } @@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) unsigned long addr, next; pgd_t *pgd; + if (!kaiser_enabled) + return; pgd = native_get_shadow_pgd(pgd_offset_k(start)); for (addr = start; addr < end; pgd++, addr = next) { next = pgd_addr_end(addr, end); @@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { + if (!kaiser_enabled) + return pgd; /* * Do we need to also populate the shadow pgd? Check _PAGE_USER to * skip cases like kexec and EFI which make temporary low mappings. @@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); -#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 352fd015bc63..5aaec8effc5f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd) } #else -#ifdef CONFIG_KAISER /* - * Instead of one pmd, we aquire two pmds. Being order-1, it is + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 * in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +#define PGD_ALLOCATION_ORDER kaiser_enabled static inline pgd_t *_pgd_alloc(void) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 7759b650d9da..e7934aed8d67 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); -#ifdef CONFIG_KAISER - if (this_cpu_has(X86_FEATURE_PCID)) { + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { /* * We reuse the same PCID for different tasks, so we must * flush all the entries for the PCID out when we change tasks. @@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; kaiser_flush_tlb_on_return_to_user(); } -#endif /* CONFIG_KAISER */ /* * Caution: many callers of this function expect diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index a39629206864..67c93d9f5ccd 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -197,6 +197,9 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -- cgit v1.2.3 From 50624dd12d6df936f80cb9f02e9a0e40197b5d91 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Jan 2018 14:19:48 +0100 Subject: x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti". Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/cpu/common.c | 18 ------------------ arch/x86/mm/kaiser.c | 20 +++++++++++++++++++- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 887f0eb7943d..f8c9d9168ca3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2763,7 +2763,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nojitter [IA-64] Disables jitter checking for ITC timers. - nokaiser [X86-64] Disable KAISER isolation of kernel from user. + nopti [X86-64] Disable KAISER isolation of kernel from user. no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8b03874396b9..918e44772b04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -179,20 +179,6 @@ static int __init x86_pcid_setup(char *s) return 1; } __setup("nopcid", x86_pcid_setup); - -static int __init x86_nokaiser_setup(char *s) -{ - /* nokaiser doesn't accept parameters */ - if (s) - return -EINVAL; -#ifdef CONFIG_KAISER - kaiser_enabled = 0; - setup_clear_cpu_cap(X86_FEATURE_KAISER); - pr_info("nokaiser: KAISER feature disabled\n"); -#endif - return 0; -} -early_param("nokaiser", x86_nokaiser_setup); #endif static int __init x86_noinvpcid_setup(char *s) @@ -813,10 +799,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); -#ifdef CONFIG_KAISER - if (kaiser_enabled) - set_cpu_cap(c, X86_FEATURE_KAISER); -#endif } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 11032dcf9af4..87cae72dccb0 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -274,8 +274,13 @@ void __init kaiser_init(void) { int cpu; - if (!kaiser_enabled) + if (!kaiser_enabled) { + setup_clear_cpu_cap(X86_FEATURE_KAISER); return; + } + + setup_force_cpu_cap(X86_FEATURE_KAISER); + kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { @@ -418,3 +423,16 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); + +static int __init x86_nokaiser_setup(char *s) +{ + /* nopti doesn't accept parameters */ + if (s) + return -EINVAL; + + kaiser_enabled = 0; + pr_info("Kernel/User page tables isolation: disabled\n"); + + return 0; +} +early_param("nopti", x86_nokaiser_setup); -- cgit v1.2.3 From 8018307a45a90ab2eecfd03d48b7efb31707df37 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Jan 2018 14:19:48 +0100 Subject: x86/kaiser: Check boottime cmdline params AMD (and possibly other vendors) are not affected by the leak KAISER is protecting against. Keep the "nopti" for traditional reasons and add pti= like upstream. Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 6 ++++ arch/x86/mm/kaiser.c | 59 ++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f8c9d9168ca3..5d2676d043de 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3327,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. pt. [PARIDE] See Documentation/blockdev/paride.txt. + pti= [X86_64] + Control KAISER user/kernel address space isolation: + on - enable + off - disable + auto - default setting + pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in default number. diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 87cae72dccb0..1840aa0a052d 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -15,6 +15,7 @@ #include #include #include +#include int kaiser_enabled __read_mostly = 1; EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ @@ -263,6 +264,43 @@ static void __init kaiser_init_all_pgds(void) WARN_ON(__ret); \ } while (0) +void __init kaiser_check_boottime_disable(void) +{ + bool enable = true; + char arg[5]; + int ret; + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (!strncmp(arg, "on", 2)) + goto enable; + + if (!strncmp(arg, "off", 3)) + goto disable; + + if (!strncmp(arg, "auto", 4)) + goto skip; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) + goto disable; + +skip: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + goto disable; + +enable: + if (enable) + setup_force_cpu_cap(X86_FEATURE_KAISER); + + return; + +disable: + pr_info("Kernel/User page tables isolation: disabled\n"); + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); +} + /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -274,12 +312,10 @@ void __init kaiser_init(void) { int cpu; - if (!kaiser_enabled) { - setup_clear_cpu_cap(X86_FEATURE_KAISER); - return; - } + kaiser_check_boottime_disable(); - setup_force_cpu_cap(X86_FEATURE_KAISER); + if (!kaiser_enabled) + return; kaiser_init_all_pgds(); @@ -423,16 +459,3 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); - -static int __init x86_nokaiser_setup(char *s) -{ - /* nopti doesn't accept parameters */ - if (s) - return -EINVAL; - - kaiser_enabled = 0; - pr_info("Kernel/User page tables isolation: disabled\n"); - - return 0; -} -early_param("nopti", x86_nokaiser_setup); -- cgit v1.2.3 From 169b369f99affebb24db16f4fed58a2f8ff4060f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2017 20:49:04 -0700 Subject: kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush Now that we're playing the ALTERNATIVE game, use that more efficient method: instead of user-mapping an extra page, and reading an extra cacheline each time for x86_cr3_pcid_noflush. Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax) is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs; but the one line with $63 in looks clearer, so let's stick with that. Worried about what happens with an ALTERNATIVE between the jump and jump label in another ALTERNATIVE? I was, but have checked the combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64, and it does a good job. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 7 ++++--- arch/x86/include/asm/kaiser.h | 6 +++--- arch/x86/mm/kaiser.c | 11 +---------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4e5c41005dd0..f273fed682aa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1084,7 +1084,8 @@ ENTRY(paranoid_entry) jz 2f orl $2, %ebx andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - orq x86_cr3_pcid_noflush, %rax + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID movq %rax, %cr3 2: #endif @@ -1344,7 +1345,7 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ - orq x86_cr3_pcid_noflush, %rax + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax @@ -1588,7 +1589,7 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ - orq x86_cr3_pcid_noflush, %rax + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 96643a9c194c..906150d6094e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,8 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq x86_cr3_pcid_noflush, \reg +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID movq \reg, %cr3 .endm @@ -39,7 +40,7 @@ movq \reg, %cr3 movq %cr3, \reg orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f -/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 @@ -90,7 +91,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long x86_cr3_pcid_noflush; DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 1840aa0a052d..b8aa9ad59c0f 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -31,7 +31,6 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); * This is also handy because systems that do not support PCIDs * just end up or'ing a 0 into their CR3, which does no harm. */ -unsigned long x86_cr3_pcid_noflush __read_mostly; DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); /* @@ -356,10 +355,6 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); - - kaiser_add_user_map_early(&x86_cr3_pcid_noflush, - sizeof(x86_cr3_pcid_noflush), - __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ @@ -433,18 +428,14 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) void kaiser_setup_pcid(void) { - unsigned long kern_cr3 = 0; unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; - if (this_cpu_has(X86_FEATURE_PCID)) { - kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + if (this_cpu_has(X86_FEATURE_PCID)) user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; - } /* * These variables are used by the entry/exit * code to change PCID and pgd and TLB flushing. */ - x86_cr3_pcid_noflush = kern_cr3; this_cpu_write(x86_cr3_pcid_user, user_cr3); } -- cgit v1.2.3 From 8c2f8a5cc15b90674e0144e453544b7bfb1340df Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 29 Oct 2017 11:36:19 -0700 Subject: kaiser: drop is_atomic arg to kaiser_pagetable_walk() I have not observed a might_sleep() warning from setup_fixmap_gdt()'s use of kaiser_add_mapping() in our tree (why not?), but like upstream we have not provided a way for that to pass is_atomic true down to kaiser_pagetable_walk(), and at startup it's far from a likely source of trouble: so just delete the walk's is_atomic arg and might_sleep(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index b8aa9ad59c0f..65ac3fddb4f6 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -107,19 +107,13 @@ static inline unsigned long get_pa_from_mapping(unsigned long vaddr) * * Returns a pointer to a PTE on success, or NULL on failure. */ -static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) +static pte_t *kaiser_pagetable_walk(unsigned long address) { pmd_t *pmd; pud_t *pud; pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - if (is_atomic) { - gfp &= ~GFP_KERNEL; - gfp |= __GFP_HIGH | __GFP_ATOMIC; - } else - might_sleep(); - if (pgd_none(*pgd)) { WARN_ONCE(1, "All shadow pgds should have been populated"); return NULL; @@ -194,7 +188,7 @@ static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ret = -EIO; break; } - pte = kaiser_pagetable_walk(address, false); + pte = kaiser_pagetable_walk(address); if (!pte) { ret = -ENOMEM; break; -- cgit v1.2.3 From b72c26e911c566c78743d7973103925007d103c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:23:24 -0700 Subject: kaiser: asm/tlbflush.h handle noPGE at lower level I found asm/tlbflush.h too twisty, and think it safer not to avoid __native_flush_tlb_global_irq_disabled() in the kaiser_enabled case, but instead let it handle kaiser_enabled along with cr3: it can just use __native_flush_tlb() for that, no harm in re-disabling preemption. (This is not the same change as Kirill and Dave have suggested for upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge check; cr3 is enough for kaiser, and thought to be cheaper than cr4.) Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals() preference from __native_flush_tlb(): unlike the invpcid_flush_all() preference in __native_flush_tlb_global(), it's not seen in upstream 4.14, and was recently reported to be surprisingly slow. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index bdd69a3158dd..3d863528f942 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,14 +152,6 @@ static inline void kaiser_flush_tlb_on_return_to_user(void) static inline void __native_flush_tlb(void) { - if (this_cpu_has(X86_FEATURE_INVPCID)) { - /* - * Note, this works with CR4.PCIDE=0 or 1. - */ - invpcid_flush_all_nonglobals(); - return; - } - /* * If current->mm == NULL then we borrow a mm which may change during a * task switch and therefore we must not be preempted while we write CR3 @@ -183,11 +175,8 @@ static inline void __native_flush_tlb_global_irq_disabled(void) /* restore PGE as it was before */ native_write_cr4(cr4); } else { - /* - * x86_64 microcode update comes this way when CR4.PGE is not - * enabled, and it's safer for all callers to allow this case. - */ - native_write_cr3(native_read_cr3()); + /* do it with cr3, letting kaiser flush user PCID */ + __native_flush_tlb(); } } @@ -195,12 +184,6 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; - if (kaiser_enabled) { - /* Globals are not used at all */ - __native_flush_tlb(); - return; - } - if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -256,11 +239,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (boot_cpu_has(X86_FEATURE_PGE)) - __flush_tlb_global(); - else - __flush_tlb(); - + __flush_tlb_global(); /* * Note: if we somehow had PCID but not PGE, then this wouldn't work -- * we'd end up flushing kernel translations for the current ASID but -- cgit v1.2.3 From fe5cb75fd2dd51746fd391c7f6d18485e6a44f76 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:43:06 -0700 Subject: kaiser: kaiser_flush_tlb_on_return_to_user() check PCID Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID check, instead of each caller doing it inline first: nobody needs to optimize for the noPCID case, it's clearer this way, and better suits later changes. Replace those no-op X86_CR3_PCID_KERN_FLUSH lines by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 4 ++-- arch/x86/mm/kaiser.c | 6 +++--- arch/x86/mm/tlb.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 3d863528f942..5b423d5db40a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -158,7 +158,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -217,7 +217,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 65ac3fddb4f6..8600663ded3e 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -435,12 +435,12 @@ void kaiser_setup_pcid(void) /* * Make a note that this cpu will need to flush USER tlb on return to user. - * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: - * if cpu does not, then the NOFLUSH bit will never have been set. + * If cpu does not have PCID, then the NOFLUSH bit will never have been set. */ void kaiser_flush_tlb_on_return_to_user(void) { - this_cpu_write(x86_cr3_pcid_user, + if (this_cpu_has(X86_FEATURE_PCID)) + this_cpu_write(x86_cr3_pcid_user, X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e7934aed8d67..41205de487e7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -39,7 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { + if (kaiser_enabled) { /* * We reuse the same PCID for different tasks, so we must * flush all the entries for the PCID out when we change tasks. @@ -50,10 +50,10 @@ static void load_new_mm_cr3(pgd_t *pgdir) * do it here, but can only be used if X86_FEATURE_INVPCID is * available - and many machines support pcid without invpcid. * - * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; - * but keep that line in there in case something changes. + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it + * would be needed in the write_cr3() below - if PCIDs enabled. */ - new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); kaiser_flush_tlb_on_return_to_user(); } -- cgit v1.2.3 From 1817d2c2fac1b277d97c5a54316e6bfb430d0268 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:30 +0100 Subject: x86/paravirt: Dont patch flush_tlb_single commit a035795499ca1c2bd1928808d1a156eda1420383 upstream native_flush_tlb_single() will be changed with the upcoming PAGE_TABLE_ISOLATION feature. This requires to have more code in there than INVLPG. Remove the paravirt patching for it. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Cc: michael.schwarz@iaik.tugraz.at Cc: moritz.lipp@iaik.tugraz.at Cc: richard.fellner@student.tugraz.at Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/paravirt_patch_64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index bb3840cedb4f..ee43b36075c7 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); @@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): -- cgit v1.2.3 From 2c2721754a7f193c98188d07ee335b124ae2df77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Jan 2018 14:19:49 +0100 Subject: x86/kaiser: Reenable PARAVIRT Now that the required bits have been addressed, reenable PARAVIRT. Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- security/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/Kconfig b/security/Kconfig index d8ae933471ad..fd2ceebe126a 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -34,7 +34,7 @@ config SECURITY config KAISER bool "Remove the kernel mapping in user mode" default y - depends on X86_64 && SMP && !PARAVIRT + depends on X86_64 && SMP help This enforces a strict kernel and user space isolation, in order to close hardware side channels on kernel address information. -- cgit v1.2.3 From 402e63de94afdf7cd64e4eb209a8a77310e02d2c Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 2 Jan 2018 14:19:49 +0100 Subject: kaiser: disabled on Xen PV Kaiser cannot be used on paravirtualized MMUs (namely reading and writing CR3). This does not work with KAISER as the CR3 switch from and to user space PGD would require to map the whole XEN_PV machinery into both. More importantly, enabling KAISER on Xen PV doesn't make too much sense, as PV guests use distinct %cr3 values for kernel and user already. Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8600663ded3e..27688548c3eb 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -263,6 +263,9 @@ void __init kaiser_check_boottime_disable(void) char arg[5]; int ret; + if (boot_cpu_has(X86_FEATURE_XENPV)) + goto silent_disable; + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); if (ret > 0) { if (!strncmp(arg, "on", 2)) @@ -290,6 +293,8 @@ enable: disable: pr_info("Kernel/User page tables isolation: disabled\n"); + +silent_disable: kaiser_enabled = 0; setup_clear_cpu_cap(X86_FEATURE_KAISER); } -- cgit v1.2.3 From 59094faf3f618b2d2b2a45acb916437d611cede6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Dec 2017 13:57:16 +0100 Subject: x86/kaiser: Move feature detection up ... before the first use of kaiser_enabled as otherwise funky things happen: about to get started... (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000] (XEN) Pagetable walk from ffff88022a449090: (XEN) L4[0x110] = 0000000229e0e067 0000000000001e0e (XEN) L3[0x008] = 0000000000000000 ffffffffffffffff (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08 entry.o#create_bounce_frame+0x135/0x14d (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.9.1_02-3.21 x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000286 EM: 1 CONTEXT: pv guest (d0v0) Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 2 ++ arch/x86/kernel/setup.c | 7 +++++++ arch/x86/mm/kaiser.c | 2 -- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 906150d6094e..b5e46aa683f4 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -96,8 +96,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern int kaiser_enabled; +extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 +static inline void __init kaiser_check_boottime_disable(void) {} #endif /* CONFIG_KAISER */ /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index feaab07fa124..6b55012d02a3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,6 +114,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + /* + * This needs to happen right after XENPV is set on xen and + * kaiser_enabled is checked below in cleanup_highmap(). + */ + kaiser_check_boottime_disable(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 27688548c3eb..d43f36912219 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -310,8 +310,6 @@ void __init kaiser_init(void) { int cpu; - kaiser_check_boottime_disable(); - if (!kaiser_enabled) return; -- cgit v1.2.3 From e71fac01727a9495f08de9db5259089bee311766 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jan 2018 10:17:35 -0800 Subject: KPTI: Rename to PAGE_TABLE_ISOLATION This renames CONFIG_KAISER to CONFIG_PAGE_TABLE_ISOLATION. Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.h | 2 +- arch/x86/entry/entry_64.S | 12 ++++++------ arch/x86/events/intel/ds.c | 4 ++-- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/kaiser.h | 12 ++++++------ arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 2 +- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/kernel/head_64.S | 2 +- arch/x86/mm/Makefile | 2 +- arch/x86/mm/kaslr.c | 2 +- include/linux/kaiser.h | 6 +++--- include/linux/percpu-defs.h | 2 +- security/Kconfig | 2 +- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 16 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 80e2cce110b9..2728e1b7e4a6 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,7 +9,7 @@ */ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS -#undef CONFIG_KAISER +#undef CONFIG_PAGE_TABLE_ISOLATION #undef CONFIG_KASAN #include diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f273fed682aa..af4e58132d91 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1071,7 +1071,7 @@ ENTRY(paranoid_entry) SWAPGS xorl %ebx, %ebx 1: -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. @@ -1111,7 +1111,7 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG TRACE_IRQS_IRETQ_DEBUG -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ jz paranoid_exit_no_switch @@ -1340,7 +1340,7 @@ ENTRY(nmi) movq %rsp, %rdi movq $-1, %rsi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER @@ -1354,7 +1354,7 @@ ENTRY(nmi) #endif call do_nmi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Unconditionally restore CR3. I know we return to * kernel code that needs user CR3, but do we ever return @@ -1584,7 +1584,7 @@ end_repeat_nmi: 1: movq %rsp, %rdi movq $-1, %rsi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER @@ -1600,7 +1600,7 @@ end_repeat_nmi: /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ call do_nmi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Unconditionally restore CR3. We might be returning to * kernel code that needs user CR3, like just just before diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 51170f69682d..8e7a3f1df3a5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -274,7 +274,7 @@ static DEFINE_PER_CPU(void *, insn_buffer); static void *dsalloc(size_t size, gfp_t flags, int node) { -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION unsigned int order = get_order(size); struct page *page; unsigned long addr; @@ -295,7 +295,7 @@ static void *dsalloc(size_t size, gfp_t flags, int node) static void dsfree(const void *buffer, size_t size) { -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION if (!buffer) return; kaiser_remove_mapping((unsigned long)buffer, size); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 20271d6b0eb9..454a37adb823 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -199,7 +199,7 @@ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ -#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index b5e46aa683f4..802bbbdfe143 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -20,7 +20,7 @@ #define KAISER_SHADOW_PGD_OFFSET 0x1000 #ifdef __ASSEMBLY__ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg @@ -69,7 +69,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax 8: .endm -#else /* CONFIG_KAISER */ +#else /* CONFIG_PAGE_TABLE_ISOLATION */ .macro SWITCH_KERNEL_CR3 .endm @@ -78,11 +78,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3_NO_STACK .endm -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Upon kernel/user mode switch, it may happen that the address * space has to be switched before the registers have been @@ -100,10 +100,10 @@ extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 static inline void __init kaiser_check_boottime_disable(void) {} -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* - * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, * so as to build with tests on kaiser_enabled instead of #ifdefs. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 217e83a657e2..2536f90cd30c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,7 +18,7 @@ #ifndef __ASSEMBLY__ #include -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; #else #define kaiser_enabled 0 @@ -920,7 +920,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION if (kaiser_enabled) { /* Clone the shadow pgd part as well */ memcpy(native_get_shadow_pgd(dst), diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index cf68b5c1cb74..ce97c8c6a310 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,7 +106,7 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) @@ -127,7 +127,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f0d9a1aa3f68..f1c8ac468292 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -144,7 +144,7 @@ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) #define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) -#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) /* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ #define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5b423d5db40a..94146f665a3c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -136,7 +136,7 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * Declare a couple of kaiser interfaces here for convenience, * to avoid the need for asm/kaiser.h in unexpected places. */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d04479b4a8ee..67cd7c1b99da 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -405,7 +405,7 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not * ever go out to userspace with these, so we do not diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index c505569e3835..c548b46100cb 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o -obj-$(CONFIG_KAISER) += kaiser.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 9284ec1e8aa9..319183d93602 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) *pud_tramp = *pud; } - /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ + /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); } diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h index 4a4d6d911a14..58c55b1589d0 100644 --- a/include/linux/kaiser.h +++ b/include/linux/kaiser.h @@ -1,7 +1,7 @@ #ifndef _LINUX_KAISER_H #define _LINUX_KAISER_H -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION #include static inline int kaiser_map_thread_stack(void *stack) @@ -24,7 +24,7 @@ static inline void kaiser_unmap_thread_stack(void *stack) #else /* - * These stubs are used whenever CONFIG_KAISER is off, which + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which * includes architectures that support KAISER, but have it disabled. */ @@ -48,5 +48,5 @@ static inline void kaiser_unmap_thread_stack(void *stack) { } -#endif /* !CONFIG_KAISER */ +#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ #endif /* _LINUX_KAISER_H */ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index cfe13cb4ec63..8902f23bb770 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -35,7 +35,7 @@ #endif -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION #define USER_MAPPED_SECTION "..user_mapped" #else #define USER_MAPPED_SECTION "" diff --git a/security/Kconfig b/security/Kconfig index fd2ceebe126a..32f36b40e9f0 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -31,7 +31,7 @@ config SECURITY If you are unsure how to answer this question, answer N. -config KAISER +config PAGE_TABLE_ISOLATION bool "Remove the kernel mapping in user mode" default y depends on X86_64 && SMP diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 67c93d9f5ccd..f79669a38c0c 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -198,7 +198,7 @@ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ -#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -- cgit v1.2.3 From ea6cd39d230f71e27facc0667c1986504e5b0f54 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jan 2018 10:18:01 -0800 Subject: KPTI: Report when enabled Make sure dmesg reports when KPTI is enabled. Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index d43f36912219..b6b0f3aa4f77 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -10,6 +10,9 @@ #include #include +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + #include #include /* to verify its kaiser declarations */ #include @@ -292,7 +295,7 @@ enable: return; disable: - pr_info("Kernel/User page tables isolation: disabled\n"); + pr_info("disabled\n"); silent_disable: kaiser_enabled = 0; @@ -352,6 +355,8 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); + + pr_info("enabled\n"); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ -- cgit v1.2.3 From 92fd81f772673ee51381337f07b1da94187de542 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 4 Jan 2018 13:41:55 -0800 Subject: kaiser: Set _PAGE_NX only if supported This resolves a crash if loaded under qemu + haxm under windows. See https://www.spinics.net/lists/kernel/msg2689835.html for details. Here is a boot log (the log is from chromeos-4.4, but Tao Wu says that the same log is also seen with vanilla v4.4.110-rc1). [ 0.712750] Freeing unused kernel memory: 552K [ 0.721821] init: Corrupted page table at address 57b029b332e0 [ 0.722761] PGD 80000000bb238067 PUD bc36a067 PMD bc369067 PTE 45d2067 [ 0.722761] Bad pagetable: 000b [#1] PREEMPT SMP [ 0.722761] Modules linked in: [ 0.722761] CPU: 1 PID: 1 Comm: init Not tainted 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] task: ffff8800bc290000 ti: ffff8800bc28c000 task.ti: ffff8800bc28c000 [ 0.722761] RIP: 0010:[] [] __clear_user+0x42/0x67 [ 0.722761] RSP: 0000:ffff8800bc28fcf8 EFLAGS: 00010202 [ 0.722761] RAX: 0000000000000000 RBX: 00000000000001a4 RCX: 00000000000001a4 [ 0.722761] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000057b029b332e0 [ 0.722761] RBP: ffff8800bc28fd08 R08: ffff8800bc290000 R09: ffff8800bb2f4000 [ 0.722761] R10: ffff8800bc290000 R11: ffff8800bb2f4000 R12: 000057b029b332e0 [ 0.722761] R13: 0000000000000000 R14: 000057b029b33340 R15: ffff8800bb1e2a00 [ 0.722761] FS: 0000000000000000(0000) GS:ffff8800bfb00000(0000) knlGS:0000000000000000 [ 0.722761] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 0.722761] CR2: 000057b029b332e0 CR3: 00000000bb2f8000 CR4: 00000000000006e0 [ 0.722761] Stack: [ 0.722761] 000057b029b332e0 ffff8800bb95fa80 ffff8800bc28fd18 ffffffff83f4120c [ 0.722761] ffff8800bc28fe18 ffffffff83e9e7a1 ffff8800bc28fd68 0000000000000000 [ 0.722761] ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 [ 0.722761] Call Trace: [ 0.722761] [] clear_user+0x2e/0x30 [ 0.722761] [] load_elf_binary+0xa7f/0x18f7 [ 0.722761] [] search_binary_handler+0x86/0x19c [ 0.722761] [] do_execveat_common.isra.26+0x909/0xf98 [ 0.722761] [] ? rest_init+0x87/0x87 [ 0.722761] [] do_execve+0x23/0x25 [ 0.722761] [] run_init_process+0x2b/0x2d [ 0.722761] [] kernel_init+0x6d/0xda [ 0.722761] [] ret_from_fork+0x3f/0x70 [ 0.722761] [] ? rest_init+0x87/0x87 [ 0.722761] Code: 86 84 be 12 00 00 00 e8 87 0d e8 ff 66 66 90 48 89 d8 48 c1 eb 03 4c 89 e7 83 e0 07 48 89 d9 be 08 00 00 00 31 d2 48 85 c9 74 0a <48> 89 17 48 01 f7 ff c9 75 f6 48 89 c1 85 c9 74 09 88 17 48 ff [ 0.722761] RIP [] __clear_user+0x42/0x67 [ 0.722761] RSP [ 0.722761] ---[ end trace def703879b4ff090 ]--- [ 0.722761] BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/v4.4/kernel/locking/rwsem.c:21 [ 0.722761] in_atomic(): 0, irqs_disabled(): 1, pid: 1, name: init [ 0.722761] CPU: 1 PID: 1 Comm: init Tainted: G D 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] 0000000000000086 dcb5d76098c89836 ffff8800bc28fa30 ffffffff83f34004 [ 0.722761] ffffffff84839dc2 0000000000000015 ffff8800bc28fa40 ffffffff83d57dc9 [ 0.722761] ffff8800bc28fa68 ffffffff83d57e6a ffffffff84a53640 0000000000000000 [ 0.722761] Call Trace: [ 0.722761] [] dump_stack+0x4d/0x63 [ 0.722761] [] ___might_sleep+0x13a/0x13c [ 0.722761] [] __might_sleep+0x9f/0xa6 [ 0.722761] [] down_read+0x20/0x31 [ 0.722761] [] __blocking_notifier_call_chain+0x35/0x63 [ 0.722761] [] blocking_notifier_call_chain+0x14/0x16 [ 0.800374] usb 1-1: new full-speed USB device number 2 using uhci_hcd [ 0.722761] [] profile_task_exit+0x1a/0x1c [ 0.802309] [] do_exit+0x39/0xe7f [ 0.802309] [] ? vprintk_default+0x1d/0x1f [ 0.802309] [] ? printk+0x57/0x73 [ 0.802309] [] oops_end+0x80/0x85 [ 0.802309] [] pgtable_bad+0x8a/0x95 [ 0.802309] [] __do_page_fault+0x8c/0x352 [ 0.802309] [] ? file_has_perm+0xc4/0xe5 [ 0.802309] [] do_page_fault+0xc/0xe [ 0.802309] [] page_fault+0x22/0x30 [ 0.802309] [] ? __clear_user+0x42/0x67 [ 0.802309] [] ? __clear_user+0x23/0x67 [ 0.802309] [] clear_user+0x2e/0x30 [ 0.802309] [] load_elf_binary+0xa7f/0x18f7 [ 0.802309] [] search_binary_handler+0x86/0x19c [ 0.802309] [] do_execveat_common.isra.26+0x909/0xf98 [ 0.802309] [] ? rest_init+0x87/0x87 [ 0.802309] [] do_execve+0x23/0x25 [ 0.802309] [] run_init_process+0x2b/0x2d [ 0.802309] [] kernel_init+0x6d/0xda [ 0.802309] [] ret_from_fork+0x3f/0x70 [ 0.802309] [] ? rest_init+0x87/0x87 [ 0.830559] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 [ 0.830559] [ 0.831305] Kernel Offset: 0x2c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 0.831305] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 The crash part of this problem may be solved with the following patch (thanks to Hugh for the hint). There is still another problem, though - with this patch applied, the qemu session aborts with "VCPU Shutdown request", whatever that means. Cc: lepton Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index b6b0f3aa4f77..d8376b4ad9f0 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -413,7 +413,8 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) * get out to userspace running on the kernel CR3, * userspace will crash instead of running. */ - pgd.pgd |= _PAGE_NX; + if (__supported_pte_mask & _PAGE_NX) + pgd.pgd |= _PAGE_NX; } } else if (!pgd.pgd) { /* -- cgit v1.2.3 From 9f74755895f9b080f79384edb600a18433788adc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2018 15:46:36 +0100 Subject: Linux 4.9.75 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 075e429732e7..acbc1b032db2 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 74 +SUBLEVEL = 75 EXTRAVERSION = NAME = Roaring Lionus -- cgit v1.2.3 From 17d3592068c2cd88e9c27ec7e5e0c3c8882c0008 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Jul 2017 17:24:49 +0100 Subject: UPSTREAM: arm64: factor out entry stack manipulation In subsequent patches, we will detect stack overflow in our exception entry code, by verifying the SP after it has been decremented to make space for the exception regs. This verification code is small, and we can minimize its impact by placing it directly in the vectors. To avoid redundant modification of the SP, we also need to move the initial decrement of the SP into the vectors. As a preparatory step, this patch introduces kernel_ventry, which performs this decrement, and updates the entry code accordingly. Subsequent patches will fold SP verification into kernel_ventry. There should be no functional change as a result of this patch. Signed-off-by: Ard Biesheuvel [Mark: turn into prep patch, expand commit msg] Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Catalin Marinas Cc: James Morse (cherry picked from commit b11e5759bfac0c474d95ec4780b1566350e64cad) Signed-off-by: Todd Poynor Change-Id: I54428c1e60c9e0ed288bfda835e3b57d45f72999 Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c44a93387908..8cd2d830213f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -70,8 +70,13 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry label + .align 7 sub sp, sp, #S_FRAME_SIZE + b \label + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -316,31 +321,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry el1_sync_invalid // Synchronous EL1t + kernel_ventry el1_irq_invalid // IRQ EL1t + kernel_ventry el1_fiq_invalid // FIQ EL1t + kernel_ventry el1_error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry el1_sync // Synchronous EL1h + kernel_ventry el1_irq // IRQ EL1h + kernel_ventry el1_fiq_invalid // FIQ EL1h + kernel_ventry el1_error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry el0_sync // Synchronous 64-bit EL0 + kernel_ventry el0_irq // IRQ 64-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 + kernel_ventry el0_error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 + kernel_ventry el0_irq_compat // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 + kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid // Error 32-bit EL0 #endif END(vectors) -- cgit v1.2.3 From 19d3691ff6cb8465ed21f3d12dac253e4c4f0d6c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 12:56:18 +0100 Subject: UPSTREAM: arm64: mm: Use non-global mappings for kernel space In preparation for unmapping the kernel whilst running in userspace, make the kernel mappings non-global so we can avoid expensive TLB invalidation on kernel exit to userspace. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit e046eb0c9bf26d94be9e4592c00c7a78b0fa9bfd) Change-Id: Ic267c24fe8b3bc03b96933949418555de989529a [toddpoynor@google.com: fixup PAGE_NONE conflict] Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/kernel-pgtable.h | 12 ++++++++++-- arch/arm64/include/asm/pgtable-prot.h | 21 +++++++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7803343e5881..77a27af01371 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -78,8 +78,16 @@ /* * Initial memory map attributes. */ -#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define SWAPPER_PTE_FLAGS (_SWAPPER_PTE_FLAGS | PTE_NG) +#define SWAPPER_PMD_FLAGS (_SWAPPER_PMD_FLAGS | PMD_SECT_NG) +#else +#define SWAPPER_PTE_FLAGS _SWAPPER_PTE_FLAGS +#define SWAPPER_PMD_FLAGS _SWAPPER_PMD_FLAGS +#endif #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2142c7726e76..84b5283d2e7f 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -34,8 +34,16 @@ #include -#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define PROT_DEFAULT (_PROT_DEFAULT | PTE_NG) +#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_SECT_NG) +#else +#define PROT_DEFAULT _PROT_DEFAULT +#define PROT_SECT_DEFAULT _PROT_SECT_DEFAULT +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -48,6 +56,7 @@ #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _HYP_PAGE_DEFAULT (_PAGE_DEFAULT & ~PTE_NG) #define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) @@ -55,15 +64,15 @@ #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) +#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) +#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) +#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) #define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -- cgit v1.2.3 From dee3b246017c98b2ef932d2732fb0da03c4bd9d8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:04:48 +0100 Subject: UPSTREAM: arm64: mm: Temporarily disable ARM64_SW_TTBR0_PAN We're about to rework the way ASIDs are allocated, switch_mm is implemented and low-level kernel entry/exit is handled, so keep the ARM64_SW_TTBR0_PAN code out of the way whilst we do the heavy lifting. It will be re-enabled in a subsequent patch. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 376133b7edc20f237a42e4c72415cc9e8c0a9704) Signed-off-by: Todd Poynor Change-Id: Iac0d096cd8eda5bc2d3cd17d7d2492d1838ad8ed Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fbf07a6bd8a4..55578aeccff7 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -802,6 +802,7 @@ endif config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + depends on BROKEN # Temporary while switch_mm is reworked help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved -- cgit v1.2.3 From a72dd8a6766560ac64577a5465d2749a5dea0b90 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:19:09 +0100 Subject: UPSTREAM: arm64: mm: Move ASID from TTBR0 to TTBR1 In preparation for mapping kernelspace and userspace with different ASIDs, move the ASID to TTBR1 and update switch_mm to context-switch TTBR0 via an invalid mapping (the zero page). Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 7655abb953860485940d4de74fb45a8192149bb6) [toddpoynor@google.com: add missing mrs inst from context] Change-Id: I9575ae2f0b3b5383c44f7ace0ac50588be739e45 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mmu_context.h | 7 +++++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/include/asm/proc-fns.h | 6 ------ arch/arm64/mm/proc.S | 9 ++++++--- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 63e9982daca1..3c2ff5183d48 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -51,6 +51,13 @@ static inline void cpu_set_reserved_ttbr0(void) isb(); } +static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUG_ON(pgd == swapper_pg_dir); + cpu_set_reserved_ttbr0(); + cpu_do_switch_mm(virt_to_phys(pgd),mm); +} + /* * TCR.T0SZ value to use when the ID map is active. Usually equals * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index eb0c2bd90de9..8df4cb6ac6f7 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -272,6 +272,7 @@ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) +#define TCR_A1 (UL(1) << 22) #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h index 14ad6e4e87d1..16cef2e8449e 100644 --- a/arch/arm64/include/asm/proc-fns.h +++ b/arch/arm64/include/asm/proc-fns.h @@ -35,12 +35,6 @@ extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr); #include -#define cpu_switch_mm(pgd,mm) \ -do { \ - BUG_ON(pgd == swapper_pg_dir); \ - cpu_do_switch_mm(virt_to_phys(pgd),mm); \ -} while (0) - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* __ASM_PROCFNS_H */ diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index c2adb0cb952a..76ef30afbb22 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -132,9 +132,12 @@ ENDPROC(cpu_do_resume) * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) + mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id - bfi x0, x1, #48, #16 // set the ASID - msr ttbr0_el1, x0 // set TTBR0 + bfi x2, x1, #48, #16 // set the ASID + msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) + isb + msr ttbr0_el1, x0 // now update TTBR0 isb post_ttbr0_update_workaround ret @@ -218,7 +221,7 @@ ENTRY(__cpu_setup) * both user and kernel. */ ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 + TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1 tcr_set_idmap_t0sz x10, x9 /* -- cgit v1.2.3 From 071a49fc82b450c9689b4acb469e7ee22a65d87c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:34:30 +0100 Subject: UPSTREAM: arm64: mm: Rename post_ttbr0_update_workaround The post_ttbr0_update_workaround hook applies to any change to TTBRx_EL1. Since we're using TTBR1 for the ASID, rename the hook to make it clearer as to what it's doing. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 158d495899ce55db453f682a8ac8390d5a426578) [toddpoynor@google.com: fixup context conflict in comments] Change-Id: Ia0f82eee78ad442d1773245b99c7534f0ea066e4 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/assembler.h | 4 ++-- arch/arm64/kernel/entry.S | 2 +- arch/arm64/mm/proc.S | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 7c4218ea6ff5..eb9fad288b04 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -430,9 +430,9 @@ alternative_endif .endm /* - * Errata workaround post TTBR0_EL1 update. + * Errata workaround post TTBRx_EL1 update. */ - .macro post_ttbr0_update_workaround + .macro post_ttbr_update_workaround #ifdef CONFIG_CAVIUM_ERRATUM_27456 alternative_if ARM64_WORKAROUND_CAVIUM_27456 ic iallu diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8cd2d830213f..a6619ccda110 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -205,7 +205,7 @@ alternative_else_nop_endif * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache * corruption). */ - post_ttbr0_update_workaround + post_ttbr_update_workaround .endif 1: .if \el != 0 diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 76ef30afbb22..c1e868a036b5 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -139,7 +139,7 @@ ENTRY(cpu_do_switch_mm) isb msr ttbr0_el1, x0 // now update TTBR0 isb - post_ttbr0_update_workaround + post_ttbr_update_workaround ret ENDPROC(cpu_do_switch_mm) -- cgit v1.2.3 From 599c71f8be4a9c7093e8283dce7ff73eb057bb86 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 13:58:16 +0100 Subject: UPSTREAM: arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN With the ASID now installed in TTBR1, we can re-enable ARM64_SW_TTBR0_PAN by ensuring that we switch to a reserved ASID of zero when disabling user access and restore the active user ASID on the uaccess enable path. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 27a921e75711d924617269e0ba4adb8bae9fd0d1) [toddpoynor@google.com: fixup context, move asm-uaccess.h changes to uaccess.h] Change-Id: Ie8e1706152b8d7c068c6b1f20241a10669a69ca1 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 1 - arch/arm64/include/asm/uaccess.h | 46 +++++++++++++++++++++++++++++----------- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 2 +- arch/arm64/xen/hypercall.S | 2 +- 9 files changed, 42 insertions(+), 21 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 55578aeccff7..fbf07a6bd8a4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -802,7 +802,6 @@ endif config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" - depends on BROKEN # Temporary while switch_mm is reworked help Enabling this option prevents the kernel from accessing user-space memory directly by pointing TTBR0_EL1 to a reserved diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 31b6940ec2f2..8e265b2c8ce7 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -132,15 +132,19 @@ static inline void __uaccess_ttbr0_disable(void) { unsigned long ttbr; + ttbr = read_sysreg(ttbr1_el1); /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; - write_sysreg(ttbr, ttbr0_el1); + write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); + isb(); + /* Set reserved ASID */ + ttbr &= ~(0xffffUL << 48); + write_sysreg(ttbr, ttbr1_el1); isb(); } static inline void __uaccess_ttbr0_enable(void) { - unsigned long flags; + unsigned long flags, ttbr0, ttbr1; /* * Disable interrupts to avoid preemption between reading the 'ttbr0' @@ -148,7 +152,16 @@ static inline void __uaccess_ttbr0_enable(void) * roll-over and an update of 'ttbr0'. */ local_irq_save(flags); - write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + ttbr0 = current_thread_info()->ttbr0; + + /* Restore active ASID */ + ttbr1 = read_sysreg(ttbr1_el1); + ttbr1 |= ttbr0 & (0xffffUL << 48); + write_sysreg(ttbr1, ttbr1_el1); + isb(); + + /* Restore user page table */ + write_sysreg(ttbr0, ttbr0_el1); isb(); local_irq_restore(flags); } @@ -438,11 +451,20 @@ extern __must_check long strnlen_user(const char __user *str, long n); add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb + sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE + bic \tmp1, \tmp1, #(0xffff << 48) + msr ttbr1_el1, \tmp1 // set reserved ASID + isb .endm - .macro __uaccess_ttbr0_enable, tmp1 + .macro __uaccess_ttbr0_enable, tmp1, tmp2 get_thread_info \tmp1 ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + mrs \tmp2, ttbr1_el1 + extr \tmp2, \tmp2, \tmp1, #48 + ror \tmp2, \tmp2, #16 + msr ttbr1_el1, \tmp2 // set the active ASID + isb msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 isb .endm @@ -453,18 +475,18 @@ alternative_if_not ARM64_HAS_PAN alternative_else_nop_endif .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 alternative_if_not ARM64_HAS_PAN - save_and_disable_irq \tmp2 // avoid preemption - __uaccess_ttbr0_enable \tmp1 - restore_irq \tmp2 + save_and_disable_irq \tmp3 // avoid preemption + __uaccess_ttbr0_enable \tmp1, \tmp2 + restore_irq \tmp3 alternative_else_nop_endif .endm #else .macro uaccess_ttbr0_disable, tmp1 .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 .endm #endif @@ -478,8 +500,8 @@ alternative_if ARM64_ALT_PAN_NOT_UAO alternative_else_nop_endif .endm - .macro uaccess_enable_not_uao, tmp1, tmp2 - uaccess_ttbr0_enable \tmp1, \tmp2 + .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3 + uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(0) alternative_else_nop_endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a6619ccda110..b610b161cd62 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -132,7 +132,7 @@ alternative_if ARM64_HAS_PAN alternative_else_nop_endif .if \el != 0 - mrs x21, ttbr0_el1 + mrs x21, ttbr1_el1 tst x21, #0xffff << 48 // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled @@ -196,7 +196,7 @@ alternative_else_nop_endif tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set .endif - __uaccess_ttbr0_enable x0 + __uaccess_ttbr0_enable x0, x1 .if \el == 0 /* diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index d7150e30438a..dd65ca253eb4 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -30,7 +30,7 @@ * Alignment fixed up by hardware. */ ENTRY(__clear_user) - uaccess_enable_not_uao x2, x3 + uaccess_enable_not_uao x2, x3, x4 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index cfe13396085b..7e7e687c17ac 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -64,7 +64,7 @@ end .req x5 ENTRY(__arch_copy_from_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 718b1c4e2f85..074d52fcd75b 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -65,7 +65,7 @@ end .req x5 ENTRY(__copy_in_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index e99e31c9acac..67118444cde0 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -63,7 +63,7 @@ end .req x5 ENTRY(__arch_copy_to_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" uaccess_disable_not_uao x3 diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index da9576932322..74404536d704 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -49,7 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) - uaccess_ttbr0_enable x2, x3 + uaccess_ttbr0_enable x2, x3, x4 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index b41aff25426d..f5422520cb01 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -100,7 +100,7 @@ ENTRY(privcmd_call) * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation * is enabled (it implies that hardware UAO and PAN disabled). */ - uaccess_ttbr0_enable x6, x7 + uaccess_ttbr0_enable x6, x7, x8 hvc XEN_IMM /* -- cgit v1.2.3 From 5914b11611d5f6a185ceda0ae38b4cf1e00df0d4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 14:10:28 +0100 Subject: UPSTREAM: arm64: mm: Allocate ASIDs in pairs In preparation for separate kernel/user ASIDs, allocate them in pairs for each mm_struct. The bottom bit distinguishes the two: if it is set, then the ASID will map only userspace. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 0c8ea531b7740754cf374ca8b7510655f569c5e3) [toddpoynor@google.com: fixup context] Change-Id: I9879f45cd51cdf13814e3cff18653a76c893f386 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mmu.h | 3 +++ arch/arm64/mm/context.c | 25 +++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 8d9fce037b2f..edd9ff732528 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -16,6 +16,9 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H + +#define USER_ASID_FLAG (UL(1) << 48) + typedef struct { atomic64_t id; void *vdso; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 4c63cb154859..05a885c14c83 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -39,7 +39,16 @@ static cpumask_t tlb_flush_pending; #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) #define ASID_FIRST_VERSION (1UL << asid_bits) -#define NUM_USER_ASIDS ASID_FIRST_VERSION + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define NUM_USER_ASIDS (ASID_FIRST_VERSION >> 1) +#define asid2idx(asid) (((asid) & ~ASID_MASK) >> 1) +#define idx2asid(idx) (((idx) << 1) & ~ASID_MASK) +#else +#define NUM_USER_ASIDS (ASID_FIRST_VERSION) +#define asid2idx(asid) ((asid) & ~ASID_MASK) +#define idx2asid(idx) asid2idx(idx) +#endif /* Get the ASIDBits supported by the current CPU */ static u32 get_cpu_asid_bits(void) @@ -104,7 +113,7 @@ static void flush_context(unsigned int cpu) */ if (asid == 0) asid = per_cpu(reserved_asids, i); - __set_bit(asid & ~ASID_MASK, asid_map); + __set_bit(asid2idx(asid), asid_map); per_cpu(reserved_asids, i) = asid; } @@ -159,16 +168,16 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) * We had a valid ASID in a previous life, so try to re-use * it if possible. */ - asid &= ~ASID_MASK; - if (!__test_and_set_bit(asid, asid_map)) + if (!__test_and_set_bit(asid2idx(asid), asid_map)) return newasid; } /* * Allocate a free ASID. If we can't find one, take a note of the - * currently active ASIDs and mark the TLBs as requiring flushes. - * We always count from ASID #1, as we use ASID #0 when setting a - * reserved TTBR0 for the init_mm. + * currently active ASIDs and mark the TLBs as requiring flushes. We + * always count from ASID #2 (index 1), as we use ASID #0 when setting + * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd + * pairs. */ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx); if (asid != NUM_USER_ASIDS) @@ -185,7 +194,7 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu) set_asid: __set_bit(asid, asid_map); cur_idx = asid; - return asid | generation; + return idx2asid(asid) | generation; } void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) -- cgit v1.2.3 From d6ca455d0a006ba52890f2a2f3c3931e946b92e3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 13:58:08 +0000 Subject: UPSTREAM: arm64: mm: Add arm64_kernel_unmapped_at_el0 helper In order for code such as TLB invalidation to operate efficiently when the decision to map the kernel at EL0 is determined at runtime, this patch introduces a helper function, arm64_kernel_unmapped_at_el0, to determine whether or not the kernel is mapped whilst running in userspace. Currently, this just reports the value of CONFIG_UNMAP_KERNEL_AT_EL0, but will later be hooked up to a fake CPU capability using a static key. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit fc0e1299da548b32440051f58f08e0c1eb7edd0b) Change-Id: I2d34c2d7b88f82d848d7063efdae7c3cc4fe8fd6 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mmu.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index edd9ff732528..478e6bc6b4e8 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -19,6 +19,8 @@ #define USER_ASID_FLAG (UL(1) << 48) +#ifndef __ASSEMBLY__ + typedef struct { atomic64_t id; void *vdso; @@ -31,6 +33,11 @@ typedef struct { */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) +static inline bool arm64_kernel_unmapped_at_el0(void) +{ + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); +} + extern void paging_init(void); extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); @@ -40,4 +47,5 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgprot_t prot, bool allow_block_mappings); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); +#endif /* !__ASSEMBLY__ */ #endif -- cgit v1.2.3 From 2053d3c3b488013670965d24166395897c83e5a7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 10 Aug 2017 14:13:33 +0100 Subject: UPSTREAM: arm64: mm: Invalidate both kernel and user ASIDs when performing TLBI Since an mm has both a kernel and a user ASID, we need to ensure that broadcast TLB maintenance targets both address spaces so that things like CoW continue to work with the uaccess primitives in the kernel. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 9b0de864b5bc298ea53005ad812f3386f81aee9c) Change-Id: I8b50e223fc7de6f11388b8dfac705fa618b7335a Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/tlbflush.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index deab52374119..ad6bd8b26ada 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -23,6 +23,7 @@ #include #include +#include /* * Raw TLBI operations. @@ -42,6 +43,11 @@ #define __tlbi(op, ...) __TLBI_N(op, ##__VA_ARGS__, 1, 0) +#define __tlbi_user(op, arg) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi(op, (arg) | USER_ASID_FLAG); \ +} while (0) + /* * TLB Management * ============== @@ -103,6 +109,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ishst); __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); dsb(ish); } @@ -113,6 +120,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, dsb(ishst); __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); dsb(ish); } @@ -139,10 +147,13 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) { - if (last_level) + if (last_level) { __tlbi(vale1is, addr); - else + __tlbi_user(vale1is, addr); + } else { __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } } dsb(ish); } @@ -182,6 +193,7 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm, unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); dsb(ish); } -- cgit v1.2.3 From a329b068c97b34ef3045c0c6b0aa5ca9563b68ef Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:07:40 +0000 Subject: UPSTREAM: arm64: entry: Add exception trampoline page for exceptions from EL0 To allow unmapping of the kernel whilst running at EL0, we need to point the exception vectors at an entry trampoline that can map/unmap the kernel on entry/exit respectively. This patch adds the trampoline page, although it is not yet plugged into the vector table and is therefore unused. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit c7b9adaf85f818d747eeff5145eb4095ccd587fb) [toddpoynor@google.com: fixup context] Change-Id: I6c6b5eb93a69f26b97628726eccf91df739d03af Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 85 +++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 17 +++++++++ 2 files changed, 102 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b610b161cd62..82da81e21aca 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -865,6 +866,90 @@ __ni_sys_trace: .popsection // .entry.text +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" + + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + .endm + + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + /* + * We avoid running the post_ttbr_update_workaround here because the + * user and kernel ASIDs don't have conflicting mappings, so any + * "blessing" as described in: + * + * http://lkml.kernel.org/r/56BB848A.6060603@caviumnetworks.com + * + * will not hurt correctness. Whilst this may partially defeat the + * point of using split ASIDs in the first place, it avoids + * the hit of invalidating the entire I-cache on every return to + * userspace. + */ + .endm + + .macro tramp_ventry, regsize = 64 + .align 7 +1: + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif + tramp_map_kernel x30 + ldr x30, =vectors + prfm plil1strm, [x30, #(1b - tramp_vectors)] + msr vbar_el1, x30 + add x30, x30, #(1b - tramp_vectors) + isb + br x30 + .endm + + .macro tramp_exit, regsize = 64 + adr x30, tramp_vectors + msr vbar_el1, x30 + tramp_unmap_kernel x30 + .if \regsize == 64 + mrs x30, far_el1 + .endif + eret + .endm + + .align 11 +ENTRY(tramp_vectors) + .space 0x400 + + tramp_ventry + tramp_ventry + tramp_ventry + tramp_ventry + + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 +END(tramp_vectors) + +ENTRY(tramp_exit_native) + tramp_exit +END(tramp_exit_native) + +ENTRY(tramp_exit_compat) + tramp_exit 32 +END(tramp_exit_compat) + + .ltorg + .popsection // .entry.tramp.text +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Special system call wrappers. */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index b8deffa9e1bf..66c853e4a0c6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -56,6 +56,17 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ + *(.entry.tramp.text) \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_end) = .; +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -128,6 +139,7 @@ SECTIONS HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT + TRAMP_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -221,6 +233,11 @@ SECTIONS . += RESERVED_TTBR0_SIZE; #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + _end = .; STABS_DEBUG -- cgit v1.2.3 From 5e54c4b6a795285d314fe0bbab42f06414b9c523 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:14:17 +0000 Subject: UPSTREAM: arm64: mm: Map entry trampoline into trampoline and kernel page tables The exception entry trampoline needs to be mapped at the same virtual address in both the trampoline page table (which maps nothing else) and also the kernel page table, so that we can swizzle TTBR1_EL1 on exceptions from and return to EL0. This patch maps the trampoline at a fixed virtual address in the fixmap area of the kernel virtual address space, which allows the kernel proper to be randomized with respect to the trampoline when KASLR is enabled. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 51a0048beb449682d632d0af52a515adb9f9882e) [toddpoynor@google.com: fixup context, remove rodata_enabled check] Change-Id: I920ad6850e4e9b03edea4a84d8603302c28eef37 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/fixmap.h | 5 +++++ arch/arm64/include/asm/pgtable.h | 1 + arch/arm64/kernel/asm-offsets.c | 6 +++++- arch/arm64/mm/mmu.c | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index caf86be815ba..7b1d88c18143 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -51,6 +51,11 @@ enum fixed_addresses { FIX_EARLYCON_MEM_BASE, FIX_TEXT_POKE0, + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_TEXT, +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1af9f3c5cdf6..7bcd8b96a4b3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -692,6 +692,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; /* * Encode and decode a swap entry: diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 433f03ec8e9b..b6c9389c160a 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -147,11 +148,14 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif return 0; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 33ecaffbfc23..29945bc2e3f1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -420,6 +420,29 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, vm_area_add_early(vma); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __init map_entry_trampoline(void) +{ + extern char __entry_tramp_text_start[]; + + pgprot_t prot = PAGE_KERNEL_EXEC; + phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); + + /* The trampoline is always mapped and can therefore be global */ + pgprot_val(prot) &= ~PTE_NG; + + /* Map only the text into the trampoline page table */ + memset(tramp_pg_dir, 0, PGD_SIZE); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, + prot, pgd_pgtable_alloc, 0); + + /* ...as well as the kernel page table */ + __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + return 0; +} +core_initcall(map_entry_trampoline); +#endif + /* * Create fine-grained mappings for the kernel. */ -- cgit v1.2.3 From 8fdbffb119158a8dcf0492e5873bba844744de7e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:20:21 +0000 Subject: UPSTREAM: arm64: entry: Explicitly pass exception level to kernel_ventry macro We will need to treat exceptions from EL0 differently in kernel_ventry, so rework the macro to take the exception level as an argument and construct the branch target using that. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 5b1f7fe41909cde40decad9f0e8ee585777a0538) [toddpoynor@google.com: fixup context, error -> error_invalid] Change-Id: I5dfbce460f7617e33298a1ece09159572d55fb3a Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 82da81e21aca..ac474a70db85 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -71,10 +71,10 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_ventry label + .macro kernel_ventry, el, label, regsize = 64 .align 7 sub sp, sp, #S_FRAME_SIZE - b \label + b el\()\el\()_\label .endm .macro kernel_entry, el, regsize = 64 @@ -322,31 +322,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - kernel_ventry el1_sync_invalid // Synchronous EL1t - kernel_ventry el1_irq_invalid // IRQ EL1t - kernel_ventry el1_fiq_invalid // FIQ EL1t - kernel_ventry el1_error_invalid // Error EL1t + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t - kernel_ventry el1_sync // Synchronous EL1h - kernel_ventry el1_irq // IRQ EL1h - kernel_ventry el1_fiq_invalid // FIQ EL1h - kernel_ventry el1_error_invalid // Error EL1h + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error_invalid // Error EL1h - kernel_ventry el0_sync // Synchronous 64-bit EL0 - kernel_ventry el0_irq // IRQ 64-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 - kernel_ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry 0, sync // Synchronous 64-bit EL0 + kernel_ventry 0, irq // IRQ 64-bit EL0 + kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 - kernel_ventry el0_irq_compat // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid_compat, 32 // Error 32-bit EL0 #else - kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 - kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 #endif END(vectors) -- cgit v1.2.3 From c27a2258342344ac5048e6e60e48fb969fee1ed4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:24:29 +0000 Subject: UPSTREAM: arm64: entry: Hook up entry trampoline to exception vectors Hook up the entry trampoline to our exception vectors so that all exceptions from and returns to EL0 go via the trampoline, which swizzles the vector base register accordingly. Transitioning to and from the kernel clobbers x30, so we use tpidrro_el0 and far_el1 as scratch registers for native tasks. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 4bf3286d29f3a88425d8d8cd53428cbb8f865f04) Change-Id: I92fe4484fdfcf09e772f5d7ddfa29bfa4eb065f7 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index ac474a70db85..a22c78ae57a9 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -73,10 +73,26 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + .if \el == 0 + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif + .endif +#endif + sub sp, sp, #S_FRAME_SIZE b el\()\el\()_\label .endm + .macro tramp_alias, dst, sym + mov_q \dst, TRAMP_VALIAS + add \dst, \dst, #(\sym - .entry.tramp.text) + .endm + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 @@ -218,18 +234,20 @@ alternative_else_nop_endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif -1: alternative_else_nop_endif #endif +3: .endif msr elr_el1, x21 // set up the return data @@ -251,7 +269,22 @@ alternative_else_nop_endif ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp - eret // return to kernel + +#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 + eret +#else + .if \el == 0 + bne 4f + msr far_el1, x30 + tramp_alias x30, tramp_exit_native + br x30 +4: + tramp_alias x30, tramp_exit_compat + br x30 + .else + eret + .endif +#endif .endm .macro irq_stack_entry -- cgit v1.2.3 From 04b77fe9ae8da2aab099ae51c8835c700f9d5970 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:29:19 +0000 Subject: UPSTREAM: arm64: erratum: Work around Falkor erratum #E1003 in trampoline code We rely on an atomic swizzling of TTBR1 when transitioning from the entry trampoline to the kernel proper on an exception. We can't rely on this atomicity in the face of Falkor erratum #E1003, so on affected cores we can issue a TLB invalidation to invalidate the walk cache prior to jumping into the kernel. There is still the possibility of a TLB conflict here due to conflicting walk cache entries prior to the invalidation, but this doesn't appear to be the case on these CPUs in practice. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git commit d1777e686ad10ba7c594304429c6045fb79255a1) [ghackmann@google.com: replace runtime alternative_if with a compile-time check for Code Aurora's out-of-tree CONFIG_ARCH_MSM8996. Kryo needs this workaround too, and 4.4 doesn't have any of the upstream Falkor errata infrastructure needed to detect this at boot time.] Signed-off-by: Greg Hackmann Signed-off-by: Todd Poynor Change-Id: Iaf244a364b22d386b54368f88b73e39d295de49f Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a22c78ae57a9..79ea90b33682 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -910,6 +910,16 @@ __ni_sys_trace: sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp +#ifdef CONFIG_ARCH_MSM8996 + /* ASID already in \tmp[63:48] */ + movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12) + movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12) + /* 2MB boundary containing the vectors, so we nobble the walk cache */ + movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12) + isb + tlbi vae1, \tmp + dsb nsh +#endif /* CONFIG_ARCH_MSM8996 */ .endm .macro tramp_unmap_kernel, tmp -- cgit v1.2.3 From da1016a5635158113253b3dc420c25d289193c1d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:33:28 +0000 Subject: UPSTREAM: arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks When unmapping the kernel at EL0, we use tpidrro_el0 as a scratch register during exception entry from native tasks and subsequently zero it in the kernel_ventry macro. We can therefore avoid zeroing tpidrro_el0 in the context-switch path for native tasks using the entry trampoline. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 18011eac28c7cb31c87b86b7d0e5b01894405c7f) [toddpoynor@google.com: fixup context] Change-Id: I4a0ded2e339b0c75521bc3b5a7fbbb3e60aa8774 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/process.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 65ea76255831..7e365ffe1370 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -372,17 +372,17 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; + unsigned long tpidr; tpidr = read_sysreg(tpidr_el0); *task_user_tls(current) = tpidr; - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); } /* Restore the UAO state depending on next's addr_limit */ -- cgit v1.2.3 From f79ff2d82888428c2b0f7dbdedce553313156fae Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:38:19 +0000 Subject: UPSTREAM: arm64: entry: Add fake CPU feature for unmapping the kernel at EL0 Allow explicit disabling of the entry trampoline on the kernel command line (kpti=off) by adding a fake CPU feature (ARM64_UNMAP_KERNEL_AT_EL0) that can be used to toggle the alternative sequences in our entry code and avoid use of the trampoline altogether if desired. This also allows us to make use of a static key in arm64_kernel_unmapped_at_el0(). Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit ea1e3de85e94d711f63437c04624aa0e8de5c8b3) [toddpoynor@google.com: fixup context, cpus_have_const_cap -> cpus_have_cap] Change-Id: I72936d608b0d4c0dd9725eced7674b95e4abcf2d Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/cpucaps.h | 4 +++- arch/arm64/include/asm/mmu.h | 3 ++- arch/arm64/kernel/cpufeature.c | 41 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/entry.S | 9 +++++---- 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 87b446535185..4a4a98a4b1bd 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -35,6 +35,8 @@ #define ARM64_HYP_OFFSET_LOW 14 #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 -#define ARM64_NCAPS 16 +#define ARM64_UNMAP_KERNEL_AT_EL0 23 + +#define ARM64_NCAPS 24 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 478e6bc6b4e8..78b79dfc2905 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -35,7 +35,8 @@ typedef struct { static inline bool arm64_kernel_unmapped_at_el0(void) { - return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && + cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0); } extern void paging_init(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 0127e1bb5a7c..bfbd9e90fa04 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -748,6 +748,40 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + /* Forced on command line? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by command line option\n", + __kpti_forced > 0 ? "ON" : "OFF"); + return __kpti_forced > 0; + } + + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return true; + + return false; +} + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 1 : -1; + return 0; +} +__setup("kpti=", parse_kpti); +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -831,6 +865,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + { + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = unmap_kernel_at_el0, + }, +#endif {}, }; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 79ea90b33682..6e9ce0fb147b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -74,6 +74,7 @@ .macro kernel_ventry, el, label, regsize = 64 .align 7 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +alternative_if ARM64_UNMAP_KERNEL_AT_EL0 .if \el == 0 .if \regsize == 64 mrs x30, tpidrro_el0 @@ -82,6 +83,7 @@ mov x30, xzr .endif .endif +alternative_else_nop_endif #endif sub sp, sp, #S_FRAME_SIZE @@ -270,10 +272,9 @@ alternative_else_nop_endif ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp -#ifndef CONFIG_UNMAP_KERNEL_AT_EL0 - eret -#else .if \el == 0 +alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 bne 4f msr far_el1, x30 tramp_alias x30, tramp_exit_native @@ -281,10 +282,10 @@ alternative_else_nop_endif 4: tramp_alias x30, tramp_exit_compat br x30 +#endif .else eret .endif -#endif .endm .macro irq_stack_entry -- cgit v1.2.3 From 16f712b456b11f40a9b6931b585aa22e11df5057 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 14:41:01 +0000 Subject: UPSTREAM: arm64: Kconfig: Add CONFIG_UNMAP_KERNEL_AT_EL0 Add a Kconfig entry to control use of the entry trampoline, which allows us to unmap the kernel whilst running in userspace and improve the robustness of KASLR. Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 084eb77cd3a81134d02500977dc0ecc9277dc97d) Change-Id: I70a996e448fa17007a3b8564c794e53e3992d7bf Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fbf07a6bd8a4..161fced75e7d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -733,6 +733,19 @@ config FORCE_MAX_ZONEORDER However for 4K, we choose a higher default value, 11 as opposed to 10, giving us 4M allocations matching the default size used by generic code. +config UNMAP_KERNEL_AT_EL0 + bool "Unmap kernel when running in userspace (aka \"KAISER\")" + default y + help + Some attacks against KASLR make use of the timing difference between + a permission fault which could arise from a page table entry that is + present in the TLB, and a translation fault which always requires a + page table walk. This option defends against these attacks by unmapping + the kernel whilst running in userspace, therefore forcing translation + faults for all of kernel space. + + If unsure, say Y. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on COMPAT -- cgit v1.2.3 From d7013ede260486a55b254e1f95127a2b0f24cdc1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 1 Dec 2017 17:33:48 +0000 Subject: UPSTREAM: arm64: mm: Introduce TTBR_ASID_MASK for getting at the ASID in the TTBR There are now a handful of open-coded masks to extract the ASID from a TTBR value, so introduce a TTBR_ASID_MASK and use that instead. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit b519538dfefc2f8478a1bcb458459c861d431784) [toddpoynor@google.com: fixup context, asm-uaccess.h changes to uaccess.h] Change-Id: Ia456c58c83f9bbd02177cefb8e3106dfffe357cc Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/include/asm/uaccess.h | 7 ++++--- arch/arm64/kernel/entry.S | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 78b79dfc2905..d7750fca1259 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -18,6 +18,7 @@ #define USER_ASID_FLAG (UL(1) << 48) +#define TTBR_ASID_MASK (UL(0xffff) << 48) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 8e265b2c8ce7..c8af98bf9961 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -20,6 +20,7 @@ #include #include +#include #include #ifndef __ASSEMBLY__ @@ -137,7 +138,7 @@ static inline void __uaccess_ttbr0_disable(void) write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); isb(); /* Set reserved ASID */ - ttbr &= ~(0xffffUL << 48); + ttbr &= ~TTBR_ASID_MASK; write_sysreg(ttbr, ttbr1_el1); isb(); } @@ -156,7 +157,7 @@ static inline void __uaccess_ttbr0_enable(void) /* Restore active ASID */ ttbr1 = read_sysreg(ttbr1_el1); - ttbr1 |= ttbr0 & (0xffffUL << 48); + ttbr1 |= ttbr0 & TTBR_ASID_MASK; write_sysreg(ttbr1, ttbr1_el1); isb(); @@ -452,7 +453,7 @@ extern __must_check long strnlen_user(const char __user *str, long n); msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE - bic \tmp1, \tmp1, #(0xffff << 48) + bic \tmp1, \tmp1, #TTBR_ASID_MASK msr ttbr1_el1, \tmp1 // set reserved ASID isb .endm diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 6e9ce0fb147b..c97454e542f4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -152,7 +152,7 @@ alternative_else_nop_endif .if \el != 0 mrs x21, ttbr1_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR -- cgit v1.2.3 From 06fe41f85237c9d3c915447b9f9d51f1e85c4891 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 6 Dec 2017 11:24:02 +0000 Subject: UPSTREAM: arm64: kaslr: Put kernel vectors address in separate data page The literal pool entry for identifying the vectors base is the only piece of information in the trampoline page that identifies the true location of the kernel. This patch moves it into a page-aligned region of the .rodata section and maps this adjacent to the trampoline text via an additional fixmap entry, which protects against any accidental leakage of the trampoline contents. Suggested-by: Ard Biesheuvel Tested-by: Laura Abbott Tested-by: Shanker Donthineni Signed-off-by: Will Deacon (cherry picked from commit 6c27c4082f4f70b9f41df4d0adf51128b40351df) Change-Id: Id125331e7fa5645c801a26843fecca53f8773705 Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/fixmap.h | 1 + arch/arm64/kernel/entry.S | 14 ++++++++++++++ arch/arm64/kernel/vmlinux.lds.S | 5 ++++- arch/arm64/mm/mmu.c | 10 +++++++++- 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 7b1d88c18143..d8e58051f32d 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -53,6 +53,7 @@ enum fixed_addresses { FIX_TEXT_POKE0, #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_DATA, FIX_ENTRY_TRAMP_TEXT, #define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c97454e542f4..fe22ca8ae750 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -949,7 +949,13 @@ __ni_sys_trace: msr tpidrro_el0, x30 // Restored in kernel_ventry .endif tramp_map_kernel x30 +#ifdef CONFIG_RANDOMIZE_BASE + adr x30, tramp_vectors + PAGE_SIZE +alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 + ldr x30, [x30] +#else ldr x30, =vectors +#endif prfm plil1strm, [x30, #(1b - tramp_vectors)] msr vbar_el1, x30 add x30, x30, #(1b - tramp_vectors) @@ -992,6 +998,14 @@ END(tramp_exit_compat) .ltorg .popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE + .pushsection ".rodata", "a" + .align PAGE_SHIFT + .globl __entry_tramp_data_start +__entry_tramp_data_start: + .quad vectors + .popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ /* diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 66c853e4a0c6..34d3ed64fe8e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -257,7 +257,10 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif - +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, + "Entry trampoline text too big") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 29945bc2e3f1..31d16d8257e2 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -436,8 +436,16 @@ static int __init map_entry_trampoline(void) __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, prot, pgd_pgtable_alloc, 0); - /* ...as well as the kernel page table */ + /* Map both the text and data into the kernel page table */ __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern char __entry_tramp_data_start[]; + + __set_fixmap(FIX_ENTRY_TRAMP_DATA, + __pa_symbol(__entry_tramp_data_start), + PAGE_KERNEL_RO); + } + return 0; } core_initcall(map_entry_trampoline); -- cgit v1.2.3 From 753593688bfab89b6cbf085f62fec44012a6820d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 16:15:59 +0000 Subject: arm64: use RET instruction for exiting the trampoline Speculation attacks against the entry trampoline can potentially resteer the speculative instruction stream through the indirect branch and into arbitrary gadgets within the kernel. This patch defends against these attacks by forcing a misprediction through the return stack: a dummy BL instruction loads an entry into the stack, so that the predicted program flow of the subsequent RET instruction is to a branch-to-self instruction which is finally resolved as a branch to the kernel vectors with speculation suppressed. Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/entry.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index fe22ca8ae750..c1ff0999045e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -948,6 +948,9 @@ __ni_sys_trace: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + bl 2f + b . +2: tramp_map_kernel x30 #ifdef CONFIG_RANDOMIZE_BASE adr x30, tramp_vectors + PAGE_SIZE @@ -960,7 +963,7 @@ alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 msr vbar_el1, x30 add x30, x30, #(1b - tramp_vectors) isb - br x30 + ret .endm .macro tramp_exit, regsize = 64 -- cgit v1.2.3 From 5f5e5d4041e31dfe9605713248a7a66754d224c7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 14 Nov 2017 16:19:39 +0000 Subject: arm64: Kconfig: Reword UNMAP_KERNEL_AT_EL0 kconfig entry Although CONFIG_UNMAP_KERNEL_AT_EL0 does make KASLR more robust, it's actually more useful as a mitigation against speculation attacks that can leak arbitrary kernel data to userspace through speculation. Reword the Kconfig help message to reflect this, and make the option depend on EXPERT so that it is on by default for the majority of users. Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 161fced75e7d..5702e7d0f5e0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -734,15 +734,14 @@ config FORCE_MAX_ZONEORDER 4M allocations matching the default size used by generic code. config UNMAP_KERNEL_AT_EL0 - bool "Unmap kernel when running in userspace (aka \"KAISER\")" + bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT default y help - Some attacks against KASLR make use of the timing difference between - a permission fault which could arise from a page table entry that is - present in the TLB, and a translation fault which always requires a - page table walk. This option defends against these attacks by unmapping - the kernel whilst running in userspace, therefore forcing translation - faults for all of kernel space. + Speculation attacks against some high-performance processors can + be used to bypass MMU permission checks and leak kernel data to + userspace. This can be defended against by unmapping the kernel + when running in userspace, mapping it back in on exception entry + via a trampoline page in the vector table. If unsure, say Y. -- cgit v1.2.3 From b54d99aa12b4d02a66bf976fb85b6f0ed9a5a485 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Mon, 8 Aug 2016 16:20:15 -0700 Subject: ANDROID: usb: f_fs: Prevent gadget unbind if it is already unbound Upon usb composition switch there is possibility of ep0 file release happening after gadget driver bind. In case of composition switch from adb to a non-adb composition gadget will never gets bound again resulting into failure of usb device enumeration. Fix this issue by checking FFS_FL_BOUND flag and avoid extra gadget driver unbind if it is already done as part of composition switch. Change-Id: I1638001ff4a94f08224b188aa42425f3d732fa2b Signed-off-by: Hemant Kumar --- drivers/usb/gadget/function/f_fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 7b107e43b1c4..d90bf57ba30e 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3725,7 +3725,8 @@ static void ffs_closed(struct ffs_data *ffs) ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; ffs_dev_unlock(); - unregister_gadget_item(ci); + if (test_bit(FFS_FL_BOUND, &ffs->flags)) + unregister_gadget_item(ci); return; done: ffs_dev_unlock(); -- cgit v1.2.3