From f294526279cda8934b0313ebd02184a16ba888c9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 19 Jul 2009 14:46:09 +0300 Subject: lguest: dereferencing freed mem in add_eventfd() "new" was freed and then dereferenced. Also the return value wasn't being used so I modified the caller as well. Compile tested only. Found by smatch (http://repo.or.cz/w/smatch.git). regards, dan carpenter Signed-off-by: Dan Carpenter Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 9f9a2953b38..407722a8e0c 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -52,8 +52,9 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) new->map[new->num].addr = addr; new->map[new->num].event = eventfd_ctx_fdget(fd); if (IS_ERR(new->map[new->num].event)) { + int err = PTR_ERR(new->map[new->num].event); kfree(new); - return PTR_ERR(new->map[new->num].event); + return err; } new->num++; @@ -83,7 +84,7 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); - return 0; + return err; } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt -- cgit v1.2.3 From 8ef562d112c82ec539775698f8b63ac5ec1bd766 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:43 -0600 Subject: lguest: fix descriptor corruption in example launcher 1d589bb16b825b3a7b4edd34d997f1f1f953033d "Add serial number support for virtio_blk, V4a" extended 'struct virtio_blk_config' to 536 bytes. Lguest and S/390 both use an 8 bit value for the feature length, and this change broke them (if the code is naive). Signed-off-by: Rusty Russell Cc: John Cooper Cc: Christian Borntraeger --- Documentation/lguest/lguest.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 9ebcd6ef361..45d7d6dcae7 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1105,6 +1105,9 @@ static void set_config(struct device *dev, unsigned len, const void *conf) /* Copy in the config information, and store the length. */ memcpy(device_config(dev), conf, len); dev->desc->config_len = len; + + /* Size must fit in config_len field (8 bits)! */ + assert(dev->desc->config_len == len); } /* This routine does all the creation and setup of a new device, including @@ -1515,7 +1518,8 @@ static void setup_block_file(const char *filename) add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - set_config(dev, sizeof(conf), &conf); + /* Don't try to put whole struct: we have 8 bit limit. */ + set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); verbose("device %u: virtblock %llu sectors\n", ++devices.device_num, le64_to_cpu(conf.capacity)); -- cgit v1.2.3 From ff52c3fc7188855ede75d87b022271f0da309e5b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 23 Jul 2009 14:57:37 +0300 Subject: virtio: fix memory leak on device removal Make vp_free_vectors do the reverse of vq_request_vectors. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index bcec78ffc76..ca40517ef9c 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -258,7 +258,6 @@ static void vp_free_vectors(struct virtio_device *vdev) for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(vp_dev->msix_entries[i].vector, vp_dev); - vp_dev->msix_used_vectors = 0; if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ @@ -267,9 +266,16 @@ static void vp_free_vectors(struct virtio_device *vdev) /* Flush the write out to device */ ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - vp_dev->msix_enabled = 0; pci_disable_msix(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + vp_dev->msix_vectors = 0; } + + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_entries); + vp_dev->msix_entries = NULL; } static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, @@ -297,11 +303,11 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); if (!vp_dev->msix_entries) - goto error_entries; + goto error; vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, GFP_KERNEL); if (!vp_dev->msix_names) - goto error_names; + goto error; for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; @@ -314,7 +320,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, name, vp_dev); if (err) - goto error_irq; + goto error; vp_dev->intx_enabled = 1; } else { vp_dev->msix_vectors = err; @@ -328,7 +334,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); @@ -336,7 +342,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; - goto error_irq; + goto error; } } @@ -349,16 +355,12 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; } return 0; -error_irq: +error: vp_free_vectors(vdev); - kfree(vp_dev->msix_names); -error_names: - kfree(vp_dev->msix_entries); -error_entries: return err; } -- cgit v1.2.3 From f6c82507030d61e15928d5cad946d3eac1c4a384 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 15:48:01 +0300 Subject: virtio: delete vq from list This makes delete vq the reverse of find vq. This is required to make it possible to retry find_vqs after a failure, otherwise the list gets corrupted. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index ca40517ef9c..a1cb1a1c652 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -464,7 +464,11 @@ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long size; + unsigned long flags, size; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); -- cgit v1.2.3 From e969fed542cae08cb11d666efac4f7c5d624d09f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 15:48:08 +0300 Subject: virtio: refactor find_vqs This refactors find_vqs, making it more readable and robust, and fixing two regressions from 2.6.30: - double free_irq causing BUG_ON on device removal - probe failure when vq can't be assigned to msi-x vector (reported on old host kernels) Tested-by: Amit Shah Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 212 +++++++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 93 deletions(-) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index a1cb1a1c652..248e00ec4dc 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,8 +52,10 @@ struct virtio_pci_device char (*msix_names)[256]; /* Number of available vectors */ unsigned msix_vectors; - /* Vectors allocated */ + /* Vectors allocated, excluding per-vq vectors if any */ unsigned msix_used_vectors; + /* Whether we have vector per vq */ + bool per_vq_vectors; }; /* Constants for MSI-X */ @@ -278,27 +280,24 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev->msix_entries = NULL; } -static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int *options, int noptions) -{ - int i; - for (i = 0; i < noptions; ++i) - if (!pci_enable_msix(dev, entries, options[i])) - return options[i]; - return -EBUSY; -} - -static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) +static int vp_request_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned i, v; int err = -ENOMEM; - /* We want at most one vector per queue and one for config changes. - * Fallback to separate vectors for config and a shared for queues. - * Finally fall back to regular interrupts. */ - int options[] = { max_vqs + 1, 2 }; - int nvectors = max(options[0], options[1]); + + if (!nvectors) { + /* Can't allocate MSI-X vectors, use regular interrupt */ + vp_dev->msix_vectors = 0; + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, name, vp_dev); + if (err) + return err; + vp_dev->intx_enabled = 1; + return 0; + } vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); @@ -312,41 +311,34 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, - options, ARRAY_SIZE(options)); - if (err < 0) { - /* Can't allocate enough MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - goto error; - vp_dev->intx_enabled = 1; - } else { - vp_dev->msix_vectors = err; - vp_dev->msix_enabled = 1; - - /* Set the vector used for configuration */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-config", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_config_changed, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error; - ++vp_dev->msix_used_vectors; + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); + if (err > 0) + err = -ENOSPC; + if (err) + goto error; + vp_dev->msix_vectors = nvectors; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto error; - } + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; } - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { + if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, @@ -366,13 +358,14 @@ error: static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, + u16 vector) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num, vector; + u16 num; int err; /* Select the queue we're interested in */ @@ -391,7 +384,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = VIRTIO_MSI_NO_VECTOR; + info->vector = vector; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -415,22 +408,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - /* allocate per-vq vector if available and necessary */ - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { - vector = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), name); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vq); - if (err) - goto out_request_irq; - info->vector = vector; - ++vp_dev->msix_used_vectors; - } else - vector = VP_MSIX_VQ_VECTOR; - - if (callback && vp_dev->msix_enabled) { + if (vector != VIRTIO_MSI_NO_VECTOR) { iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (vector == VIRTIO_MSI_NO_VECTOR) { @@ -446,11 +424,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, return vq; out_assign: - if (info->vector != VIRTIO_MSI_NO_VECTOR) { - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - --vp_dev->msix_used_vectors; - } -out_request_irq: vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); @@ -472,9 +445,6 @@ static void vp_del_vq(struct virtqueue *vq) iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); - if (info->vector != VIRTIO_MSI_NO_VECTOR) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); @@ -495,36 +465,62 @@ static void vp_del_vq(struct virtqueue *vq) /* the config->del_vqs() implementation */ static void vp_del_vqs(struct virtio_device *vdev) { + struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq, *n; + struct virtio_pci_vq_info *info; - list_for_each_entry_safe(vq, n, &vdev->vqs, list) + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + info = vq->priv; + if (vp_dev->per_vq_vectors) + free_irq(vp_dev->msix_entries[info->vector].vector, vq); vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; vp_free_vectors(vdev); } -/* the config->find_vqs() implementation */ -static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) +static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[], + int nvectors, + bool per_vq_vectors) { - int vectors = 0; - int i, err; - - /* How many vectors would we like? */ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 vector; + int i, err, allocated_vectors; - err = vp_request_vectors(vdev, vectors); + err = vp_request_vectors(vdev, nvectors, per_vq_vectors); if (err) goto error_request; + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) + if (!callbacks[i] || !vp_dev->msix_enabled) + vector = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + vector = allocated_vectors++; + else + vector = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); goto error_find; + } + /* allocate per-vq irq if available and necessary */ + if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, + "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(vp_dev->msix_entries[vector].vector, + vring_interrupt, 0, + vp_dev->msix_names[vector], vqs[i]); + if (err) { + vp_del_vq(vqs[i]); + goto error_find; + } + } } return 0; @@ -532,7 +528,37 @@ error_find: vp_del_vqs(vdev); error_request: - return PTR_ERR(vqs[i]); + return err; +} + +/* the config->find_vqs() implementation */ +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int vectors = 0; + int i, uninitialized_var(err); + + /* How many vectors would we like? */ + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++vectors; + + /* We want at most one vector per queue and one for config changes. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + vectors + 1, true); + if (!err) + return 0; + /* Fallback to separate vectors for config and a shared for queues. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 2, false); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 0, false); + return err; } static struct virtio_config_ops virtio_pci_config_ops = { -- cgit v1.2.3 From 2e04ef76916d1e29a077ea9d0f2003c8fd86724d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: fix comment style I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- Documentation/lguest/lguest.c | 540 ++++++++++++++++++++++------------ arch/x86/include/asm/lguest.h | 3 +- arch/x86/include/asm/lguest_hcall.h | 10 +- arch/x86/lguest/boot.c | 428 +++++++++++++++++---------- arch/x86/lguest/i386_head.S | 110 ++++--- drivers/lguest/core.c | 114 ++++--- drivers/lguest/hypercalls.c | 141 ++++++--- drivers/lguest/interrupts_and_traps.c | 288 ++++++++++++------ drivers/lguest/lg.h | 23 +- drivers/lguest/lguest_device.c | 150 ++++++---- drivers/lguest/lguest_user.c | 137 ++++++--- drivers/lguest/page_tables.c | 427 ++++++++++++++++++--------- drivers/lguest/segments.c | 106 ++++--- drivers/lguest/x86/core.c | 372 +++++++++++++++-------- drivers/lguest/x86/switcher_32.S | 18 +- include/linux/lguest.h | 36 ++- include/linux/lguest_launcher.h | 18 +- 17 files changed, 1906 insertions(+), 1015 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 45d7d6dcae7..aa66a52b73e 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,9 @@ -/*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and - * the virtual devices, then opens /dev/lguest to tell the kernel - * about the Guest and control it. :*/ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include @@ -46,13 +48,15 @@ #include "linux/virtio_rng.h" #include "linux/virtio_ring.h" #include "asm/bootparam.h" -/*L:110 We can ignore the 39 include files we need for this program, but I do - * want to draw attention to the use of kernel-style types. +/*L:110 + * We can ignore the 39 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. */ + * use %llu in printf for any u64. + */ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; @@ -69,8 +73,10 @@ typedef uint8_t u8; /* This will occupy 3 pages: it must be a power of 2. */ #define VIRTQUEUE_NUM 256 -/*L:120 verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. */ +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ static bool verbose; #define verbose(args...) \ do { if (verbose) printf(args); } while(0) @@ -100,8 +106,7 @@ struct device_list /* A single linked list of devices. */ struct device *dev; - /* And a pointer to the last device for easy append and also for - * configuration appending. */ + /* And a pointer to the last device for easy append. */ struct device *lastdev; }; @@ -168,20 +173,24 @@ static char **main_args; /* The original tty settings to restore on exit. */ static struct termios orig_term; -/* We have to be careful with barriers: our devices are all run in separate +/* + * We have to be careful with barriers: our devices are all run in separate * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. */ + * in precise order. + */ #define wmb() __asm__ __volatile__("" : : : "memory") #define mb() __asm__ __volatile__("" : : : "memory") -/* Convert an iovec element to the given type. +/* + * Convert an iovec element to the given type. * * This is a fairly ugly trick: we need to know the size of the type and * alignment requirement to check the pointer is kosher. It's also nice to * have the name of the type in case we report failure. * * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. */ + * have a macro which sets them all up and passes to the real function. + */ #define convert(iov, type) \ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) @@ -198,8 +207,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, /* Wrapper for the last available index. Makes it easier to change. */ #define lg_last_avail(vq) ((vq)->last_avail_idx) -/* The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. */ +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ #define cpu_to_le16(v16) (v16) #define cpu_to_le32(v32) (v32) #define cpu_to_le64(v64) (v64) @@ -241,11 +252,12 @@ static u8 *get_feature_bits(struct device *dev) + dev->num_vq * sizeof(struct lguest_vqconfig); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone likes to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. * * The Launcher sets up a big chunk of memory to be the Guest's "physical" * memory and stores it in "guest_base". In other words, Guest physical == @@ -253,7 +265,8 @@ static u8 *get_feature_bits(struct device *dev) * * This can be tough to get your head around, but usually it just means that we * use these trivial conversion functions when the Guest gives us it's - * "physical" addresses: */ + * "physical" addresses: + */ static void *from_guest_phys(unsigned long addr) { return guest_base + addr; @@ -268,7 +281,8 @@ static unsigned long to_guest_phys(const void *addr) * Loading the Kernel. * * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: */ + * error-checking code cluttering the callers: + */ static int open_or_die(const char *name, int flags) { int fd = open(name, flags); @@ -283,8 +297,10 @@ static void *map_zeroed_pages(unsigned int num) int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; - /* We use a private mapping (ie. if we write to the page, it will be - * copied). */ + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). + */ addr = mmap(NULL, getpagesize() * num, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) @@ -305,20 +321,24 @@ static void *get_pages(unsigned int num) return addr; } -/* This routine is used to load the kernel or initrd. It tries mmap, but if +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. */ + * it falls back to reading the memory in. + */ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) { ssize_t r; - /* We map writable even though for some segments are marked read-only. + /* + * We map writable even though for some segments are marked read-only. * The kernel really wants to be writable: it patches its own * instructions. * * MAP_PRIVATE means that the page won't be copied until a write is * done to it. This allows us to share untouched memory between - * Guests. */ + * Guests. + */ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) return; @@ -329,7 +349,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } -/* This routine takes an open vmlinux image, which is in ELF, and maps it into +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into * the Guest memory. ELF = Embedded Linking Format, which is the format used * by all modern binaries on Linux including the kernel. * @@ -337,23 +358,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) * address. We use the physical address; the Guest will map itself to the * virtual address. * - * We return the starting address. */ + * We return the starting address. + */ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - /* Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. */ + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. + */ if (ehdr->e_type != ET_EXEC || ehdr->e_machine != EM_386 || ehdr->e_phentsize != sizeof(Elf32_Phdr) || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) errx(1, "Malformed elf header"); - /* An ELF executable contains an ELF header and a number of "program" + /* + * An ELF executable contains an ELF header and a number of "program" * headers which indicate which parts ("segments") of the program to - * load where. */ + * load where. + */ /* We read in all the program headers at once: */ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) @@ -361,8 +387,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. */ + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -380,13 +408,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) return ehdr->e_entry; } -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We used to have to - * perform some hairy magic because the unpacking code scared me. +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. * * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! */ + * the funky header so we know where in the file to load, and away we go! + */ static unsigned long load_bzimage(int fd) { struct boot_params boot; @@ -394,8 +424,10 @@ static unsigned long load_bzimage(int fd) /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); - /* Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ lseek(fd, 0, SEEK_SET); read(fd, &boot, sizeof(boot)); @@ -414,9 +446,11 @@ static unsigned long load_bzimage(int fd) return boot.hdr.code32_start; } -/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. */ + * work, we can load those, too. + */ static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -433,24 +467,28 @@ static unsigned long load_kernel(int fd) return load_bzimage(fd); } -/* This is a trivial little helper to align pages. Andi Kleen hated it because +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because * it calls getpagesize() twice: "it's dumb code." * * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. */ + * necessary. I leave this code as a reaction against that. + */ static inline unsigned long page_align(unsigned long addr) { /* Add upwards and truncate downwards. */ return ((addr + getpagesize()-1) & ~(getpagesize()-1)); } -/*L:180 An "initial ram disk" is a disk image loaded into memory along with - * the kernel which the kernel can use to boot from without needing any - * drivers. Most distributions now use this as standard: the initrd contains - * the code to load the appropriate driver modules for the current machine. +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. * * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). */ + * kernels. He sent me this (and tells me when I break it). + */ static unsigned long load_initrd(const char *name, unsigned long mem) { int ifd; @@ -462,12 +500,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem) if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. */ + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ len = page_align(st.st_size); map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. */ + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ close(ifd); verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); @@ -476,8 +518,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem) } /*:*/ -/* Simple routine to roll all the commandline arguments together with spaces - * between them. */ +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ static void concat(char *dst, char *args[]) { unsigned int i, len = 0; @@ -494,10 +538,12 @@ static void concat(char *dst, char *args[]) dst[len] = '\0'; } -/*L:185 This is where we actually tell the kernel to initialize the Guest. We +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. */ + * entry point for the Guest. + */ static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, @@ -522,20 +568,26 @@ static void tell_kernel(unsigned long start) static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line) { - /* We have to separately check addr and addr+size, because size could - * be huge and addr + size might wrap around. */ + /* + * We have to separately check addr and addr+size, because size could + * be huge and addr + size might wrap around. + */ if (addr >= guest_limit || addr + size >= guest_limit) errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* We return a pointer for the caller's convenience, now we know it's - * safe to use. */ + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. + */ return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* Each buffer in the virtqueues is actually a chain of descriptors. This +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. */ + * at the end. + */ static unsigned next_desc(struct vring_desc *desc, unsigned int i, unsigned int max) { @@ -576,12 +628,14 @@ static void trigger_irq(struct virtqueue *vq) err(1, "Triggering irq %i", vq->config.irq); } -/* This looks in the virtqueue and for the first available buffer, and converts +/* + * This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. */ + * This function returns the descriptor number found. + */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) @@ -599,8 +653,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* OK, now we need to know about added descriptors. */ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - /* They could have slipped one in as we were doing that: make - * sure it's written, then check again. */ + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ mb(); if (last_avail != vq->vring.avail->idx) { vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; @@ -620,8 +676,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; lg_last_avail(vq)++; @@ -636,8 +694,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, desc = vq->vring.desc; i = head; - /* If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. */ + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ if (desc[i].flags & VRING_DESC_F_INDIRECT) { if (desc[i].len % sizeof(struct vring_desc)) errx(1, "Invalid size for indirect buffer table"); @@ -656,8 +716,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; @@ -671,14 +733,18 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, return head; } -/* After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). */ +/* + * After we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using trigger_irq(). + */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { struct vring_used_elem *used; - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; @@ -698,7 +764,8 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) /* * The Console * - * We associate some data with the console for our exit hack. */ + * We associate some data with the console for our exit hack. + */ struct console_abort { /* How many times have they hit ^C? */ @@ -725,20 +792,24 @@ static void console_input(struct virtqueue *vq) if (len <= 0) { /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* For simplicity, dying threads kill the whole Launcher. So - * just nap here. */ + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ for (;;) pause(); } add_used_and_trigger(vq, head, len); - /* Three ^C within one second? Exit. + /* + * Three ^C within one second? Exit. * * This is such a hack, but works surprisingly well. Each ^C has to * be in a buffer by itself, so they can't be too fast. But we check * that we get three within about a second, so they can't be too - * slow. */ + * slow. + */ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; return; @@ -809,8 +880,7 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This is where we handle packets coming in from the tun device to our - * Guest. */ +/* This handles packets coming in from the tun device to our Guest. */ static void net_input(struct virtqueue *vq) { int len; @@ -842,8 +912,10 @@ static int do_thread(void *_vq) return 0; } -/* When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! */ +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ static void kill_launcher(int signal) { kill(0, SIGTERM); @@ -880,9 +952,10 @@ static void reset_device(struct device *dev) static void create_thread(struct virtqueue *vq) { - /* Create stack for thread and run it. Since stack grows - * upwards, we point the stack pointer to the end of this - * region. */ + /* + * Create stack for thread and run it. Since the stack grows upwards, + * we point the stack pointer to the end of this region. + */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, vq->config.pfn*getpagesize(), 0 }; @@ -981,8 +1054,11 @@ static void handle_output(unsigned long addr) } } - /* Early console write is done using notify on a nul-terminated string - * in Guest memory. */ + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ if (addr >= guest_limit) errx(1, "Bad NOTIFY %#lx", addr); @@ -998,10 +1074,12 @@ static void handle_output(unsigned long addr) * routines to allocate and manage them. */ -/* The layout of the device page is a "struct lguest_device_desc" followed by a +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an * array of configuration bytes. This routine returns the configuration - * pointer. */ + * pointer. + */ static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) @@ -1009,9 +1087,11 @@ static u8 *device_config(const struct device *dev) + dev->feature_len * 2; } -/* This routine allocates a new "struct lguest_device_desc" from descriptor +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. */ + * that descriptor. + */ static struct lguest_device_desc *new_dev_desc(u16 type) { struct lguest_device_desc d = { .type = type }; @@ -1032,8 +1112,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type) return memcpy(p, &d, sizeof(d)); } -/* Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. */ +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*service)(struct virtqueue *)) { @@ -1061,10 +1143,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - /* Append virtqueue to this device's descriptor. We use + /* + * Append virtqueue to this device's descriptor. We use * device_config() to get the end of the device's current virtqueues; * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. */ + * yet, otherwise we'd be overwriting them. + */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); dev->num_vq++; @@ -1072,14 +1156,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. */ + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; } -/* The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. */ +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1093,9 +1181,11 @@ static void add_feature(struct device *dev, unsigned bit) features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } -/* This routine sets the configuration fields for an existing device's +/* + * This routine sets the configuration fields for an existing device's * descriptor. It only works for the last device, but that's OK because that's - * how we use it. */ + * how we use it. + */ static void set_config(struct device *dev, unsigned len, const void *conf) { /* Check we haven't overflowed our single page. */ @@ -1110,10 +1200,12 @@ static void set_config(struct device *dev, unsigned len, const void *conf) assert(dev->desc->config_len == len); } -/* This routine does all the creation and setup of a new device, including +/* + * This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. * - * See what I mean about userspace being boring? */ + * See what I mean about userspace being boring? + */ static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); @@ -1126,10 +1218,12 @@ static struct device *new_device(const char *name, u16 type) dev->num_vq = 0; dev->running = false; - /* Append to device list. Prepending to a single-linked list is + /* + * Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ + * is eth0, the first block device /dev/vda, etc. + */ if (devices.lastdev) devices.lastdev->next = dev; else @@ -1139,8 +1233,10 @@ static struct device *new_device(const char *name, u16 type) return dev; } -/* Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. */ +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ static void setup_console(void) { struct device *dev; @@ -1148,8 +1244,10 @@ static void setup_console(void) /* If we can save the initial standard input settings... */ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { struct termios term = orig_term; - /* Then we turn off echo, line buffering and ^C etc. We want a - * raw input stream to the Guest. */ + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); } @@ -1160,10 +1258,12 @@ static void setup_console(void) dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - /* The console needs two virtqueues: the input then the output. When + /* + * The console needs two virtqueues: the input then the output. When * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to - * stdout. */ + * stdout. + */ add_virtqueue(dev, VIRTQUEUE_NUM, console_input); add_virtqueue(dev, VIRTQUEUE_NUM, console_output); @@ -1171,7 +1271,8 @@ static void setup_console(void) } /*:*/ -/*M:010 Inter-guest networking is an interesting area. Simplest is to have a +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * @@ -1185,7 +1286,8 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. :*/ + * Finally, we could implement a virtio network switch in the kernel. +:*/ static u32 str2ip(const char *ipaddr) { @@ -1210,11 +1312,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6]) mac[5] = m[5]; } -/* This code is "adapted" from libbridge: it attaches the Host end of the +/* + * This code is "adapted" from libbridge: it attaches the Host end of the * network device to the bridge device specified by the command line. * * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. */ + * dislike bridging), and I just try not to break it. + */ static void add_to_bridge(int fd, const char *if_name, const char *br_name) { int ifidx; @@ -1234,9 +1338,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) err(1, "can't add %s to bridge %s", if_name, br_name); } -/* This sets up the Host end of the network device with an IP address, brings +/* + * This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. */ + * pointer. + */ static void configure_device(int fd, const char *tapif, u32 ipaddr) { struct ifreq ifr; @@ -1263,10 +1369,12 @@ static int get_tun_device(char tapif[IFNAMSIZ]) /* Start with this zeroed. Messy but sure. */ memset(&ifr, 0, sizeof(ifr)); - /* We open the /dev/net/tun device and tell it we want a tap device. A + /* + * We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it - * works now! */ + * works now! + */ netfd = open_or_die("/dev/net/tun", O_RDWR); ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; strcpy(ifr.ifr_name, "tap%d"); @@ -1277,18 +1385,22 @@ static int get_tun_device(char tapif[IFNAMSIZ]) TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) err(1, "Could not set features for tun device"); - /* We don't need checksums calculated for packets coming in this - * device: trust us! */ + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ ioctl(netfd, TUNSETNOCSUM, 1); memcpy(tapif, ifr.ifr_name, IFNAMSIZ); return netfd; } -/*L:195 Our network is a Host<->Guest network. This can either use bridging or +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or * routing, but the principle is the same: it uses the "tun" device to inject * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. */ + * just shunt packets between the Guest and the tun device. + */ static void setup_tun_net(char *arg) { struct device *dev; @@ -1305,13 +1417,14 @@ static void setup_tun_net(char *arg) dev = new_device("net", VIRTIO_ID_NET); dev->priv = net_info; - /* Network devices need a receive and a send queue, just like - * console. */ + /* Network devices need a recv and a send queue, just like console. */ add_virtqueue(dev, VIRTQUEUE_NUM, net_input); add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - /* We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! */ + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! + */ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); if (ipfd < 0) err(1, "opening IP socket"); @@ -1366,7 +1479,8 @@ static void setup_tun_net(char *arg) devices.device_num, tapif, arg); } -/* Our block (disk) device should be really simple: the Guest asks for a block +/* + * Our block (disk) device should be really simple: the Guest asks for a block * number and we read or write that position in the file. Unfortunately, that * was amazingly slow: the Guest waits until the read is finished before * running anything else, even if it could have been doing useful work. @@ -1374,7 +1488,9 @@ static void setup_tun_net(char *arg) * We could use async I/O, except it's reputed to suck so hard that characters * actually go missing from your code when you try to use it. * - * So we farm the I/O out to thread, and communicate with it via a pipe. */ + * So this was one reason why lguest now does all virtqueue servicing in + * separate threads: it's more efficient and more like a real device. + */ /* This hangs off device->priv. */ struct vblk_info @@ -1412,9 +1528,11 @@ static void blk_request(struct virtqueue *vq) /* Get the next request. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - /* Every block request should contain at least one output buffer + /* + * Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). */ + * input buffer (to hold the result). + */ if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); @@ -1423,33 +1541,41 @@ static void blk_request(struct virtqueue *vq) in = convert(&iov[out_num+in_num-1], u8); off = out->sector * 512; - /* The block device implements "barriers", where the Guest indicates + /* + * The block device implements "barriers", where the Guest indicates * that it wants all previous writes to occur before this write. We * don't have a way of asking our kernel to do a barrier, so we just - * synchronize all the data in the file. Pretty poor, no? */ + * synchronize all the data in the file. Pretty poor, no? + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. */ + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); *in = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { - /* Write */ - - /* Move to the right location in the block file. This can fail - * if they try to write past end. */ + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i\n", out->sector, ret); - /* Grr... Now we know how long the descriptor they sent was, we + /* + * Grr... Now we know how long the descriptor they sent was, we * make sure they didn't try to write over the end of the block - * file (possibly extending it). */ + * file (possibly extending it). + */ if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); @@ -1459,10 +1585,12 @@ static void blk_request(struct virtqueue *vq) wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { - /* Read */ - - /* Move to the right location in the block file. This can fail - * if they try to read past end. */ + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); @@ -1477,10 +1605,12 @@ static void blk_request(struct virtqueue *vq) } } - /* OK, so we noted that it was pretty poor to use an fdatasync as a + /* + * OK, so we noted that it was pretty poor to use an fdatasync as a * barrier. But Christoph Hellwig points out that we need a sync * *afterwards* as well: "Barriers specify no reordering to the front - * or the back." And Jens Axboe confirmed it, so here we are: */ + * or the back." And Jens Axboe confirmed it, so here we are: + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); @@ -1494,7 +1624,7 @@ static void setup_block_file(const char *filename) struct vblk_info *vblk; struct virtio_blk_config conf; - /* The device responds to return from I/O thread. */ + /* Creat the device. */ dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. */ @@ -1513,8 +1643,10 @@ static void setup_block_file(const char *filename) /* Tell Guest how many sectors this device has. */ conf.capacity = cpu_to_le64(vblk->len / 512); - /* Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. */ + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); @@ -1525,16 +1657,18 @@ static void setup_block_file(const char *filename) ++devices.device_num, le64_to_cpu(conf.capacity)); } -struct rng_info { - int rfd; -}; - -/* Our random number generator device reads from /dev/random into the Guest's +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * - * The same logic applies, however. */ + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + static void rng_input(struct virtqueue *vq) { int len; @@ -1547,9 +1681,11 @@ static void rng_input(struct virtqueue *vq) if (out_num) errx(1, "Output buffers in rng?"); - /* This is why we convert to iovecs: the readv() call uses them, and so + /* + * This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. */ + * fill it. + */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); if (len <= 0) @@ -1562,15 +1698,18 @@ static void rng_input(struct virtqueue *vq) add_used(vq, head, totlen); } -/* And this creates a "hardware" random number device for the Guest. */ +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ static void setup_rng(void) { struct device *dev; struct rng_info *rng_info = malloc(sizeof(*rng_info)); + /* Our device's privat info simply contains the /dev/random fd. */ rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - /* The device responds to return from I/O thread. */ + /* Create the new device. */ dev = new_device("rng", VIRTIO_ID_RNG); dev->priv = rng_info; @@ -1586,8 +1725,10 @@ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Since we don't track all open fds, we simply close everything beyond - * stderr. */ + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ for (i = 3; i < FD_SETSIZE; i++) close(i); @@ -1598,8 +1739,10 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. */ +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ static void __attribute__((noreturn)) run_guest(void) { for (;;) { @@ -1634,7 +1777,7 @@ static void __attribute__((noreturn)) run_guest(void) * * Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". - :*/ +:*/ static struct option opts[] = { { "verbose", 0, NULL, 'v' }, @@ -1655,8 +1798,7 @@ static void usage(void) /*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint and size of the - * (optional) initrd. */ + /* Memory, code startpoint and size of the (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries. */ int i, c; @@ -1668,24 +1810,30 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* First we initialize the device list. We keep a pointer to the last + /* + * First we initialize the device list. We keep a pointer to the last * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). */ + * remember that 0 is used by the timer). + */ devices.lastdev = NULL; devices.next_irq = 1; cpu_id = 0; - /* We need to know how much memory so we can set up the device + /* + * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount - * of memory now. */ + * of memory now. + */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { mem = atoi(argv[i]) * 1024 * 1024; - /* We start by mapping anonymous pages over all of + /* + * We start by mapping anonymous pages over all of * guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it - * tries to access it. */ + * tries to access it. + */ guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; @@ -1718,8 +1866,10 @@ int main(int argc, char *argv[]) usage(); } } - /* After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. */ + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. + */ if (optind + 2 > argc) usage(); @@ -1737,20 +1887,26 @@ int main(int argc, char *argv[]) /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); - /* These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. */ + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ boot->hdr.type_of_loader = 0xFF; } - /* The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. */ + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* The boot header contains a command line pointer: we put the command - * line after the boot header. */ + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); /* We use a simple helper to copy the arguments separated by spaces. */ concat((char *)(boot + 1), argv+optind+2); @@ -1764,8 +1920,10 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. */ + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ tell_kernel(start); /* Ensure that we terminate if a child dies. */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d..5136dad57cb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,7 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE #define SWITCHER_ADDR 0xFFE00000 #else diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 33600a66755..cceb73e12e5 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,7 +30,8 @@ #include #include -/*G:030 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * @@ -41,16 +42,15 @@ * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ unsigned long arg0, arg1, arg2, arg3, arg4; }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d46..025c04d18f2 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,7 +87,8 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, @@ -94,7 +97,8 @@ struct lguest_data lguest_data = { * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ @@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { @@ -208,9 +216,11 @@ static void lguest_end_context_switch(struct task_struct *next) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; @@ -222,13 +232,15 @@ static void irq_disable(void) lguest_data.irq_enabled = 0; } -/* Let's pause a moment. Remember how I said these are called so often? +/* + * Let's pause a moment. Remember how I said these are called so often? * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to * break some rules. In particular, these functions are assumed to save their * own registers if they need to: normal C functions assume they can trash the * eax register. To use normal C functions, we use * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ + * C function, then restores it. + */ PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /*:*/ @@ -237,18 +249,20 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * Note that we don't check for outstanding interrupts when we re-enable them + * (or when we unmask an interrupt). This seems to work for the moment, since + * interrupts are rare and we'll just get the interrupt on the next timer tick, + * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would + * be to put the "irq_enabled" field in a page by itself, and have the Host + * write-protect it when an interrupt comes in when irqs are disabled. There + * will then be a page fault as soon as interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -261,10 +275,12 @@ extern void lg_restore_fl(unsigned long flags); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. + */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -272,9 +288,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -305,9 +323,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -317,29 +337,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. @@ -347,19 +374,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -371,7 +400,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -379,43 +409,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 0: /* ID and highest CPUID. Futureproof a little by sticking to - * older ones. */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: if (*ax > 5) *ax = 5; break; - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ *dx &= 0x07808151; - /* The Host can do a nice optimization if it knows that the + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. + */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ *dx &= ~(1 << 20); break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -430,7 +480,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -443,18 +494,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -463,10 +518,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -538,10 +595,12 @@ static void lguest_write_cr4(unsigned long val) * the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page +/* + * The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space. We set the entry then tell the Host the * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ + * process, so we need to tell the Host which one we're changing (mm->pgd). + */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -560,10 +619,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd * to set a middle-level entry when PAE is activated. + * * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ + * and the index of the entry we changed. + */ #ifdef CONFIG_X86_PAE static void lguest_set_pud(pud_t *pudp, pud_t pudval) { @@ -582,8 +644,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #else -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); @@ -592,7 +653,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -600,7 +662,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { native_set_pte(ptep, pteval); @@ -628,7 +691,8 @@ void lguest_pmd_clear(pmd_t *pmdp) } #endif -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -637,24 +701,29 @@ void lguest_pmd_clear(pmd_t *pmdp) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. + */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -691,23 +760,27 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ + /* Some systems map "vectors" to interrupts weirdly. Not us! */ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } @@ -729,31 +802,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; @@ -777,9 +858,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -829,8 +912,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -841,10 +926,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -868,14 +955,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -889,7 +978,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -903,11 +993,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. + */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -956,11 +1048,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -991,8 +1085,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -1001,16 +1097,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -1021,8 +1118,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -1049,7 +1148,8 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -1060,9 +1160,11 @@ static const struct lguest_insns [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. + */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1074,8 +1176,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). */ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1084,22 +1185,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:029 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); @@ -1107,11 +1214,11 @@ __init void lguest_init(void) pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1132,7 +1239,7 @@ __init void lguest_init(void) pv_cpu_ops.start_context_switch = paravirt_start_context_switch; pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1154,54 +1261,71 @@ __init void lguest_init(void) pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). + */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); /* As described in head_32.S, we map the first 128M of memory. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1218,22 +1342,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. + */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. + */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd..db6aa95eb05 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include #include -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,13 +37,14 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ @@ -48,58 +53,74 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. * * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). + */ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have + /* + * But now we need to check if the Host wants to know: there might have * been interrupts waiting to be delivered, in which case it will have * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ + * jump to send_interrupts, otherwise we're done. + */ testl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using + /* + * One cool thing about x86 is that you can do many things without using * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ + * restore any registers at all! + */ ret send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, + /* + * OK, now we need a register: eax is used for the hypercall number, * which is LHCALL_SEND_INTERRUPTS. * * We used not to bother with this pending detection at all, which was * much simpler. Sooner or later the Host would realize it had to * send us an interrupt. But that turns out to make performance 7 * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ + * way. + */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older + /* + * This is a vmcall instruction (same thing that KVM uses). Older * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ + * create one manually here. + */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ popl %eax ret -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ + * enabling interrupts again, if it's 0 we're leaving them off. + */ ENTRY(lg_restore_fl) /* This is just "lguest_data.irq_enabled = flags;" */ movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and + /* + * Now, if the %eax value has enabled interrupts and * lguest_data.irq_pending is set, we want to tell the Host so it can * deliver any outstanding interrupts. Fortunately, both values will * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ + * jump to send_interrupts. + */ testl lguest_data+LGUEST_DATA_irq_pending, %eax jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ @@ -109,22 +130,24 @@ ENTRY(lg_restore_fl) .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). +/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -148,15 +171,18 @@ ENTRY(lg_restore_fl) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. + */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9b8eb..cd058bc903f 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include #include #include @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. + */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. + */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from _switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. */ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,8 +215,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher, in which case we return from the read() now. + */ if (cpu->pending_notify) { if (!send_notify_to_eventfd(cpu)) { if (put_user(cpu->pending_notify, user)) @@ -209,29 +231,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +271,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +361,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa19cb7..787ab4bc09f 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. + */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in four + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. + */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. + */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,40 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". + */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. + */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. +:*/ /*H:100 * Hypercalls @@ -229,17 +264,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. * Normally it doesn't matter: the Guest will run again and @@ -248,13 +288,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d50..18648180db0 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. * There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include #include #include @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. + */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. + */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. + */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. + */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". + */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). + */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. + */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. + */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? */ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). @@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. + */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. + */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. + */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. + */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. + */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. + */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c59192379..74c0db691b5 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -54,13 +54,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. */ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -96,8 +96,11 @@ struct lguest unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +125,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +145,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. + */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cdac88b..cc000e79c3d 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include #include #include @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. + */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. + */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. */ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -200,13 +216,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. + */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +235,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the first virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -225,7 +246,8 @@ extern void lguest_setup_irq(unsigned int irq); * simpler for the Host to simply tell us where the pages are. * * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + * function. + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +266,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +285,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. + */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +299,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +389,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. They were removed * early on because they were never used. @@ -371,14 +405,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -390,15 +424,19 @@ static void add_lguest_device(struct lguest_device_desc *d, ldev->vdev.dev.parent = lguest_root; /* We have a unique device index thanks to the dev_index counter. */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". */ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. * * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". + */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 407722a8e0c..7e92017103d 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 + * This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will * tell us the Guest's memory layout, pagetable, entry point and kernel address * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * or the Guest doing a NOTIFY out to the Launcher. +:*/ #include #include #include @@ -37,8 +39,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -61,8 +65,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) /* Now put new one in place. */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then delete it. + */ synchronize_rcu(); kfree(old); @@ -87,8 +93,10 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. + */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -102,8 +110,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -139,8 +149,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -148,8 +160,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. + */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -164,8 +178,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. + */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -173,29 +189,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. * @@ -207,14 +232,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -249,8 +275,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. */ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -275,7 +303,8 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use * writes of other values to send interrupts. @@ -283,12 +312,15 @@ unlock: * Note that we overload the "offset" in the /dev/lguest file to indicate what * CPU number we're dealing with. Currently this is always 0, since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. + */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -323,13 +355,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -339,8 +373,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -351,8 +387,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. + */ mmput(lg->cpus[i].mm); } @@ -361,8 +399,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -386,7 +426,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -394,8 +435,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1abda24..3da902e4b4c 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,10 +19,12 @@ #include #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code @@ -45,16 +49,19 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is +/* + * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ + * page. + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,13 +71,16 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. + */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it +/*H:320 + * The page table code is curly enough to need helper functions to keep it * clear and clean. * * There are two functions which return pointers to the shadow (aka "real") @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. */ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These two functions just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -175,17 +192,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. + */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +219,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +263,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. */ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +304,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -298,22 +330,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +360,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,17 +372,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif gpte = lgread(cpu, gpte_ptr, pte_t); @@ -353,8 +397,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). + */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +408,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +421,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. + * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +467,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,16 +488,20 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) @@ -485,9 +549,11 @@ static void release_pgd(pgd_t *spgd) /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +565,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +579,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +622,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. */ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +636,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +650,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +664,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +677,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! + */ *blank_pgdir = 1; } #else @@ -615,19 +696,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +722,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +743,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. + */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +756,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +769,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +810,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. */ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +848,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. + */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -802,12 +905,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/* + * Once we know how much memory we have we can construct simple identity (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +930,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. + */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -839,8 +946,10 @@ static unsigned long setup_pagetables(struct lguest *lg, #ifdef CONFIG_X86_PAE pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,8 +957,10 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ #ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { @@ -880,15 +991,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: remember where + * this is. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,14 +1013,18 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; @@ -931,17 +1050,21 @@ void page_table_guest_data_init(struct lg_cpu *cpu) /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. + */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1087,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -990,20 +1115,24 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1148,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1174,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1192,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a1875..951c57b0a7e 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. + */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. + */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thead-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. */ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. + */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309..96f7d88ec7f 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include #include #include @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore @@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * * We could also try using this hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in ,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc15318a80..6dec0979383 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 astride both the * Host and Guest to do the low-level Guest<->Host switch. It is as simple as * it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! diff --git a/include/linux/lguest.h b/include/linux/lguest.h index dbf2479e808..0a3a11afd64 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -1,5 +1,7 @@ -/* Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. */ +/* + * Things the lguest guest needs to know. Note: like all lguest interfaces, + * this is subject to wild and random change between versions. + */ #ifndef _LINUX_LGUEST_H #define _LINUX_LGUEST_H @@ -11,32 +13,42 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:031 The second method of communicating with the Host is to via "struct +/*G:031 + * The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. :*/ + * this is, the Guest and Host both publish information in it. +:*/ struct lguest_data { - /* 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. */ + /* + * 512 == enabled (same as eflags in normal hardware). The Guest + * changes interrupts so often that a hypercall is too slow. + */ unsigned int irq_enabled; /* Fine-grained interrupt disabling by the Guest */ DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - /* The Host writes the virtual address of the last page fault here, + /* + * The Host writes the virtual address of the last page fault here, * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. */ + * this address would normally be found. + */ unsigned long cr2; /* Wallclock time set by the Host. */ struct timespec time; - /* Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + /* + * Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). + */ int irq_pending; - /* Async hypercall ring. Instead of directly making hypercalls, we can + /* + * Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. */ + * This batching can be quite efficient. + */ /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bfefbdf7498..495203ff221 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -29,8 +29,10 @@ struct lguest_device_desc { __u8 type; /* The number of virtqueues (first in config array) */ __u8 num_vq; - /* The number of bytes of feature bits. Multiply by 2: one for host - * features and one for Guest acknowledgements. */ + /* + * The number of bytes of feature bits. Multiply by 2: one for host + * features and one for Guest acknowledgements. + */ __u8 feature_len; /* The number of bytes of the config array after virtqueues. */ __u8 config_len; @@ -39,8 +41,10 @@ struct lguest_device_desc { __u8 config[0]; }; -/*D:135 This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. */ +/*D:135 + * This is how we expect the device configuration field for a virtqueue + * to be laid out in config space. + */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; @@ -61,7 +65,9 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ }; -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize for historical reasons. */ +/* + * The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. + */ #define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3 From a91d74a3c4de8115295ee87350c13a329164aaaf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: update commentry Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation. Signed-off-by: Rusty Russell Cc: Ingo Molnar Cc: Paul McKenney --- Documentation/lguest/lguest.c | 184 +++++++++++++++++++++++++++--------- arch/x86/include/asm/lguest_hcall.h | 8 +- arch/x86/lguest/boot.c | 99 ++++++++++++++----- arch/x86/lguest/i386_head.S | 2 + drivers/lguest/core.c | 7 +- drivers/lguest/hypercalls.c | 6 +- drivers/lguest/lguest_device.c | 11 ++- drivers/lguest/lguest_user.c | 100 ++++++++++++++++++-- drivers/lguest/page_tables.c | 84 ++++++++++++---- drivers/lguest/x86/core.c | 2 +- drivers/lguest/x86/switcher_32.S | 6 +- 11 files changed, 398 insertions(+), 111 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index aa66a52b73e..45163651b51 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -49,7 +49,7 @@ #include "linux/virtio_ring.h" #include "asm/bootparam.h" /*L:110 - * We can ignore the 39 include files we need for this program, but I do want + * We can ignore the 42 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num) PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ close(fd); return addr; @@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start) } /*:*/ -/* +/*L:200 * Device Handling. * * When the Guest gives us a buffer, it sends an array of addresses and sizes. @@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc, return next; } -/* This actually sends the interrupt for this virtqueue */ +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; @@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq) } /* - * This looks in the virtqueue and for the first available buffer, and converts + * This looks in the virtqueue for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. + * This function waits if necessary, and returns the descriptor number found. */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], @@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct vring_desc *desc; u16 last_avail = lg_last_avail(vq); + /* There's nothing available? */ while (last_avail == vq->vring.avail->idx) { u64 event; - /* OK, tell Guest about progress up to now. */ + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ trigger_irq(vq); /* OK, now we need to know about added descriptors. */ @@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, } /* - * After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). + * After we've used one of their buffers, we tell the Guest about it. Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { @@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq) struct console_abort *abort = vq->dev->priv; struct iovec iov[vq->vring.num]; - /* Make sure there's a descriptor waiting. */ + /* Make sure there's a descriptor available. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* Read it in. */ + /* Read into it. This is where we usually wait. */ len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { /* Ran out of input? */ @@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq) pause(); } + /* Tell the Guest we used a buffer. */ add_used_and_trigger(vq, head, len); /* @@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here, for the Guest to give us something. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); if (len <= 0) err(1, "Write to stdout gave %i", len); iov_consume(iov, out, len); } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ add_used(vq, head, 0); } @@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here for the Guest to give us a packet. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ if (writev(net_info->tunfd, iov, out) < 0) errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ add_used(vq, head, 0); } -/* Will reading from this file descriptor block? */ +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ static bool will_block(int fd) { fd_set fdset; @@ -880,7 +917,11 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This handles packets coming in from the tun device to our Guest. */ +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ static void net_input(struct virtqueue *vq) { int len; @@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq) struct iovec iov[vq->vring.num]; struct net_info *net_info = vq->dev->priv; + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); - /* Deliver interrupt now, since we're about to sleep. */ + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. + */ if (vq->pending_used && will_block(net_info->tunfd)) trigger_irq(vq); + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ len = readv(net_info->tunfd, iov, in); if (len <= 0) err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ add_used(vq, head, len); } +/*:*/ -/* This is the helper to create threads. */ +/* This is the helper to create threads: run the service routine in a loop. */ static int do_thread(void *_vq) { struct virtqueue *vq = _vq; @@ -950,11 +1008,14 @@ static void reset_device(struct device *dev) signal(SIGCHLD, (void *)kill_launcher); } +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ static void create_thread(struct virtqueue *vq) { /* - * Create stack for thread and run it. Since the stack grows upwards, - * we point the stack pointer to the end of this region. + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, @@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq) err(1, "Creating eventfd"); args[2] = vq->eventfd; - /* Attach an eventfd to this virtqueue: it will go off - * when the Guest does an LHCALL_NOTIFY for this vq. */ + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ if (write(lguest_fd, &args, sizeof(args)) != 0) err(1, "Attaching eventfd"); - /* CLONE_VM: because it has to access the Guest memory, and - * SIGCHLD so we get a signal if it dies. */ + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); if (vq->thread == (pid_t)-1) err(1, "Creating clone"); - /* We close our local copy, now the child has it. */ + + /* We close our local copy now the child has it. */ close(vq->eventfd); } @@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev) } } -/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ static void handle_output(unsigned long addr) { struct device *i; @@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr) for (i = devices.dev; i; i = i->next) { struct virtqueue *vq; - /* Notifications to device descriptors update device status. */ + /* + * Notifications to device descriptors mean they updated the + * device status. + */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Devices *can* be used before status is set to DRIVER_OK. */ + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. + */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; if (i->running) errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ start_device(i); return; } @@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ vq->service = service; vq->thread = (pid_t)-1; @@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf) /* * This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. * * See what I mean about userspace being boring? */ @@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg) verbose("device %u: tun %s: %s\n", devices.device_num, tapif, arg); } - -/* - * Our block (disk) device should be really simple: the Guest asks for a block - * number and we read or write that position in the file. Unfortunately, that - * was amazingly slow: the Guest waits until the read is finished before - * running anything else, even if it could have been doing useful work. - * - * We could use async I/O, except it's reputed to suck so hard that characters - * actually go missing from your code when you try to use it. - * - * So this was one reason why lguest now does all virtqueue servicing in - * separate threads: it's more efficient and more like a real device. - */ +/*:*/ /* This hangs off device->priv. */ struct vblk_info @@ -1512,8 +1589,16 @@ struct vblk_info /*L:210 * The Disk * - * Remember that the block device is handled by a separate I/O thread. We head - * straight into the core of that thread here: + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. */ static void blk_request(struct virtqueue *vq) { @@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq) struct iovec iov[vq->vring.num]; off64_t off; - /* Get the next request. */ + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); /* @@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq) out = convert(&iov[0], struct virtio_blk_outhdr); in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ off = out->sector * 512; /* @@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); + /* Finished that request. */ add_used(vq, head, wlen); } @@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq) errx(1, "Output buffers in rng?"); /* - * This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); @@ -1818,7 +1910,9 @@ int main(int argc, char *argv[]) devices.lastdev = NULL; devices.next_irq = 1; + /* We're CPU 0. In fact, that's the only CPU possible right now. */ cpu_id = 0; + /* * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command @@ -1926,7 +2020,7 @@ int main(int argc, char *argv[]) */ tell_kernel(start); - /* Ensure that we terminate if a child dies. */ + /* Ensure that we terminate if a device-servicing child dies. */ signal(SIGCHLD, kill_launcher); /* If we exit via err(), this kills all the threads, restores tty. */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index cceb73e12e5..ba0eed8aa1a 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -35,10 +35,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 025c04d18f2..d677fa9ca65 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call, async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call, } #endif -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. */ +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ static void lguest_leave_lazy_mmu_mode(void) { kvm_hypercall0(LHCALL_FLUSH_ASYNC); @@ -250,13 +253,11 @@ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); /*M:003 - * Note that we don't check for outstanding interrupts when we re-enable them - * (or when we unmask an interrupt). This seems to work for the moment, since - * interrupts are rare and we'll just get the interrupt on the next timer tick, - * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would - * be to put the "irq_enabled" field in a page by itself, and have the Host - * write-protect it when an interrupt comes in when irqs are disabled. There - * will then be a page fault as soon as interrupts are re-enabled. + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come @@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. + * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ /* - * The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low, ptep->pte_high); #else @@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, #endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) } #ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte_atomic(ptep, pte); @@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { native_pte_clear(mm, addr, ptep); lguest_pte_update(mm, addr, ptep); } -void lguest_pmd_clear(pmd_t *pmdp) +static void lguest_pmd_clear(pmd_t *pmdp) { lguest_set_pmd(pmdp, __pmd(0)); } @@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void) irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { irq_to_desc_alloc_node(irq, 0); @@ -1298,7 +1353,7 @@ __init void lguest_init(void) */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index db6aa95eb05..27eac0faee4 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -102,6 +102,7 @@ send_interrupts: * create one manually here. */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ popl %eax ret @@ -125,6 +126,7 @@ ENTRY(lg_restore_fl) jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cd058bc903f..1e2cb846b3c 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* * It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. + * Launcher. */ if (cpu->pending_notify) { + /* + * Does it just needs to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Laucher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 787ab4bc09f..83511eb0923 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SHUTDOWN: { char msg[128]; /* - * Shutdown is such a trivial hypercall that we do it in four + * Shutdown is such a trivial hypercall that we do it in five * lines right here. * * If the lgread fails, it will call kill_guest() itself; the @@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. :*/ /*H:100 diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index cc000e79c3d..1401c1ace1e 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) extern void lguest_setup_irq(unsigned int irq); /* - * This routine finds the first virtqueue described in the configuration of + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. - * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, @@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. + */ ldev->vdev.id.device = d->type; /* * We have a simple set of routines for querying the device's diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7e92017103d..b4d3f7ca554 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,9 +1,8 @@ -/*P:200 - * This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. :*/ #include #include @@ -13,14 +12,41 @@ #include #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. + */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. + * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. + * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; @@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); /* * We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. + * version, then free it. */ synchronize_rcu(); kfree(old); @@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); @@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } @@ -307,10 +387,10 @@ unlock: * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support * here. */ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4c..a8d0aee3bc0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -29,10 +29,10 @@ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -52,9 +52,8 @@ :*/ /* - * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) @@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); /*H:320 * The page table code is curly enough to need helper functions to keep it - * clear and clean. + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These two functions just like the above two, except they access the Guest + * These functions are just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. + */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ @@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* - * Once we know how much memory we have we can construct simple identity (which +/*H:505 + * To get through boot, we construct simple identity page mappings (which * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to * know its size here). @@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif /* @@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } +#ifdef CONFIG_X86_PAE /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). */ -#ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. + */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, #endif /* - * We return the top level (guest-physical) address: remember where - * this is. + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. */ return (unsigned long)pgdir - mem_base; } @@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. */ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ @@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f..6ae388849a3 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * * The hooks were designed for KVM, but we can also put them to good use. :*/ diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec0979383..40634b0db9f 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,7 +1,7 @@ /*P:900 - * This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage -- cgit v1.2.3 From 1842f23c05b6a866be831aa60bc8a8731c58ddd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:46 -0600 Subject: lguest and virtio: cleanup struct definitions to Linux style. I've been doing this for years, and akpm picked me up on it about 12 months ago. lguest partly serves as example code, so let's do it Right. Also, remove two unused fields in struct vblk_info in the example launcher. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- Documentation/lguest/lguest.c | 21 +++++---------------- drivers/lguest/lg.h | 9 +++------ drivers/lguest/lguest_device.c | 3 +-- include/linux/lguest.h | 3 +-- include/linux/virtio_blk.h | 6 ++---- include/linux/virtio_config.h | 3 +-- include/linux/virtio_net.h | 6 ++---- include/linux/virtio_ring.h | 12 ++++-------- 8 files changed, 19 insertions(+), 44 deletions(-) diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 45163651b51..950cde6d6e5 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -93,8 +93,7 @@ static int lguest_fd; static unsigned int __thread cpu_id; /* This is our list of devices. */ -struct device_list -{ +struct device_list { /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -114,8 +113,7 @@ struct device_list static struct device_list devices; /* The device structure describes a single device. */ -struct device -{ +struct device { /* The linked-list pointer. */ struct device *next; @@ -140,8 +138,7 @@ struct device }; /* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue -{ +struct virtqueue { struct virtqueue *next; /* Which device owns me. */ @@ -779,8 +776,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) * * We associate some data with the console for our exit hack. */ -struct console_abort -{ +struct console_abort { /* How many times have they hit ^C? */ int count; /* When did they start? */ @@ -1570,20 +1566,13 @@ static void setup_tun_net(char *arg) /*:*/ /* This hangs off device->priv. */ -struct vblk_info -{ +struct vblk_info { /* The size of the file. */ off64_t len; /* The file descriptor for the file. */ int fd; - /* IO thread listens on this file descriptor [0]. */ - int workpipe[2]; - - /* IO thread writes to this file descriptor to mark it done, then - * Launcher triggers interrupt to Guest. */ - int done_fd; }; /*L:210 diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 74c0db691b5..bc28745d05a 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,15 +16,13 @@ void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -struct pgdir -{ +struct pgdir { unsigned long gpgdir; pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. */ -struct lguest_pages -{ +struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; @@ -89,8 +87,7 @@ struct lg_eventfd_map { }; /* The private info the thread maintains about the guest. */ -struct lguest -{ +struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1401c1ace1e..b6200bc39b5 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -207,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev) */ /*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info -{ +struct lguest_vq_info { /* A copy of the information contained in the device config. */ struct lguest_vqconfig config; diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 0a3a11afd64..2fb1dcbcb5a 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -18,8 +18,7 @@ * lguest_data". Once the Guest's initialization hypercall tells the Host where * this is, the Guest and Host both publish information in it. :*/ -struct lguest_data -{ +struct lguest_data { /* * 512 == enabled (same as eflags in normal hardware). The Guest * changes interrupts so often that a hypercall is too slow. diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index be7d255fc7c..8dab9f2b883 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -20,8 +20,7 @@ #define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ -struct virtio_blk_config -{ +struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ __u64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ @@ -50,8 +49,7 @@ struct virtio_blk_config #define VIRTIO_BLK_T_BARRIER 0x80000000 /* This is the first element of the read scatter-gather list. */ -struct virtio_blk_outhdr -{ +struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __u32 type; /* io priority. */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 99f514575f6..e547e3c8ee9 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -79,8 +79,7 @@ * the dev->feature bits if it wants. */ typedef void vq_callback_t(struct virtqueue *); -struct virtio_config_ops -{ +struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9c543d6ac53..d8dd539c9f4 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -31,8 +31,7 @@ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ -struct virtio_net_config -{ +struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[6]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ @@ -41,8 +40,7 @@ struct virtio_net_config /* This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ -struct virtio_net_hdr -{ +struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 693e0ec5afa..e4d144b132b 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -30,8 +30,7 @@ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ -struct vring_desc -{ +struct vring_desc { /* Address (guest-physical). */ __u64 addr; /* Length. */ @@ -42,24 +41,21 @@ struct vring_desc __u16 next; }; -struct vring_avail -{ +struct vring_avail { __u16 flags; __u16 idx; __u16 ring[]; }; /* u32 is used here for ids for padding reasons. */ -struct vring_used_elem -{ +struct vring_used_elem { /* Index of start of used descriptor chain. */ __u32 id; /* Total length of the descriptor chain which was used (written to) */ __u32 len; }; -struct vring_used -{ +struct vring_used { __u16 flags; __u16 idx; struct vring_used_elem ring[]; -- cgit v1.2.3