/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "kvm.h"
#include "x86.h"
#include "x86_emulate.h"
#include "segment_descriptor.h"
#include "irq.h"

#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>

#include <asm/uaccess.h>
#include <asm/msr.h>

#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
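/*
 * VM_STAT()/VCPU_STAT() expand to an (offset, kind) pair; for example
 * VCPU_STAT(halt_exits) becomes
 * "offsetof(struct kvm_vcpu, stat.halt_exits), KVM_STAT_VCPU", which is
 * how the debugfs_entries[] table below locates each counter.
 */
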
struct kvm_x86_ops *kvm_x86_ops;

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ NULL }
};

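/*
 * Fetch the linear base address of the segment named by @selector by
 * walking the host GDT (or, for selectors with the TI bit set, the LDT).
 * On x86_64 the 64-bit system-descriptor layout is handled as well.
 */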
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {	/* from ldt */
		u16 ldt_selector;

		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((unsigned long)d->base_mid << 16) |
		((unsigned long)d->base_high << 24);
#ifdef CONFIG_X86_64
	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long) \
			((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return vcpu->apic_base;
	else
		return vcpu->apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_base(vcpu, data);
	else
		vcpu->apic_base = data;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

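/*
 * Exception injection is split in two stages: kvm_queue_exception() and
 * kvm_queue_exception_e() only record the pending vector (and error code)
 * in the vcpu, while __queue_exception() later hands it to the hardware
 * specific module through kvm_x86_ops->queue_exception().  inject_gp() is
 * the direct-injection path used by the emulation helpers below.
 */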
static void inject_gp(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->inject_gp(vcpu, 0);
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	WARN_ON(vcpu->exception.pending);
	vcpu->exception.pending = true;
	vcpu->exception.has_error_code = false;
	vcpu->exception.nr = nr;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	WARN_ON(vcpu->exception.pending);
	vcpu->exception.pending = true;
	vcpu->exception.has_error_code = true;
	vcpu->exception.nr = nr;
	vcpu->exception.error_code = error_code;
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->exception.nr,
				     vcpu->exception.has_error_code,
				     vcpu->exception.error_code);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];

	mutex_lock(&vcpu->kvm->lock);
	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
out:
	mutex_unlock(&vcpu->kvm->lock);

	return ret;
}

static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	mutex_lock(&vcpu->kvm->lock);
	r = kvm_read_guest(vcpu->kvm, vcpu->cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->pdptrs, sizeof(pdpte)) != 0;
out:
	mutex_unlock(&vcpu->kvm->lock);

	return changed;
}

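/*
 * set_cr0() emulates a guest write to CR0 and raises #GP for the cases the
 * architecture forbids: setting reserved bits, NW=1 with CD=0, enabling
 * paging (PG) without protection (PE), and, when EFER.LME is set, turning
 * paging on without PAE or while CS.L is already 1 (or, for PAE, with
 * pdptes that have reserved bits set).  On success the shadow copy of cr0
 * is updated and the MMU context is rebuilt.
 */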
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		inject_gp(vcpu);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				return;

			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->cr0 = cr0;

	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
	return;
}
EXPORT_SYMBOL_GPL(set_cr0);

void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);

void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			inject_gp(vcpu);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->cr4 = cr4;
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);

void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				inject_gp(vcpu);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				inject_gp(vcpu);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	mutex_lock(&vcpu->kvm->lock);
	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		inject_gp(vcpu);
	else {
		vcpu->cr3 = cr3;
		vcpu->mmu.new_cr3(vcpu);
	}
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr3);

void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		inject_gp(vcpu);
		return;
	}
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->cr8 = cr8;
}
EXPORT_SYMBOL_GPL(set_cr8);

unsigned long get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->cr8;
}
EXPORT_SYMBOL_GPL(get_cr8);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER,
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};

#ifdef CONFIG_X86_64

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & EFER_RESERVED_BITS) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		inject_gp(vcpu);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		inject_gp(vcpu);
		return;
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->shadow_efer & EFER_LMA;

	vcpu->shadow_efer = efer;
}

#endif

/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}

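/*
 * Handle the MSR writes that are common to both vmx and svm.  Unknown MSRs
 * make this return 1 so the caller can report the failure (typically by
 * injecting #GP into the guest).
 */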
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
#ifdef CONFIG_X86_64
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
#endif
	case MSR_IA32_MC0_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
			  __FUNCTION__, data);
		break;
	case MSR_IA32_MCG_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
			  __FUNCTION__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case 0x200 ... 0x2ff: /* MTRRs */
		break;
	case MSR_IA32_APICBASE:
		kvm_set_apic_base(vcpu, data);
		break;
	case MSR_IA32_MISC_ENABLE:
		vcpu->ia32_misc_enable_msr = data;
		break;
	default:
		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);


/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}

int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case 0xc0010010: /* SYSCFG */
	case 0xc0010015: /* HWCR */
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MC0_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MC0_MISC:
	case MSR_IA32_MC0_MISC+4:
	case MSR_IA32_MC0_MISC+8:
	case MSR_IA32_MC0_MISC+12:
	case MSR_IA32_MC0_MISC+16:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_PERF_STATUS:
	case MSR_IA32_EBL_CR_POWERON:
		/* MTRR registers */
	case 0xfe:
	case 0x200 ... 0x2ff:
		data = 0;
		break;
	case 0xcd: /* fsb frequency */
		data = 3;
		break;
	case MSR_IA32_APICBASE:
		data = kvm_get_apic_base(vcpu);
		break;
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->ia32_misc_enable_msr;
		break;
#ifdef CONFIG_X86_64
	case MSR_EFER:
		data = vcpu->shadow_efer;
		break;
#endif
	default:
		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
		return 1;
	}
	*pdata = data;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);

/*
 * Read or write a bunch of msrs.  All parameters are kernel addresses.
 *
 * @return number of msrs set successfully.
 */
static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
		    struct kvm_msr_entry *entries,
		    int (*do_msr)(struct kvm_vcpu *vcpu,
				  unsigned index, u64 *data))
{
	int i;

	vcpu_load(vcpu);

	for (i = 0; i < msrs->nmsrs; ++i)
		if (do_msr(vcpu, entries[i].index, &entries[i].data))
			break;

	vcpu_put(vcpu);

	return i;
}

/*
 * Read or write a bunch of msrs.  Parameters are user addresses.
 *
 * @return number of msrs set successfully.
 */
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	int r, n;
	unsigned size;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
		goto out;

	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	r = -ENOMEM;
	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = vmalloc(size);
	if (!entries)
		goto out;

	r = -EFAULT;
	if (copy_from_user(entries, user_msrs->entries, size))
		goto out_free;

	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
	if (r < 0)
		goto out_free;

	r = -EFAULT;
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		goto out_free;

	r = n;

out_free:
	vfree(entries);
out:
	return r;
}

/*
 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 * cached on it.
 */
void decache_vcpus_on_cpu(int cpu)
{
	struct kvm *vm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(vm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = vm->vcpus[i];
			if (!vcpu)
				continue;
			/*
			 * If the vcpu is locked, then it is running on some
			 * other cpu and therefore it is not cached on the
			 * cpu in question.
			 *
			 * If it's not locked, check the last cpu it executed
			 * on.
			 */
			if (mutex_trylock(&vcpu->mutex)) {
				if (vcpu->cpu == cpu) {
					kvm_x86_ops->vcpu_decache(vcpu);
					vcpu->cpu = -1;
				}
				mutex_unlock(&vcpu->mutex);
			}
		}
	spin_unlock(&kvm_lock);
}

int kvm_dev_ioctl_check_extension(long ext)
{
	int r;

	switch (ext) {
	case KVM_CAP_IRQCHIP:
	case KVM_CAP_HLT:
	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_SET_TSS_ADDR:
	case KVM_CAP_EXT_CPUID:
		r = 1;
		break;
	default:
		r = 0;
		break;
	}
	return r;

}

long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r;

	switch (ioctl) {
	case KVM_GET_MSR_INDEX_LIST: {
		struct kvm_msr_list __user *user_msr_list = argp;
		struct kvm_msr_list msr_list;
		unsigned n;

		r = -EFAULT;
		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
			goto out;
		n = msr_list.nmsrs;
		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
			goto out;
		r = -E2BIG;
		if (n < num_msrs_to_save)
			goto out;
		r = -EFAULT;
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices
				 + num_msrs_to_save * sizeof(u32),
				 &emulated_msrs,
				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
			goto out;
		r = 0;
		break;
	}
	default:
		r = -EINVAL;
	}
out:
	return r;
}

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_x86_ops->vcpu_load(vcpu, cpu);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->vcpu_put(vcpu);
	kvm_put_guest_fpu(vcpu);
}

static int is_efer_nx(void)
{
	u64 efer;

	rdmsrl(MSR_EFER, efer);
	return efer & EFER_NX;
}

static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_cpuid_entry2 *e, *entry;

	entry = NULL;
	for (i = 0; i < vcpu->cpuid_nent; ++i) {
		e = &vcpu->cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
		entry->edx &= ~(1 << 20);
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}

/*
 * Used when an old userspace process fills a new kernel module: the legacy
 * KVM_SET_CPUID format (struct kvm_cpuid_entry) is converted into the
 * struct kvm_cpuid_entry2 layout the kernel keeps internally.
 */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *cpuid_entries;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -ENOMEM;
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
	if (!cpuid_entries)
		goto out;
	r = -EFAULT;
	if (copy_from_user(cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
		goto out_free;
	for (i = 0; i < cpuid->nent; i++) {
		vcpu->cpuid_entries[i].function = cpuid_entries[i].function;
		vcpu->cpuid_entries[i].eax = cpuid_entries[i].eax;
		vcpu->cpuid_entries[i].ebx = cpuid_entries[i].ebx;
		vcpu->cpuid_entries[i].ecx = cpuid_entries[i].ecx;
		vcpu->cpuid_entries[i].edx = cpuid_entries[i].edx;
		vcpu->cpuid_entries[i].index = 0;
		vcpu->cpuid_entries[i].flags = 0;
		vcpu->cpuid_entries[i].padding[0] = 0;
		vcpu->cpuid_entries[i].padding[1] = 0;
		vcpu->cpuid_entries[i].padding[2] = 0;
	}
	vcpu->cpuid_nent = cpuid->nent;
	cpuid_fix_nx_cap(vcpu);
	r = 0;

out_free:
	vfree(cpuid_entries);
out:
	return r;
}

static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->cpuid_nent = cpuid->nent;
	return 0;

out:
	return r;
}

static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->cpuid_entries,
			 vcpu->cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->cpuid_nent;
	return r;
}

static inline u32 bit(int bitno)
{
	return 1 << (bitno & 31);
}

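/*
 * Fill one kvm_cpuid_entry2 by executing the CPUID instruction on the host
 * with the given function/index; do_cpuid_ent() below then masks the result
 * down to the feature bits kvm actually supports and expands the functions
 * that take a significant index (4, 0xb) or are stateful (2).
 */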
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			   u32 index)
{
	entry->function = function;
	entry->index = index;
	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
	entry->flags = 0;
}

static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			 u32 index, int *nent, int maxnent)
{
	const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
	const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
		bit(X86_FEATURE_PGE) |
		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
		bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
		bit(X86_FEATURE_SYSCALL) |
		(is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
#ifdef CONFIG_X86_64
		bit(X86_FEATURE_LM) |
#endif
		bit(X86_FEATURE_MMXEXT) |
		bit(X86_FEATURE_3DNOWEXT) |
		bit(X86_FEATURE_3DNOW);
	const u32 kvm_supported_word3_x86_features =
		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
	const u32 kvm_supported_word6_x86_features =
		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);

	/* all func 2 cpuid_count() should be called on the same cpu */
	get_cpu();
	do_cpuid_1_ent(entry, function, index);
	++*nent;

	switch (function) {
	case 0:
		entry->eax = min(entry->eax, (u32)0xb);
		break;
	case 1:
		entry->edx &= kvm_supported_word0_x86_features;
		entry->ecx &= kvm_supported_word3_x86_features;
		break;
	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
	 * may return different values. This forces us to get_cpu() before
	 * issuing the first command, and also to emulate this annoying behavior
	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
	case 2: {
		int t, times = entry->eax & 0xff;

		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
		for (t = 1; t < times && *nent < maxnent; ++t) {
			do_cpuid_1_ent(&entry[t], function, 0);
			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
			++*nent;
		}
		break;
	}
	/* function 4 and 0xb have additional index. */
	case 4: {
		int index, cache_type;

		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		/* read more entries until cache_type is zero */
		for (index = 1; *nent < maxnent; ++index) {
			cache_type = entry[index - 1].eax & 0x1f;
			if (!cache_type)
				break;
			do_cpuid_1_ent(&entry[index], function, index);
			entry[index].flags |=
				KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
			++*nent;
		}
		break;
	}
	case 0xb: {
		int index, level_type;

		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		/* read more entries until level_type is zero */
		for (index = 1; *nent < maxnent; ++index) {
			level_type = entry[index - 1].ecx & 0xff;
			if (!level_type)
				break;
			do_cpuid_1_ent(&entry[index], function, index);
			entry[index].flags |=
				KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
			++*nent;
		}
		break;
	}
	case 0x80000000:
		entry->eax = min(entry->eax, 0x8000001a);
		break;
	case 0x80000001:
		entry->edx &= kvm_supported_word1_x86_features;
		entry->ecx &= kvm_supported_word6_x86_features;
		break;
	}
	put_cpu();
}

static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
					    struct kvm_cpuid2 *cpuid,
					    struct kvm_cpuid_entry2 __user *entries)
{
	struct kvm_cpuid_entry2 *cpuid_entries;
	int limit, nent = 0, r = -E2BIG;
	u32 func;

	if (cpuid->nent < 1)
		goto out;
	r = -ENOMEM;
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
	if (!cpuid_entries)
		goto out;

	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
	limit = cpuid_entries[0].eax;
	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
			     &nent, cpuid->nent);
	r = -E2BIG;
	if (nent >= cpuid->nent)
		goto out_free;

	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
	limit = cpuid_entries[nent - 1].eax;
	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
			     &nent, cpuid->nent);
	r = -EFAULT;
	if (copy_to_user(entries, cpuid_entries,
			 nent * sizeof(struct kvm_cpuid_entry2)))
		goto out_free;
	cpuid->nent = nent;
	r = 0;

out_free:
	vfree(cpuid_entries);
out:
	return r;
}

static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
				    struct kvm_lapic_state *s)
{
	vcpu_load(vcpu);
	memcpy(s->regs, vcpu->apic->regs, sizeof *s);
	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
				    struct kvm_lapic_state *s)
{
	vcpu_load(vcpu);
	memcpy(vcpu->apic->regs, s->regs, sizeof *s);
	kvm_apic_post_state_restore(vcpu);
	vcpu_put(vcpu);

	return 0;
}

static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
				    struct kvm_interrupt *irq)
{
	if (irq->irq < 0 || irq->irq >= 256)
		return -EINVAL;
	if (irqchip_in_kernel(vcpu->kvm))
		return -ENXIO;
	vcpu_load(vcpu);

	set_bit(irq->irq, vcpu->irq_pending);
	set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);

	vcpu_put(vcpu);

	return 0;
}

long kvm_arch_vcpu_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {
	case KVM_GET_LAPIC: {
		struct kvm_lapic_state lapic;

		memset(&lapic, 0, sizeof lapic);
		r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &lapic, sizeof lapic))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_LAPIC: {
		struct kvm_lapic_state lapic;

		r = -EFAULT;
		if (copy_from_user(&lapic, argp, sizeof lapic))
			goto out;
		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_INTERRUPT: {
		struct kvm_interrupt irq;

		r = -EFAULT;
		if (copy_from_user(&irq, argp, sizeof irq))
			goto out;
		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_CPUID: {
		struct kvm_cpuid __user *cpuid_arg = argp;
		struct kvm_cpuid cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_CPUID2: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_CPUID2: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MSRS:
		r = msr_io(vcpu, argp, kvm_get_msr, 1);
		break;
	case KVM_SET_MSRS:
		r = msr_io(vcpu, argp, do_set_msr, 0);
		break;
	default:
		r = -EINVAL;
	}
out:
	return r;
}

static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
	int ret;

	if (addr > (unsigned int)(-3 * PAGE_SIZE))
		return -1;
	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
	return ret;
}

static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
					 u32 kvm_nr_mmu_pages)
{
	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
		return -EINVAL;

	mutex_lock(&kvm->lock);

	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
	kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;

	mutex_unlock(&kvm->lock);
	return 0;
}

static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
	return kvm->n_alloc_mmu_pages;
}

gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_mem_alias *alias;

	for (i = 0; i < kvm->naliases; ++i) {
		alias = &kvm->aliases[i];
		if (gfn >= alias->base_gfn
		    && gfn < alias->base_gfn + alias->npages)
			return alias->target_gfn + gfn - alias->base_gfn;
	}
	return gfn;
}

/*
 * Set a new alias region.  Aliases map a portion of physical memory into
 * another portion.  This is useful for memory windows, for example the PC
 * VGA region.
 */
static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
					 struct kvm_memory_alias *alias)
{
	int r, n;
	struct kvm_mem_alias *p;

	r = -EINVAL;
	/* General sanity checks */
	if (alias->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (alias->slot >= KVM_ALIAS_SLOTS)
		goto out;
	if (alias->guest_phys_addr + alias->memory_size
	    < alias->guest_phys_addr)
		goto out;
	if (alias->target_phys_addr + alias->memory_size
	    < alias->target_phys_addr)
		goto out;

	mutex_lock(&kvm->lock);

	p = &kvm->aliases[alias->slot];
	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
	p->npages = alias->memory_size >> PAGE_SHIFT;
	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;

	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
		if (kvm->aliases[n - 1].npages)
			break;
	kvm->naliases = n;

	kvm_mmu_zap_all(kvm);

	mutex_unlock(&kvm->lock);

	return 0;

out:
	return r;
}

static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		memcpy(&chip->chip.pic,
		       &pic_irqchip(kvm)->pics[0],
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		memcpy(&chip->chip.pic,
		       &pic_irqchip(kvm)->pics[1],
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_IOAPIC:
		memcpy(&chip->chip.ioapic,
		       ioapic_irqchip(kvm),
		       sizeof(struct kvm_ioapic_state));
		break;
	default:
		r = -EINVAL;
		break;
	}
	return r;
}

static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		memcpy(&pic_irqchip(kvm)->pics[0],
		       &chip->chip.pic,
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		memcpy(&pic_irqchip(kvm)->pics[1],
		       &chip->chip.pic,
		       sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_IOAPIC:
		memcpy(ioapic_irqchip(kvm),
		       &chip->chip.ioapic,
		       sizeof(struct kvm_ioapic_state));
		break;
	default:
		r = -EINVAL;
		break;
	}
	kvm_pic_update_irq(pic_irqchip(kvm));
	return r;
}

/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
			       struct kvm_dirty_log *log)
{
	int r;
	int n;
	struct kvm_memory_slot *memslot;
	int is_dirty = 0;

	mutex_lock(&kvm->lock);

	r = kvm_get_dirty_log(kvm, log, &is_dirty);
	if (r)
		goto out;

	/* If nothing is dirty, don't bother messing with page tables. */
	if (is_dirty) {
		kvm_mmu_slot_remove_write_access(kvm, log->slot);
		kvm_flush_remote_tlbs(kvm);
		memslot = &kvm->memslots[log->slot];
		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
		memset(memslot->dirty_bitmap, 0, n);
	}
	r = 0;
out:
	mutex_unlock(&kvm->lock);
	return r;
}

long kvm_arch_vm_ioctl(struct file *filp,
		       unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_SET_TSS_ADDR:
		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_MEMORY_REGION: {
		struct kvm_memory_region kvm_mem;
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
			goto out;
		kvm_userspace_mem.slot = kvm_mem.slot;
		kvm_userspace_mem.flags = kvm_mem.flags;
		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
		if (r)
			goto out;
		break;
	case KVM_GET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
		break;
	case KVM_SET_MEMORY_ALIAS: {
		struct kvm_memory_alias alias;

		r = -EFAULT;
		if (copy_from_user(&alias, argp, sizeof alias))
			goto out;
		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
		if (r)
			goto out;
		break;
	}
	case KVM_CREATE_IRQCHIP:
		r = -ENOMEM;
		kvm->vpic = kvm_create_pic(kvm);
		if (kvm->vpic) {
			r = kvm_ioapic_init(kvm);
			if (r) {
				kfree(kvm->vpic);
				kvm->vpic = NULL;
				goto out;
			}
		} else
			goto out;
		break;
	case KVM_IRQ_LINE: {
		struct kvm_irq_level irq_event;

		r = -EFAULT;
		if (copy_from_user(&irq_event, argp, sizeof irq_event))
			goto out;
		if (irqchip_in_kernel(kvm)) {
			mutex_lock(&kvm->lock);
			if (irq_event.irq < 16)
				kvm_pic_set_irq(pic_irqchip(kvm),
						irq_event.irq,
						irq_event.level);
			kvm_ioapic_set_irq(kvm->vioapic,
					   irq_event.irq,
					   irq_event.level);
			mutex_unlock(&kvm->lock);
			r = 0;
		}
		break;
	}
	case KVM_GET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip chip;

		r = -EFAULT;
		if (copy_from_user(&chip, argp, sizeof chip))
			goto out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto out;
		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &chip, sizeof chip))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip chip;

		r = -EFAULT;
		if (copy_from_user(&chip, argp, sizeof chip))
			goto out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto out;
		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SUPPORTED_CPUID: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
						     cpuid_arg->entries);
		if (r)
			goto out;

		r = -EFAULT;
		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
			goto out;
		r = 0;
		break;
	}
	default:
		;
	}
out:
	return r;
}

static void kvm_init_msr_list(void)
{
	u32 dummy[2];
	unsigned i, j;

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}

/*
 * Only the apic needs a per-vcpu MMIO device hook, so shortcut now.
 */
static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
						   gpa_t addr)
{
	struct kvm_io_device *dev;

	if (vcpu->apic) {
		dev = &vcpu->apic->dev;
		if (dev->in_range(dev, addr))
			return dev;
	}
	return NULL;
}


static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
						gpa_t addr)
{
	struct kvm_io_device *dev;

	dev = vcpu_find_pervcpu_dev(vcpu, addr);
	if (dev == NULL)
		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
	return dev;
}

int emulator_read_std(unsigned long addr,
		      void *val,
		      unsigned int bytes,
		      struct kvm_vcpu *vcpu)
{
	void *data = val;

	while (bytes) {
		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
		unsigned offset = addr & (PAGE_SIZE-1);
		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
		int ret;

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
		if (ret < 0)
			return X86EMUL_UNHANDLEABLE;

		bytes -= tocopy;
		data += tocopy;
		addr += tocopy;
	}

	return X86EMUL_CONTINUE;
}
EXPORT_SYMBOL_GPL(emulator_read_std);

static int emulator_read_emulated(unsigned long addr,
				  void *val,
				  unsigned int bytes,
				  struct kvm_vcpu *vcpu)
{
	struct kvm_io_device *mmio_dev;
	gpa_t gpa;

	if (vcpu->mmio_read_completed) {
		memcpy(val, vcpu->mmio_data, bytes);
		vcpu->mmio_read_completed = 0;
		return X86EMUL_CONTINUE;
	}

	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	if (emulator_read_std(addr, val, bytes, vcpu)
			== X86EMUL_CONTINUE)
		return X86EMUL_CONTINUE;
	if (gpa == UNMAPPED_GVA)
		return X86EMUL_PROPAGATE_FAULT;

mmio:
	/*
	 * Is this MMIO handled locally?
	 */
	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
	if (mmio_dev) {
		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
		return X86EMUL_CONTINUE;
	}

	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 0;

	return X86EMUL_UNHANDLEABLE;
}

static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
			       const void *val, int bytes)
{
	int ret;

	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
	if (ret < 0)
		return 0;
	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
	return 1;
}

static int emulator_write_emulated_onepage(unsigned long addr,
					   const void *val,
					   unsigned int bytes,
					   struct kvm_vcpu *vcpu)
{
	struct kvm_io_device *mmio_dev;
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);

	if (gpa == UNMAPPED_GVA) {
		kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	if (emulator_write_phys(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

mmio:
	/*
	 * Is this MMIO handled locally?
	 */
	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
	if (mmio_dev) {
		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
		return X86EMUL_CONTINUE;
	}

	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 1;
	memcpy(vcpu->mmio_data, val, bytes);

	return X86EMUL_CONTINUE;
}

int emulator_write_emulated(unsigned long addr,
			    const void *val,
			    unsigned int bytes,
			    struct kvm_vcpu *vcpu)
{
	/* Crossing a page boundary? */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int rc, now;

		now = -addr & ~PAGE_MASK;
		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		val += now;
		bytes -= now;
	}
	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);

static int emulator_cmpxchg_emulated(unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct kvm_vcpu *vcpu)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
	return emulator_write_emulated(addr, new, bytes, vcpu);
}

static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	return kvm_x86_ops->get_segment_base(vcpu, seg);
}

int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
	return X86EMUL_CONTINUE;
}

int emulate_clts(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
	return X86EMUL_CONTINUE;
}

int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	switch (dr) {
	case 0 ... 3:
		*dest = kvm_x86_ops->get_dr(vcpu, dr);
		return X86EMUL_CONTINUE;
	default:
		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
		return X86EMUL_UNHANDLEABLE;
	}
}

int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
	int exception;

	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}

void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
{
	static int reported;
	u8 opcodes[4];
	unsigned long rip = vcpu->rip;
	unsigned long rip_linear;

	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);

	if (reported)
		return;

	emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);

	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
	reported = 1;
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);

struct x86_emulate_ops emulate_ops = {
	.read_std            = emulator_read_std,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
};

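/*
 * emulate_instruction() drives the x86 emulator for a faulting guest
 * instruction: unless @no_decode is set it builds an emulation context
 * (mode and segment bases) and decodes the instruction, then executes it
 * through emulate_ops above.  The return value tells the caller whether
 * emulation completed, failed, or has to continue in userspace as an
 * MMIO/PIO exit (EMULATE_DO_MMIO).
 */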
int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int no_decode)
{
	int r;

	vcpu->mmio_fault_cr2 = cr2;
	kvm_x86_ops->cache_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->pio.string = 0;

	if (!no_decode) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->emulate_ctxt.vcpu = vcpu;
		vcpu->emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		vcpu->emulate_ctxt.mode =
			(vcpu->emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 : cs_db
			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

		if (vcpu->emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
			vcpu->emulate_ctxt.cs_base = 0;
			vcpu->emulate_ctxt.ds_base = 0;
			vcpu->emulate_ctxt.es_base = 0;
			vcpu->emulate_ctxt.ss_base = 0;
		} else {
			vcpu->emulate_ctxt.cs_base =
					get_segment_base(vcpu, VCPU_SREG_CS);
			vcpu->emulate_ctxt.ds_base =
					get_segment_base(vcpu, VCPU_SREG_DS);
			vcpu->emulate_ctxt.es_base =
					get_segment_base(vcpu, VCPU_SREG_ES);
			vcpu->emulate_ctxt.ss_base =
					get_segment_base(vcpu, VCPU_SREG_SS);
		}

		vcpu->emulate_ctxt.gs_base =
				get_segment_base(vcpu, VCPU_SREG_GS);
		vcpu->emulate_ctxt.fs_base =
				get_segment_base(vcpu, VCPU_SREG_FS);

		r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
		++vcpu->stat.insn_emulation;
		if (r) {
			++vcpu->stat.insn_emulation_fail;
			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
				return EMULATE_DONE;
			return EMULATE_FAIL;
		}
	}

	r = x86_emulate_insn(&vcpu->emulate_ctxt, &emulate_ops);

	if (vcpu->pio.string)
		return EMULATE_DO_MMIO;

	if ((r || vcpu->mmio_is_write) && run) {
		run->exit_reason = KVM_EXIT_MMIO;
		run->mmio.phys_addr = vcpu->mmio_phys_addr;
		memcpy(run->mmio.data, vcpu->mmio_data, 8);
		run->mmio.len = vcpu->mmio_size;
		run->mmio.is_write = vcpu->mmio_is_write;
	}

	if (r) {
		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
			return EMULATE_DONE;
		if (!vcpu->mmio_needed) {
			kvm_report_emulation_failure(vcpu, "mmio");
			return EMULATE_FAIL;
		}
		return EMULATE_DO_MMIO;
	}

	kvm_x86_ops->decache_regs(vcpu);
	kvm_x86_ops->set_rflags(vcpu, vcpu->emulate_ctxt.eflags);

	if (vcpu->mmio_is_write) {
		vcpu->mmio_needed = 0;
		return EMULATE_DO_MMIO;
	}

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(emulate_instruction);

static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
		if (vcpu->pio.guest_pages[i]) {
			kvm_release_page_dirty(vcpu->pio.guest_pages[i]);
			vcpu->pio.guest_pages[i] = NULL;
		}
}

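/*
 * Copy string-PIO data between the per-vcpu pio_data page and the pinned
 * guest pages: vmap() the (one or two) pages, copy in the direction given
 * by pio.in, then unmap and release the pages again.
 */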
static int pio_copy_data(struct kvm_vcpu *vcpu)
{
	void *p = vcpu->pio_data;
	void *q;
	unsigned bytes;
	int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;

	q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
		 PAGE_KERNEL);
	if (!q) {
		free_pio_guest_pages(vcpu);
		return -ENOMEM;
	}
	q += vcpu->pio.guest_page_offset;
	bytes = vcpu->pio.size * vcpu->pio.cur_count;
	if (vcpu->pio.in)
		memcpy(q, p, bytes);
	else
		memcpy(p, q, bytes);
	q -= vcpu->pio.guest_page_offset;
	vunmap(q);
	free_pio_guest_pages(vcpu);
	return 0;
}

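/*
 * Finish a (possibly string) PIO operation after the data has been
 * transferred: for "in" ops copy the result into RAX or guest memory, and
 * for string ops advance RSI/RDI (and RCX for rep) by the amount moved.
 */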
int complete_pio(struct kvm_vcpu *vcpu)
{
	struct kvm_pio_request *io = &vcpu->pio;
	long delta;
	int r;

	kvm_x86_ops->cache_regs(vcpu);

	if (!io->string) {
		if (io->in)
			memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
			       io->size);
	} else {
		if (io->in) {
			r = pio_copy_data(vcpu);
			if (r) {
				kvm_x86_ops->cache_regs(vcpu);
				return r;
			}
		}

		delta = 1;
		if (io->rep) {
			delta *= io->cur_count;
			/*
			 * The size of the register should really depend on
			 * current address size.
			 */
			vcpu->regs[VCPU_REGS_RCX] -= delta;
		}
		if (io->down)
			delta = -delta;
		delta *= io->size;
		if (io->in)
			vcpu->regs[VCPU_REGS_RDI] += delta;
		else
			vcpu->regs[VCPU_REGS_RSI] += delta;
	}

	kvm_x86_ops->decache_regs(vcpu);

	io->count -= io->cur_count;
	io->cur_count = 0;

	return 0;
}

static void kernel_pio(struct kvm_io_device *pio_dev,
		       struct kvm_vcpu *vcpu,
		       void *pd)
{
	/* TODO: String I/O for in kernel device */

	mutex_lock(&vcpu->kvm->lock);
	if (vcpu->pio.in)
		kvm_iodevice_read(pio_dev, vcpu->pio.port,
				  vcpu->pio.size,
				  pd);
	else
		kvm_iodevice_write(pio_dev, vcpu->pio.port,
				   vcpu->pio.size,
				   pd);
	mutex_unlock(&vcpu->kvm->lock);
}

static void pio_string_write(struct kvm_io_device *pio_dev,
			     struct kvm_vcpu *vcpu)
{
	struct kvm_pio_request *io = &vcpu->pio;
	void *pd = vcpu->pio_data;
	int i;

	mutex_lock(&vcpu->kvm->lock);
	for (i = 0; i < io->cur_count; i++) {
		kvm_iodevice_write(pio_dev, io->port,
				   io->size,
				   pd);
		pd += io->size;
	}
	mutex_unlock(&vcpu->kvm->lock);
}

static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
					       gpa_t addr)
{
	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
}

int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
		    int size, unsigned port)
{
	struct kvm_io_device *pio_dev;

	vcpu->run->exit_reason = KVM_EXIT_IO;
	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
	vcpu->run->io.size = vcpu->pio.size = size;
	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
	vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
	vcpu->run->io.port = vcpu->pio.port = port;
	vcpu->pio.in = in;
	vcpu->pio.string = 0;
	vcpu->pio.down = 0;
	vcpu->pio.guest_page_offset = 0;
	vcpu->pio.rep = 0;

	kvm_x86_ops->cache_regs(vcpu);
	memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
	kvm_x86_ops->decache_regs(vcpu);

	kvm_x86_ops->skip_emulated_instruction(vcpu);

	pio_dev = vcpu_find_pio_dev(vcpu, port);
	if (pio_dev) {
		kernel_pio(pio_dev, vcpu, vcpu->pio_data);
		complete_pio(vcpu);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio);

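/*
 * String PIO (ins/outs): up to two guest pages are pinned so a transfer
 * that straddles a page boundary stays atomic; the request is split so a
 * single call never crosses more than one boundary, and writes that an
 * in-kernel device can satisfy are completed without exiting to userspace.
 */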
1989int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1990 int size, unsigned long count, int down,
1991 gva_t address, int rep, unsigned port)
1992{
1993 unsigned now, in_page;
1994 int i, ret = 0;
1995 int nr_pages = 1;
1996 struct page *page;
1997 struct kvm_io_device *pio_dev;
1998
1999 vcpu->run->exit_reason = KVM_EXIT_IO;
2000 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2001 vcpu->run->io.size = vcpu->pio.size = size;
2002 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2003 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
2004 vcpu->run->io.port = vcpu->pio.port = port;
2005 vcpu->pio.in = in;
2006 vcpu->pio.string = 1;
2007 vcpu->pio.down = down;
2008 vcpu->pio.guest_page_offset = offset_in_page(address);
2009 vcpu->pio.rep = rep;
2010
2011 if (!count) {
2012 kvm_x86_ops->skip_emulated_instruction(vcpu);
2013 return 1;
2014 }
2015
2016 if (!down)
2017 in_page = PAGE_SIZE - offset_in_page(address);
2018 else
2019 in_page = offset_in_page(address) + size;
2020 now = min(count, (unsigned long)in_page / size);
2021 if (!now) {
2022 /*
2023 * String I/O straddles page boundary. Pin two guest pages
2024 * so that we satisfy atomicity constraints. Do just one
2025 * transaction to avoid complexity.
2026 */
2027 nr_pages = 2;
2028 now = 1;
2029 }
2030 if (down) {
2031 /*
2032 * String I/O in reverse. Yuck. Kill the guest, fix later.
2033 */
2034 pr_unimpl(vcpu, "guest string pio down\n");
2035 inject_gp(vcpu);
2036 return 1;
2037 }
2038 vcpu->run->io.count = now;
2039 vcpu->pio.cur_count = now;
2040
2041 if (vcpu->pio.cur_count == vcpu->pio.count)
2042 kvm_x86_ops->skip_emulated_instruction(vcpu);
2043
2044 for (i = 0; i < nr_pages; ++i) {
2045 mutex_lock(&vcpu->kvm->lock);
2046 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2047 vcpu->pio.guest_pages[i] = page;
2048 mutex_unlock(&vcpu->kvm->lock);
2049 if (!page) {
2050 inject_gp(vcpu);
2051 free_pio_guest_pages(vcpu);
2052 return 1;
2053 }
2054 }
2055
2056 pio_dev = vcpu_find_pio_dev(vcpu, port);
2057 if (!vcpu->pio.in) {
2058 /* string PIO write */
2059 ret = pio_copy_data(vcpu);
2060 if (ret >= 0 && pio_dev) {
2061 pio_string_write(pio_dev, vcpu);
2062 complete_pio(vcpu);
2063 if (vcpu->pio.count == 0)
2064 ret = 1;
2065 }
2066 } else if (pio_dev)
2067 pr_unimpl(vcpu, "no string pio read support yet, "
2068 "port %x size %d count %ld\n",
2069 port, size, count);
2070
2071 return ret;
2072}
2073EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2074
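/*
 * kvm_arch_init() runs when the vendor module (presumably kvm-intel or
 * kvm-amd) loads and passes in its kvm_x86_ops.  It initializes the MMU
 * module and the MSR list, and refuses to load twice or on hardware where
 * VMX/SVM support is absent or disabled by the BIOS.
 */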
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002075int kvm_arch_init(void *opaque)
Carsten Otte043405e2007-10-10 17:16:19 +02002076{
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002077 int r;
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002078 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2079
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002080 r = kvm_mmu_module_init();
2081 if (r)
2082 goto out_fail;
2083
Carsten Otte043405e2007-10-10 17:16:19 +02002084 kvm_init_msr_list();
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002085
2086 if (kvm_x86_ops) {
2087 printk(KERN_ERR "kvm: already loaded the other module\n");
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002088 r = -EEXIST;
2089 goto out;
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002090 }
2091
2092 if (!ops->cpu_has_kvm_support()) {
2093 printk(KERN_ERR "kvm: no hardware support\n");
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002094 r = -EOPNOTSUPP;
2095 goto out;
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002096 }
2097 if (ops->disabled_by_bios()) {
2098 printk(KERN_ERR "kvm: disabled by bios\n");
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002099 r = -EOPNOTSUPP;
2100 goto out;
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002101 }
2102
2103 kvm_x86_ops = ops;
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002104 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002105 return 0;
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002106
2107out:
2108 kvm_mmu_module_exit();
2109out_fail:
2110 return r;
Carsten Otte043405e2007-10-10 17:16:19 +02002111}
Hollis Blanchard8776e512007-10-31 17:24:24 -05002112
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002113void kvm_arch_exit(void)
2114{
2115 kvm_x86_ops = NULL;
Zhang Xiantao56c6d282007-11-18 20:43:21 +08002116 kvm_mmu_module_exit();
2117}
Zhang Xiantaof8c16bb2007-11-14 20:40:21 +08002118
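/*
 * kvm_emulate_halt(): with an in-kernel irqchip the vcpu simply blocks in
 * the kernel until it becomes runnable again (or the wait is interrupted,
 * returning -EINTR); with a userspace irqchip we exit with KVM_EXIT_HLT
 * and let userspace decide when to resume the vcpu.
 */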
Hollis Blanchard8776e512007-10-31 17:24:24 -05002119int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2120{
2121 ++vcpu->stat.halt_exits;
2122 if (irqchip_in_kernel(vcpu->kvm)) {
2123 vcpu->mp_state = VCPU_MP_STATE_HALTED;
2124 kvm_vcpu_block(vcpu);
2125 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
2126 return -EINTR;
2127 return 1;
2128 } else {
2129 vcpu->run->exit_reason = KVM_EXIT_HLT;
2130 return 0;
2131 }
2132}
2133EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2134
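/*
 * Hypercall ABI as implemented here: the call number is taken from RAX
 * and up to four arguments from RBX, RCX, RDX and RSI; the return value
 * goes back in RAX.  Outside long mode the inputs are truncated to 32
 * bits.  No hypercalls are wired up yet, so every number returns
 * -KVM_ENOSYS.
 */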
2135int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2136{
2137 unsigned long nr, a0, a1, a2, a3, ret;
2138
2139 kvm_x86_ops->cache_regs(vcpu);
2140
2141 nr = vcpu->regs[VCPU_REGS_RAX];
2142 a0 = vcpu->regs[VCPU_REGS_RBX];
2143 a1 = vcpu->regs[VCPU_REGS_RCX];
2144 a2 = vcpu->regs[VCPU_REGS_RDX];
2145 a3 = vcpu->regs[VCPU_REGS_RSI];
2146
2147 if (!is_long_mode(vcpu)) {
2148 nr &= 0xFFFFFFFF;
2149 a0 &= 0xFFFFFFFF;
2150 a1 &= 0xFFFFFFFF;
2151 a2 &= 0xFFFFFFFF;
2152 a3 &= 0xFFFFFFFF;
2153 }
2154
2155 switch (nr) {
2156 default:
2157 ret = -KVM_ENOSYS;
2158 break;
2159 }
2160 vcpu->regs[VCPU_REGS_RAX] = ret;
2161 kvm_x86_ops->decache_regs(vcpu);
2162 return 0;
2163}
2164EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2165
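/*
 * kvm_fix_hypercall() rewrites the guest's hypercall site with the
 * instruction the hardware actually wants (patch_hypercall() presumably
 * emits VMCALL or VMMCALL depending on the vendor), pushing the 3-byte
 * patch through the write emulator at the faulting RIP.
 */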
2166int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2167{
2168 char instruction[3];
2169 int ret = 0;
2170
2171 mutex_lock(&vcpu->kvm->lock);
2172
2173 /*
2174	 * Blow out the MMU so that no other VCPU keeps an active mapping,
2175	 * ensuring that the updated hypercall appears atomically across all
2176	 * VCPUs.
2177 */
2178 kvm_mmu_zap_all(vcpu->kvm);
2179
2180 kvm_x86_ops->cache_regs(vcpu);
2181 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2182 if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
2183 != X86EMUL_CONTINUE)
2184 ret = -EFAULT;
2185
2186 mutex_unlock(&vcpu->kvm->lock);
2187
2188 return ret;
2189}
2190
2191static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2192{
2193 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2194}
2195
2196void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2197{
2198 struct descriptor_table dt = { limit, base };
2199
2200 kvm_x86_ops->set_gdt(vcpu, &dt);
2201}
2202
2203void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2204{
2205 struct descriptor_table dt = { limit, base };
2206
2207 kvm_x86_ops->set_idt(vcpu, &dt);
2208}
2209
2210void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2211 unsigned long *rflags)
2212{
2213 lmsw(vcpu, msw);
2214 *rflags = kvm_x86_ops->get_rflags(vcpu);
2215}
2216
2217unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2218{
2219 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2220 switch (cr) {
2221 case 0:
2222 return vcpu->cr0;
2223 case 2:
2224 return vcpu->cr2;
2225 case 3:
2226 return vcpu->cr3;
2227 case 4:
2228 return vcpu->cr4;
2229 default:
2230 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2231 return 0;
2232 }
2233}
2234
2235void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2236 unsigned long *rflags)
2237{
2238 switch (cr) {
2239 case 0:
2240 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
2241 *rflags = kvm_x86_ops->get_rflags(vcpu);
2242 break;
2243 case 2:
2244 vcpu->cr2 = val;
2245 break;
2246 case 3:
2247 set_cr3(vcpu, val);
2248 break;
2249 case 4:
2250 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
2251 break;
2252 default:
2253 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2254 }
2255}
2256
Dan Kenigsberg07716712007-11-21 17:10:04 +02002257static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2258{
2259 struct kvm_cpuid_entry2 *e = &vcpu->cpuid_entries[i];
2260 int j, nent = vcpu->cpuid_nent;
2261
2262 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2263 /* when no next entry is found, the current entry[i] is reselected */
2264	for (j = i + 1; ; j = (j + 1) % nent) {
2265 struct kvm_cpuid_entry2 *ej = &vcpu->cpuid_entries[j];
2266 if (ej->function == e->function) {
2267 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2268 return j;
2269 }
2270 }
2271 return 0; /* silence gcc, even though control never reaches here */
2272}
2273
2274/* find an entry with matching function, matching index (if needed), and that
2275 * should be read next (if it's stateful) */
2276static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2277 u32 function, u32 index)
2278{
2279 if (e->function != function)
2280 return 0;
2281 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2282 return 0;
2283 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2284 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2285 return 0;
2286 return 1;
2287}
2288
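/*
 * CPUID emulation: look for an exact match on the requested leaf (and on
 * ECX where the index is significant), honouring the read-next marking
 * for stateful leaves such as 0x2.  Failing that, fall back to the
 * highest-numbered entry in the same basic/extended range, roughly
 * mirroring how real CPUs respond to out-of-range leaves; if nothing
 * matches at all the outputs stay zero.
 */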
Hollis Blanchard8776e512007-10-31 17:24:24 -05002289void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2290{
2291 int i;
Dan Kenigsberg07716712007-11-21 17:10:04 +02002292 u32 function, index;
2293 struct kvm_cpuid_entry2 *e, *best;
Hollis Blanchard8776e512007-10-31 17:24:24 -05002294
2295 kvm_x86_ops->cache_regs(vcpu);
2296 function = vcpu->regs[VCPU_REGS_RAX];
Dan Kenigsberg07716712007-11-21 17:10:04 +02002297 index = vcpu->regs[VCPU_REGS_RCX];
Hollis Blanchard8776e512007-10-31 17:24:24 -05002298 vcpu->regs[VCPU_REGS_RAX] = 0;
2299 vcpu->regs[VCPU_REGS_RBX] = 0;
2300 vcpu->regs[VCPU_REGS_RCX] = 0;
2301 vcpu->regs[VCPU_REGS_RDX] = 0;
2302 best = NULL;
2303 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2304 e = &vcpu->cpuid_entries[i];
Dan Kenigsberg07716712007-11-21 17:10:04 +02002305 if (is_matching_cpuid_entry(e, function, index)) {
2306 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2307 move_to_next_stateful_cpuid_entry(vcpu, i);
Hollis Blanchard8776e512007-10-31 17:24:24 -05002308 best = e;
2309 break;
2310 }
2311 /*
2312 * Both basic or both extended?
2313 */
2314 if (((e->function ^ function) & 0x80000000) == 0)
2315 if (!best || e->function > best->function)
2316 best = e;
2317 }
2318 if (best) {
2319 vcpu->regs[VCPU_REGS_RAX] = best->eax;
2320 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
2321 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
2322 vcpu->regs[VCPU_REGS_RDX] = best->edx;
2323 }
2324 kvm_x86_ops->decache_regs(vcpu);
2325 kvm_x86_ops->skip_emulated_instruction(vcpu);
2326}
2327EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
Hollis Blanchardd0752062007-10-31 17:24:25 -05002328
2329/*
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002330 * Check if userspace requested an interrupt window, and that the
2331 * interrupt window is open.
2332 *
2333 * No need to exit to userspace if we already have an interrupt queued.
2334 */
2335static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2336 struct kvm_run *kvm_run)
2337{
2338 return (!vcpu->irq_summary &&
2339 kvm_run->request_interrupt_window &&
2340 vcpu->interrupt_window_open &&
2341 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2342}
2343
2344static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2345 struct kvm_run *kvm_run)
2346{
2347 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2348 kvm_run->cr8 = get_cr8(vcpu);
2349 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2350 if (irqchip_in_kernel(vcpu->kvm))
2351 kvm_run->ready_for_interrupt_injection = 1;
2352 else
2353 kvm_run->ready_for_interrupt_injection =
2354 (vcpu->interrupt_window_open &&
2355 vcpu->irq_summary == 0);
2356}
2357
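/*
 * __vcpu_run() is the main execution loop: absorb a pending SIPI, reload
 * the MMU, inject any queued exception or interrupt, then enter the guest
 * with preemption and local interrupts disabled.  After VM-exit it
 * re-enters directly unless a signal is pending, userspace asked for an
 * interrupt window, or a reschedule is due.
 */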
2358static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2359{
2360 int r;
2361
2362 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2363 pr_debug("vcpu %d received sipi with vector # %x\n",
2364 vcpu->vcpu_id, vcpu->sipi_vector);
2365 kvm_lapic_reset(vcpu);
2366 r = kvm_x86_ops->vcpu_reset(vcpu);
2367 if (r)
2368 return r;
2369 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2370 }
2371
2372preempted:
2373 if (vcpu->guest_debug.enabled)
2374 kvm_x86_ops->guest_debug_pre(vcpu);
2375
2376again:
2377 r = kvm_mmu_reload(vcpu);
2378 if (unlikely(r))
2379 goto out;
2380
2381 kvm_inject_pending_timer_irqs(vcpu);
2382
2383 preempt_disable();
2384
2385 kvm_x86_ops->prepare_guest_switch(vcpu);
2386 kvm_load_guest_fpu(vcpu);
2387
2388 local_irq_disable();
2389
2390 if (signal_pending(current)) {
2391 local_irq_enable();
2392 preempt_enable();
2393 r = -EINTR;
2394 kvm_run->exit_reason = KVM_EXIT_INTR;
2395 ++vcpu->stat.signal_exits;
2396 goto out;
2397 }
2398
Avi Kivity298101d2007-11-25 13:41:11 +02002399 if (vcpu->exception.pending)
2400 __queue_exception(vcpu);
2401 else if (irqchip_in_kernel(vcpu->kvm))
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002402 kvm_x86_ops->inject_pending_irq(vcpu);
Avi Kivityeb9774f2007-11-25 17:45:31 +02002403 else
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002404 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2405
2406 vcpu->guest_mode = 1;
2407 kvm_guest_enter();
2408
2409 if (vcpu->requests)
2410 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2411 kvm_x86_ops->tlb_flush(vcpu);
2412
2413 kvm_x86_ops->run(vcpu, kvm_run);
2414
2415 vcpu->guest_mode = 0;
2416 local_irq_enable();
2417
2418 ++vcpu->stat.exits;
2419
2420 /*
2421 * We must have an instruction between local_irq_enable() and
2422 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2423 * the interrupt shadow. The stat.exits increment will do nicely.
2424 * But we need to prevent reordering, hence this barrier():
2425 */
2426 barrier();
2427
2428 kvm_guest_exit();
2429
2430 preempt_enable();
2431
2432 /*
2433 * Profile KVM exit RIPs:
2434 */
2435 if (unlikely(prof_on == KVM_PROFILING)) {
2436 kvm_x86_ops->cache_regs(vcpu);
2437 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2438 }
2439
Avi Kivity298101d2007-11-25 13:41:11 +02002440 if (vcpu->exception.pending && kvm_x86_ops->exception_injected(vcpu))
2441 vcpu->exception.pending = false;
2442
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002443 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2444
2445 if (r > 0) {
2446 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2447 r = -EINTR;
2448 kvm_run->exit_reason = KVM_EXIT_INTR;
2449 ++vcpu->stat.request_irq_exits;
2450 goto out;
2451 }
Avi Kivitye1beb1d2007-11-18 13:50:24 +02002452 if (!need_resched())
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002453 goto again;
Hollis Blanchardb6c7a5d2007-11-01 14:16:10 -05002454 }
2455
2456out:
2457 if (r > 0) {
2458 kvm_resched(vcpu);
2459 goto preempted;
2460 }
2461
2462 post_kvm_run_save(vcpu, kvm_run);
2463
2464 return r;
2465}
2466
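/*
 * KVM_RUN entry point.  Before re-entering the guest this finishes any
 * port or MMIO access that caused the previous exit, re-syncs the TPR
 * when the irqchip lives in userspace, and copies back a hypercall
 * return value if that is why we exited.  An uninitialized AP simply
 * blocks here and returns -EAGAIN until it receives a SIPI.
 */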
2467int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2468{
2469 int r;
2470 sigset_t sigsaved;
2471
2472 vcpu_load(vcpu);
2473
2474 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2475 kvm_vcpu_block(vcpu);
2476 vcpu_put(vcpu);
2477 return -EAGAIN;
2478 }
2479
2480 if (vcpu->sigset_active)
2481 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2482
2483 /* re-sync apic's tpr */
2484 if (!irqchip_in_kernel(vcpu->kvm))
2485 set_cr8(vcpu, kvm_run->cr8);
2486
2487 if (vcpu->pio.cur_count) {
2488 r = complete_pio(vcpu);
2489 if (r)
2490 goto out;
2491 }
2492#ifdef CONFIG_HAS_IOMEM
2493 if (vcpu->mmio_needed) {
2494 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2495 vcpu->mmio_read_completed = 1;
2496 vcpu->mmio_needed = 0;
2497 r = emulate_instruction(vcpu, kvm_run,
2498 vcpu->mmio_fault_cr2, 0, 1);
2499 if (r == EMULATE_DO_MMIO) {
2500 /*
2501 * Read-modify-write. Back to userspace.
2502 */
2503 r = 0;
2504 goto out;
2505 }
2506 }
2507#endif
2508 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2509 kvm_x86_ops->cache_regs(vcpu);
2510 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2511 kvm_x86_ops->decache_regs(vcpu);
2512 }
2513
2514 r = __vcpu_run(vcpu, kvm_run);
2515
2516out:
2517 if (vcpu->sigset_active)
2518 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2519
2520 vcpu_put(vcpu);
2521 return r;
2522}
2523
2524int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2525{
2526 vcpu_load(vcpu);
2527
2528 kvm_x86_ops->cache_regs(vcpu);
2529
2530 regs->rax = vcpu->regs[VCPU_REGS_RAX];
2531 regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2532 regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2533 regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2534 regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2535 regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2536 regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2537 regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2538#ifdef CONFIG_X86_64
2539 regs->r8 = vcpu->regs[VCPU_REGS_R8];
2540 regs->r9 = vcpu->regs[VCPU_REGS_R9];
2541 regs->r10 = vcpu->regs[VCPU_REGS_R10];
2542 regs->r11 = vcpu->regs[VCPU_REGS_R11];
2543 regs->r12 = vcpu->regs[VCPU_REGS_R12];
2544 regs->r13 = vcpu->regs[VCPU_REGS_R13];
2545 regs->r14 = vcpu->regs[VCPU_REGS_R14];
2546 regs->r15 = vcpu->regs[VCPU_REGS_R15];
2547#endif
2548
2549 regs->rip = vcpu->rip;
2550 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2551
2552 /*
2553 * Don't leak debug flags in case they were set for guest debugging
2554 */
2555 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2556 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2557
2558 vcpu_put(vcpu);
2559
2560 return 0;
2561}
2562
2563int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2564{
2565 vcpu_load(vcpu);
2566
2567 vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2568 vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2569 vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2570 vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2571 vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2572 vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2573 vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2574 vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2575#ifdef CONFIG_X86_64
2576 vcpu->regs[VCPU_REGS_R8] = regs->r8;
2577 vcpu->regs[VCPU_REGS_R9] = regs->r9;
2578 vcpu->regs[VCPU_REGS_R10] = regs->r10;
2579 vcpu->regs[VCPU_REGS_R11] = regs->r11;
2580 vcpu->regs[VCPU_REGS_R12] = regs->r12;
2581 vcpu->regs[VCPU_REGS_R13] = regs->r13;
2582 vcpu->regs[VCPU_REGS_R14] = regs->r14;
2583 vcpu->regs[VCPU_REGS_R15] = regs->r15;
2584#endif
2585
2586 vcpu->rip = regs->rip;
2587 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2588
2589 kvm_x86_ops->decache_regs(vcpu);
2590
2591 vcpu_put(vcpu);
2592
2593 return 0;
2594}
2595
2596static void get_segment(struct kvm_vcpu *vcpu,
2597 struct kvm_segment *var, int seg)
2598{
2599 return kvm_x86_ops->get_segment(vcpu, var, seg);
2600}
2601
2602void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2603{
2604 struct kvm_segment cs;
2605
2606 get_segment(vcpu, &cs, VCPU_SREG_CS);
2607 *db = cs.db;
2608 *l = cs.l;
2609}
2610EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2611
2612int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2613 struct kvm_sregs *sregs)
2614{
2615 struct descriptor_table dt;
2616 int pending_vec;
2617
2618 vcpu_load(vcpu);
2619
2620 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2621 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2622 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2623 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2624 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2625 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2626
2627 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2628 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2629
2630 kvm_x86_ops->get_idt(vcpu, &dt);
2631 sregs->idt.limit = dt.limit;
2632 sregs->idt.base = dt.base;
2633 kvm_x86_ops->get_gdt(vcpu, &dt);
2634 sregs->gdt.limit = dt.limit;
2635 sregs->gdt.base = dt.base;
2636
2637 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2638 sregs->cr0 = vcpu->cr0;
2639 sregs->cr2 = vcpu->cr2;
2640 sregs->cr3 = vcpu->cr3;
2641 sregs->cr4 = vcpu->cr4;
2642 sregs->cr8 = get_cr8(vcpu);
2643 sregs->efer = vcpu->shadow_efer;
2644 sregs->apic_base = kvm_get_apic_base(vcpu);
2645
2646 if (irqchip_in_kernel(vcpu->kvm)) {
2647 memset(sregs->interrupt_bitmap, 0,
2648 sizeof sregs->interrupt_bitmap);
2649 pending_vec = kvm_x86_ops->get_irq(vcpu);
2650 if (pending_vec >= 0)
2651 set_bit(pending_vec,
2652 (unsigned long *)sregs->interrupt_bitmap);
2653 } else
2654 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2655 sizeof sregs->interrupt_bitmap);
2656
2657 vcpu_put(vcpu);
2658
2659 return 0;
2660}
2661
2662static void set_segment(struct kvm_vcpu *vcpu,
2663 struct kvm_segment *var, int seg)
2664{
2665 return kvm_x86_ops->set_segment(vcpu, var, seg);
2666}
2667
2668int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2669 struct kvm_sregs *sregs)
2670{
2671 int mmu_reset_needed = 0;
2672 int i, pending_vec, max_bits;
2673 struct descriptor_table dt;
2674
2675 vcpu_load(vcpu);
2676
2677 dt.limit = sregs->idt.limit;
2678 dt.base = sregs->idt.base;
2679 kvm_x86_ops->set_idt(vcpu, &dt);
2680 dt.limit = sregs->gdt.limit;
2681 dt.base = sregs->gdt.base;
2682 kvm_x86_ops->set_gdt(vcpu, &dt);
2683
2684 vcpu->cr2 = sregs->cr2;
2685 mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2686 vcpu->cr3 = sregs->cr3;
2687
2688 set_cr8(vcpu, sregs->cr8);
2689
2690 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2691#ifdef CONFIG_X86_64
2692 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2693#endif
2694 kvm_set_apic_base(vcpu, sregs->apic_base);
2695
2696 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2697
2698 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2699 vcpu->cr0 = sregs->cr0;
2700 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2701
2702 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2703 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2704 if (!is_long_mode(vcpu) && is_pae(vcpu))
2705 load_pdptrs(vcpu, vcpu->cr3);
2706
2707 if (mmu_reset_needed)
2708 kvm_mmu_reset_context(vcpu);
2709
2710 if (!irqchip_in_kernel(vcpu->kvm)) {
2711 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2712 sizeof vcpu->irq_pending);
2713 vcpu->irq_summary = 0;
2714 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2715 if (vcpu->irq_pending[i])
2716 __set_bit(i, &vcpu->irq_summary);
2717 } else {
2718 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2719 pending_vec = find_first_bit(
2720 (const unsigned long *)sregs->interrupt_bitmap,
2721 max_bits);
2722		/* Only a pending external irq is handled here */
2723 if (pending_vec < max_bits) {
2724 kvm_x86_ops->set_irq(vcpu, pending_vec);
2725 pr_debug("Set back pending irq %d\n",
2726 pending_vec);
2727 }
2728 }
2729
2730 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2731 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2732 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2733 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2734 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2735 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2736
2737 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2738 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2739
2740 vcpu_put(vcpu);
2741
2742 return 0;
2743}
2744
2745int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2746 struct kvm_debug_guest *dbg)
2747{
2748 int r;
2749
2750 vcpu_load(vcpu);
2751
2752 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2753
2754 vcpu_put(vcpu);
2755
2756 return r;
2757}
2758
2759/*
Hollis Blanchardd0752062007-10-31 17:24:25 -05002760 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2761 * we have asm/x86/processor.h
2762 */
2763struct fxsave {
2764 u16 cwd;
2765 u16 swd;
2766 u16 twd;
2767 u16 fop;
2768 u64 rip;
2769 u64 rdp;
2770 u32 mxcsr;
2771 u32 mxcsr_mask;
2772 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2773#ifdef CONFIG_X86_64
2774 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2775#else
2776 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2777#endif
2778};
2779
Zhang Xiantao8b006792007-11-16 13:05:55 +08002780/*
2781 * Translate a guest virtual address to a guest physical address.
2782 */
2783int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2784 struct kvm_translation *tr)
2785{
2786 unsigned long vaddr = tr->linear_address;
2787 gpa_t gpa;
2788
2789 vcpu_load(vcpu);
2790 mutex_lock(&vcpu->kvm->lock);
2791 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2792 tr->physical_address = gpa;
2793 tr->valid = gpa != UNMAPPED_GVA;
2794 tr->writeable = 1;
2795 tr->usermode = 0;
2796 mutex_unlock(&vcpu->kvm->lock);
2797 vcpu_put(vcpu);
2798
2799 return 0;
2800}
2801
Hollis Blanchardd0752062007-10-31 17:24:25 -05002802int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2803{
2804 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2805
2806 vcpu_load(vcpu);
2807
2808 memcpy(fpu->fpr, fxsave->st_space, 128);
2809 fpu->fcw = fxsave->cwd;
2810 fpu->fsw = fxsave->swd;
2811 fpu->ftwx = fxsave->twd;
2812 fpu->last_opcode = fxsave->fop;
2813 fpu->last_ip = fxsave->rip;
2814 fpu->last_dp = fxsave->rdp;
2815 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2816
2817 vcpu_put(vcpu);
2818
2819 return 0;
2820}
2821
2822int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2823{
2824 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2825
2826 vcpu_load(vcpu);
2827
2828 memcpy(fxsave->st_space, fpu->fpr, 128);
2829 fxsave->cwd = fpu->fcw;
2830 fxsave->swd = fpu->fsw;
2831 fxsave->twd = fpu->ftwx;
2832 fxsave->fop = fpu->last_opcode;
2833 fxsave->rip = fpu->last_ip;
2834 fxsave->rdp = fpu->last_dp;
2835 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2836
2837 vcpu_put(vcpu);
2838
2839 return 0;
2840}
2841
2842void fx_init(struct kvm_vcpu *vcpu)
2843{
2844 unsigned after_mxcsr_mask;
2845
2846 /* Initialize guest FPU by resetting ours and saving into guest's */
2847 preempt_disable();
2848 fx_save(&vcpu->host_fx_image);
2849 fpu_init();
2850 fx_save(&vcpu->guest_fx_image);
2851 fx_restore(&vcpu->host_fx_image);
2852 preempt_enable();
2853
2854 vcpu->cr0 |= X86_CR0_ET;
2855 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
2856 vcpu->guest_fx_image.mxcsr = 0x1f80;
2857 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
2858 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2859}
2860EXPORT_SYMBOL_GPL(fx_init);
2861
2862void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2863{
2864 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2865 return;
2866
2867 vcpu->guest_fpu_loaded = 1;
2868 fx_save(&vcpu->host_fx_image);
2869 fx_restore(&vcpu->guest_fx_image);
2870}
2871EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2872
2873void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2874{
2875 if (!vcpu->guest_fpu_loaded)
2876 return;
2877
2878 vcpu->guest_fpu_loaded = 0;
2879 fx_save(&vcpu->guest_fx_image);
2880 fx_restore(&vcpu->host_fx_image);
Avi Kivityf096ed82007-11-18 13:54:33 +02002881 ++vcpu->stat.fpu_reload;
Hollis Blanchardd0752062007-10-31 17:24:25 -05002882}
2883EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002884
2885void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
2886{
2887 kvm_x86_ops->vcpu_free(vcpu);
2888}
2889
2890struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2891 unsigned int id)
2892{
Avi Kivity26e52152007-11-20 15:30:24 +02002893 return kvm_x86_ops->vcpu_create(kvm, id);
2894}
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002895
Avi Kivity26e52152007-11-20 15:30:24 +02002896int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2897{
2898 int r;
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002899
2900 /* We do fxsave: this must be aligned. */
2901 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2902
2903 vcpu_load(vcpu);
2904 r = kvm_arch_vcpu_reset(vcpu);
2905 if (r == 0)
2906 r = kvm_mmu_setup(vcpu);
2907 vcpu_put(vcpu);
2908 if (r < 0)
2909 goto free_vcpu;
2910
Avi Kivity26e52152007-11-20 15:30:24 +02002911 return 0;
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002912free_vcpu:
2913 kvm_x86_ops->vcpu_free(vcpu);
Avi Kivity26e52152007-11-20 15:30:24 +02002914 return r;
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002915}
2916
Hollis Blanchardd40ccc62007-11-19 14:04:43 -06002917void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
Zhang Xiantaoe9b11c12007-11-14 20:38:21 +08002918{
2919 vcpu_load(vcpu);
2920 kvm_mmu_unload(vcpu);
2921 vcpu_put(vcpu);
2922
2923 kvm_x86_ops->vcpu_free(vcpu);
2924}
2925
2926int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
2927{
2928 return kvm_x86_ops->vcpu_reset(vcpu);
2929}
2930
2931void kvm_arch_hardware_enable(void *garbage)
2932{
2933 kvm_x86_ops->hardware_enable(garbage);
2934}
2935
2936void kvm_arch_hardware_disable(void *garbage)
2937{
2938 kvm_x86_ops->hardware_disable(garbage);
2939}
2940
2941int kvm_arch_hardware_setup(void)
2942{
2943 return kvm_x86_ops->hardware_setup();
2944}
2945
2946void kvm_arch_hardware_unsetup(void)
2947{
2948 kvm_x86_ops->hardware_unsetup();
2949}
2950
2951void kvm_arch_check_processor_compat(void *rtn)
2952{
2953 kvm_x86_ops->check_processor_compatibility(rtn);
2954}
2955
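/*
 * Architecture-side vcpu construction: allocate the page shared with
 * userspace for PIO data, create the shadow MMU state and, when the
 * irqchip is in the kernel, the local APIC.  The BSP (vcpu 0) starts out
 * runnable; other vcpus wait uninitialized for a SIPI.
 */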
2956int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2957{
2958 struct page *page;
2959 struct kvm *kvm;
2960 int r;
2961
2962 BUG_ON(vcpu->kvm == NULL);
2963 kvm = vcpu->kvm;
2964
2965 vcpu->mmu.root_hpa = INVALID_PAGE;
2966 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
2967 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2968 else
2969 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
2970
2971 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2972 if (!page) {
2973 r = -ENOMEM;
2974 goto fail;
2975 }
2976 vcpu->pio_data = page_address(page);
2977
2978 r = kvm_mmu_create(vcpu);
2979 if (r < 0)
2980 goto fail_free_pio_data;
2981
2982 if (irqchip_in_kernel(kvm)) {
2983 r = kvm_create_lapic(vcpu);
2984 if (r < 0)
2985 goto fail_mmu_destroy;
2986 }
2987
2988 return 0;
2989
2990fail_mmu_destroy:
2991 kvm_mmu_destroy(vcpu);
2992fail_free_pio_data:
2993 free_page((unsigned long)vcpu->pio_data);
2994fail:
2995 return r;
2996}
2997
2998void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
2999{
3000 kvm_free_lapic(vcpu);
3001 kvm_mmu_destroy(vcpu);
3002 free_page((unsigned long)vcpu->pio_data);
3003}
Zhang Xiantaod19a9cd2007-11-18 18:43:45 +08003004
3005struct kvm *kvm_arch_create_vm(void)
3006{
3007 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3008
3009 if (!kvm)
3010 return ERR_PTR(-ENOMEM);
3011
3012 INIT_LIST_HEAD(&kvm->active_mmu_pages);
3013
3014 return kvm;
3015}
3016
3017static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3018{
3019 vcpu_load(vcpu);
3020 kvm_mmu_unload(vcpu);
3021 vcpu_put(vcpu);
3022}
3023
3024static void kvm_free_vcpus(struct kvm *kvm)
3025{
3026 unsigned int i;
3027
3028 /*
3029 * Unpin any mmu pages first.
3030 */
3031 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3032 if (kvm->vcpus[i])
3033 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3034 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3035 if (kvm->vcpus[i]) {
3036 kvm_arch_vcpu_free(kvm->vcpus[i]);
3037 kvm->vcpus[i] = NULL;
3038 }
3039 }
3040
3041}
3042
3043void kvm_arch_destroy_vm(struct kvm *kvm)
3044{
3045 kfree(kvm->vpic);
3046 kfree(kvm->vioapic);
3047 kvm_free_vcpus(kvm);
3048 kvm_free_physmem(kvm);
3049 kfree(kvm);
3050}
Zhang Xiantao0de10342007-11-20 16:25:04 +08003051
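/*
 * Memory-slot update hook.  For old-style (!user_alloc) slots KVM itself
 * mmaps or munmaps anonymous memory to back the guest range.  Afterwards
 * the shadow-MMU page budget is recalculated (unless userspace pinned it
 * explicitly), write access to the slot is removed so shadow mappings are
 * rebuilt, and remote TLBs are flushed.
 */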
3052int kvm_arch_set_memory_region(struct kvm *kvm,
3053 struct kvm_userspace_memory_region *mem,
3054 struct kvm_memory_slot old,
3055 int user_alloc)
3056{
3057 int npages = mem->memory_size >> PAGE_SHIFT;
3058 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3059
3060	/* To keep backward compatibility with older userspace,
3061	 * x86 needs to handle the !user_alloc case.
3062	 */
3063 if (!user_alloc) {
3064 if (npages && !old.rmap) {
3065 down_write(&current->mm->mmap_sem);
3066 memslot->userspace_addr = do_mmap(NULL, 0,
3067 npages * PAGE_SIZE,
3068 PROT_READ | PROT_WRITE,
3069 MAP_SHARED | MAP_ANONYMOUS,
3070 0);
3071 up_write(&current->mm->mmap_sem);
3072
3073 if (IS_ERR((void *)memslot->userspace_addr))
3074 return PTR_ERR((void *)memslot->userspace_addr);
3075 } else {
3076 if (!old.user_alloc && old.rmap) {
3077 int ret;
3078
3079 down_write(&current->mm->mmap_sem);
3080 ret = do_munmap(current->mm, old.userspace_addr,
3081 old.npages * PAGE_SIZE);
3082 up_write(&current->mm->mmap_sem);
3083 if (ret < 0)
3084 printk(KERN_WARNING
3085 "kvm_vm_ioctl_set_memory_region: "
3086 "failed to munmap memory\n");
3087 }
3088 }
3089 }
3090
3091 if (!kvm->n_requested_mmu_pages) {
3092 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3093 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3094 }
3095
3096 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3097 kvm_flush_remote_tlbs(kvm);
3098
3099 return 0;
3100}