diff options
author | Linus Torvalds | 2007-10-13 10:02:11 -0700 |
---|---|---|
committer | Linus Torvalds | 2007-10-13 10:02:11 -0700 |
commit | 3749c66c67fb5c257771815c186bc32290cacf44 (patch) | |
tree | de6634f722a9b79c60fabbd605660e46741f7160 | |
parent | 835c34a1687f524c37d4fb8bad18d642c74bed8d (diff) | |
parent | 8a45450d0a559912873428077908f9bc1411042c (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (106 commits)
KVM: Replace enum by #define
KVM: Skip pio instruction when it is emulated, not executed
KVM: x86 emulator: popf
KVM: x86 emulator: fix src, dst value initialization
KVM: x86 emulator: jmp abs
KVM: x86 emulator: lea
KVM: X86 emulator: jump conditional short
KVM: x86 emulator: imlpement jump conditional relative
KVM: x86 emulator: sort opcodes into ascending order
KVM: Improve emulation failure reporting
KVM: x86 emulator: pushf
KVM: x86 emulator: call near
KVM: x86 emulator: push imm8
KVM: VMX: Fix exit qualification width on i386
KVM: Move main vcpu loop into subarch independent code
KVM: VMX: Move vm entry failure handling to the exit handler
KVM: MMU: Don't do GFP_NOWAIT allocations
KVM: Rename kvm_arch_ops to kvm_x86_ops
KVM: Simplify memory allocation
KVM: Hoist SVM's get_cs_db_l_bits into core code.
...
-rw-r--r-- | drivers/kvm/Kconfig | 1 | ||||
-rw-r--r-- | drivers/kvm/Makefile | 2 | ||||
-rw-r--r-- | drivers/kvm/i8259.c | 450 | ||||
-rw-r--r-- | drivers/kvm/ioapic.c | 388 | ||||
-rw-r--r-- | drivers/kvm/irq.c | 98 | ||||
-rw-r--r-- | drivers/kvm/irq.h | 165 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 201 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 1486 | ||||
-rw-r--r-- | drivers/kvm/kvm_svm.h | 3 | ||||
-rw-r--r-- | drivers/kvm/lapic.c | 1064 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 51 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 84 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 1046 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 1034 | ||||
-rw-r--r-- | drivers/kvm/vmx.h | 73 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.c | 411 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.h | 20 | ||||
-rw-r--r-- | include/asm-x86/io_apic_32.h | 16 | ||||
-rw-r--r-- | include/asm-x86/processor-flags.h | 2 | ||||
-rw-r--r-- | include/linux/kvm.h | 128 |
20 files changed, 4848 insertions, 1875 deletions
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index 0a419a0de603..8749fa4ffcee 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -17,6 +17,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on X86 && EXPERIMENTAL + select PREEMPT_NOTIFIERS select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile index c0a789fa9d65..e5a8f4d3e973 100644 --- a/drivers/kvm/Makefile +++ b/drivers/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -kvm-objs := kvm_main.o mmu.o x86_emulate.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c new file mode 100644 index 000000000000..a679157bc599 --- /dev/null +++ b/drivers/kvm/i8259.c @@ -0,0 +1,450 @@ +/* + * 8259 interrupt controller emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2007 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * Port from Qemu. + */ +#include <linux/mm.h> +#include "irq.h" + +/* + * set irq level. If an edge is detected, then the IRR is set to 1 + */ +static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +{ + int mask; + mask = 1 << irq; + if (s->elcr & mask) /* level triggered */ + if (level) { + s->irr |= mask; + s->last_irr |= mask; + } else { + s->irr &= ~mask; + s->last_irr &= ~mask; + } + else /* edge triggered */ + if (level) { + if ((s->last_irr & mask) == 0) + s->irr |= mask; + s->last_irr |= mask; + } else + s->last_irr &= ~mask; +} + +/* + * return the highest priority found in mask (highest = smallest + * number). Return 8 if no irq + */ +static inline int get_priority(struct kvm_kpic_state *s, int mask) +{ + int priority; + if (mask == 0) + return 8; + priority = 0; + while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) + priority++; + return priority; +} + +/* + * return the pic wanted interrupt. return -1 if none + */ +static int pic_get_irq(struct kvm_kpic_state *s) +{ + int mask, cur_priority, priority; + + mask = s->irr & ~s->imr; + priority = get_priority(s, mask); + if (priority == 8) + return -1; + /* + * compute current priority. If special fully nested mode on the + * master, the IRQ coming from the slave is not taken into account + * for the priority computation. + */ + mask = s->isr; + if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) + mask &= ~(1 << 2); + cur_priority = get_priority(s, mask); + if (priority < cur_priority) + /* + * higher priority found: an irq should be generated + */ + return (priority + s->priority_add) & 7; + else + return -1; +} + +/* + * raise irq to CPU if necessary. must be called every time the active + * irq may change + */ +static void pic_update_irq(struct kvm_pic *s) +{ + int irq2, irq; + + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) { + /* + * if irq request by slave pic, signal master PIC + */ + pic_set_irq1(&s->pics[0], 2, 1); + pic_set_irq1(&s->pics[0], 2, 0); + } + irq = pic_get_irq(&s->pics[0]); + if (irq >= 0) + s->irq_request(s->irq_request_opaque, 1); + else + s->irq_request(s->irq_request_opaque, 0); +} + +void kvm_pic_update_irq(struct kvm_pic *s) +{ + pic_update_irq(s); +} + +void kvm_pic_set_irq(void *opaque, int irq, int level) +{ + struct kvm_pic *s = opaque; + + pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + pic_update_irq(s); +} + +/* + * acknowledge interrupt 'irq' + */ +static inline void pic_intack(struct kvm_kpic_state *s, int irq) +{ + if (s->auto_eoi) { + if (s->rotate_on_auto_eoi) + s->priority_add = (irq + 1) & 7; + } else + s->isr |= (1 << irq); + /* + * We don't clear a level sensitive interrupt here + */ + if (!(s->elcr & (1 << irq))) + s->irr &= ~(1 << irq); +} + +int kvm_pic_read_irq(struct kvm_pic *s) +{ + int irq, irq2, intno; + + irq = pic_get_irq(&s->pics[0]); + if (irq >= 0) { + pic_intack(&s->pics[0], irq); + if (irq == 2) { + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) + pic_intack(&s->pics[1], irq2); + else + /* + * spurious IRQ on slave controller + */ + irq2 = 7; + intno = s->pics[1].irq_base + irq2; + irq = irq2 + 8; + } else + intno = s->pics[0].irq_base + irq; + } else { + /* + * spurious IRQ on host controller + */ + irq = 7; + intno = s->pics[0].irq_base + irq; + } + pic_update_irq(s); + + return intno; +} + +static void pic_reset(void *opaque) +{ + struct kvm_kpic_state *s = opaque; + + s->last_irr = 0; + s->irr = 0; + s->imr = 0; + s->isr = 0; + s->priority_add = 0; + s->irq_base = 0; + s->read_reg_select = 0; + s->poll = 0; + s->special_mask = 0; + s->init_state = 0; + s->auto_eoi = 0; + s->rotate_on_auto_eoi = 0; + s->special_fully_nested_mode = 0; + s->init4 = 0; +} + +static void pic_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + int priority, cmd, irq; + + addr &= 1; + if (addr == 0) { + if (val & 0x10) { + pic_reset(s); /* init */ + /* + * deassert a pending interrupt + */ + s->pics_state->irq_request(s->pics_state-> + irq_request_opaque, 0); + s->init_state = 1; + s->init4 = val & 1; + if (val & 0x02) + printk(KERN_ERR "single mode not supported"); + if (val & 0x08) + printk(KERN_ERR + "level sensitive irq not supported"); + } else if (val & 0x08) { + if (val & 0x04) + s->poll = 1; + if (val & 0x02) + s->read_reg_select = val & 1; + if (val & 0x40) + s->special_mask = (val >> 5) & 1; + } else { + cmd = val >> 5; + switch (cmd) { + case 0: + case 4: + s->rotate_on_auto_eoi = cmd >> 2; + break; + case 1: /* end of interrupt */ + case 5: + priority = get_priority(s, s->isr); + if (priority != 8) { + irq = (priority + s->priority_add) & 7; + s->isr &= ~(1 << irq); + if (cmd == 5) + s->priority_add = (irq + 1) & 7; + pic_update_irq(s->pics_state); + } + break; + case 3: + irq = val & 7; + s->isr &= ~(1 << irq); + pic_update_irq(s->pics_state); + break; + case 6: + s->priority_add = (val + 1) & 7; + pic_update_irq(s->pics_state); + break; + case 7: + irq = val & 7; + s->isr &= ~(1 << irq); + s->priority_add = (irq + 1) & 7; + pic_update_irq(s->pics_state); + break; + default: + break; /* no operation */ + } + } + } else + switch (s->init_state) { + case 0: /* normal mode */ + s->imr = val; + pic_update_irq(s->pics_state); + break; + case 1: + s->irq_base = val & 0xf8; + s->init_state = 2; + break; + case 2: + if (s->init4) + s->init_state = 3; + else + s->init_state = 0; + break; + case 3: + s->special_fully_nested_mode = (val >> 4) & 1; + s->auto_eoi = (val >> 1) & 1; + s->init_state = 0; + break; + } +} + +static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) +{ + int ret; + + ret = pic_get_irq(s); + if (ret >= 0) { + if (addr1 >> 7) { + s->pics_state->pics[0].isr &= ~(1 << 2); + s->pics_state->pics[0].irr &= ~(1 << 2); + } + s->irr &= ~(1 << ret); + s->isr &= ~(1 << ret); + if (addr1 >> 7 || ret != 2) + pic_update_irq(s->pics_state); + } else { + ret = 0x07; + pic_update_irq(s->pics_state); + } + + return ret; +} + +static u32 pic_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + unsigned int addr; + int ret; + + addr = addr1; + addr &= 1; + if (s->poll) { + ret = pic_poll_read(s, addr1); + s->poll = 0; + } else + if (addr == 0) + if (s->read_reg_select) + ret = s->isr; + else + ret = s->irr; + else + ret = s->imr; + return ret; +} + +static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + s->elcr = val & s->elcr_mask; +} + +static u32 elcr_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + return s->elcr; +} + +static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) +{ + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + case 0x4d0: + case 0x4d1: + return 1; + default: + return 0; + } +} + +static void picdev_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_pic *s = this->private; + unsigned char data = *(unsigned char *)val; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte write\n"); + return; + } + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + pic_ioport_write(&s->pics[addr >> 7], addr, data); + break; + case 0x4d0: + case 0x4d1: + elcr_ioport_write(&s->pics[addr & 1], addr, data); + break; + } +} + +static void picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) +{ + struct kvm_pic *s = this->private; + unsigned char data = 0; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte read\n"); + return; + } + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + data = pic_ioport_read(&s->pics[addr >> 7], addr); + break; + case 0x4d0: + case 0x4d1: + data = elcr_ioport_read(&s->pics[addr & 1], addr); + break; + } + *(unsigned char *)val = data; +} + +/* + * callback when PIC0 irq status changed + */ +static void pic_irq_request(void *opaque, int level) +{ + struct kvm *kvm = opaque; + struct kvm_vcpu *vcpu = kvm->vcpus[0]; + + pic_irqchip(kvm)->output = level; + if (vcpu) + kvm_vcpu_kick(vcpu); +} + +struct kvm_pic *kvm_create_pic(struct kvm *kvm) +{ + struct kvm_pic *s; + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + if (!s) + return NULL; + s->pics[0].elcr_mask = 0xf8; + s->pics[1].elcr_mask = 0xde; + s->irq_request = pic_irq_request; + s->irq_request_opaque = kvm; + s->pics[0].pics_state = s; + s->pics[1].pics_state = s; + + /* + * Initialize PIO device + */ + s->dev.read = picdev_read; + s->dev.write = picdev_write; + s->dev.in_range = picdev_in_range; + s->dev.private = s; + kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); + return s; +} diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c new file mode 100644 index 000000000000..c7992e667fdb --- /dev/null +++ b/drivers/kvm/ioapic.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 2001 MandrakeSoft S.A. + * + * MandrakeSoft S.A. + * 43, rue d'Aboukir + * 75002 Paris - France + * http://www.linux-mandrake.com/ + * http://www.mandrakesoft.com/ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Yunhong Jiang <yunhong.jiang@intel.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * Based on Xen 3.1 code. + */ + +#include "kvm.h" +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/apicdef.h> +#include <asm/io_apic.h> +#include "irq.h" +/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ +#define ioapic_debug(fmt, arg...) +static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); + +static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, + unsigned long addr, + unsigned long length) +{ + unsigned long result = 0; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) + | (IOAPIC_VERSION_ID & 0xff)); + break; + + case IOAPIC_REG_APIC_ID: + case IOAPIC_REG_ARB_ID: + result = ((ioapic->id & 0xf) << 24); + break; + + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; + u64 redir_content; + + ASSERT(redir_index < IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[redir_index].bits; + result = (ioapic->ioregsel & 0x1) ? + (redir_content >> 32) & 0xffffffff : + redir_content & 0xffffffff; + break; + } + } + + return result; +} + +static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) +{ + union ioapic_redir_entry *pent; + + pent = &ioapic->redirtbl[idx]; + + if (!pent->fields.mask) { + ioapic_deliver(ioapic, idx); + if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) + pent->fields.remote_irr = 1; + } + if (!pent->fields.trig_mode) + ioapic->irr &= ~(1 << idx); +} + +static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) +{ + unsigned index; + + switch (ioapic->ioregsel) { + case IOAPIC_REG_VERSION: + /* Writes are ignored. */ + break; + + case IOAPIC_REG_APIC_ID: + ioapic->id = (val >> 24) & 0xf; + break; + + case IOAPIC_REG_ARB_ID: + break; + + default: + index = (ioapic->ioregsel - 0x10) >> 1; + + ioapic_debug("change redir index %x val %x", index, val); + if (index >= IOAPIC_NUM_PINS) + return; + if (ioapic->ioregsel & 1) { + ioapic->redirtbl[index].bits &= 0xffffffff; + ioapic->redirtbl[index].bits |= (u64) val << 32; + } else { + ioapic->redirtbl[index].bits &= ~0xffffffffULL; + ioapic->redirtbl[index].bits |= (u32) val; + ioapic->redirtbl[index].fields.remote_irr = 0; + } + if (ioapic->irr & (1 << index)) + ioapic_service(ioapic, index); + break; + } +} + +static void ioapic_inj_irq(struct kvm_ioapic *ioapic, + struct kvm_lapic *target, + u8 vector, u8 trig_mode, u8 delivery_mode) +{ + ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, + delivery_mode); + + ASSERT((delivery_mode == dest_Fixed) || + (delivery_mode == dest_LowestPrio)); + + kvm_apic_set_irq(target, vector, trig_mode); +} + +static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, + u8 dest_mode) +{ + u32 mask = 0; + int i; + struct kvm *kvm = ioapic->kvm; + struct kvm_vcpu *vcpu; + + ioapic_debug("dest %d dest_mode %d", dest, dest_mode); + + if (dest_mode == 0) { /* Physical mode. */ + if (dest == 0xFF) { /* Broadcast. */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i] && kvm->vcpus[i]->apic) + mask |= 1 << i; + return mask; + } + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { + if (vcpu->apic) + mask = 1 << i; + break; + } + } + } else if (dest != 0) /* Logical mode, MDA non-zero. */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (vcpu->apic && + kvm_apic_match_logical_addr(vcpu->apic, dest)) + mask |= 1 << vcpu->vcpu_id; + } + ioapic_debug("mask %x", mask); + return mask; +} + +static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) +{ + u8 dest = ioapic->redirtbl[irq].fields.dest_id; + u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; + u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; + u8 vector = ioapic->redirtbl[irq].fields.vector; + u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; + u32 deliver_bitmask; + struct kvm_lapic *target; + struct kvm_vcpu *vcpu; + int vcpu_id; + + ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " + "vector=%x trig_mode=%x", + dest, dest_mode, delivery_mode, vector, trig_mode); + + deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); + if (!deliver_bitmask) { + ioapic_debug("no target on destination"); + return; + } + + switch (delivery_mode) { + case dest_LowestPrio: + target = + kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); + if (target != NULL) + ioapic_inj_irq(ioapic, target, vector, + trig_mode, delivery_mode); + else + ioapic_debug("null round robin: " + "mask=%x vector=%x delivery_mode=%x", + deliver_bitmask, vector, dest_LowestPrio); + break; + case dest_Fixed: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask & (1 << vcpu_id))) + continue; + deliver_bitmask &= ~(1 << vcpu_id); + vcpu = ioapic->kvm->vcpus[vcpu_id]; + if (vcpu) { + target = vcpu->apic; + ioapic_inj_irq(ioapic, target, vector, + trig_mode, delivery_mode); + } + } + break; + + /* TODO: NMI */ + default: + printk(KERN_WARNING "Unsupported delivery mode %d\n", + delivery_mode); + break; + } +} + +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) +{ + u32 old_irr = ioapic->irr; + u32 mask = 1 << irq; + union ioapic_redir_entry entry; + + if (irq >= 0 && irq < IOAPIC_NUM_PINS) { + entry = ioapic->redirtbl[irq]; + level ^= entry.fields.polarity; + if (!level) + ioapic->irr &= ~mask; + else { + ioapic->irr |= mask; + if ((!entry.fields.trig_mode && old_irr != ioapic->irr) + || !entry.fields.remote_irr) + ioapic_service(ioapic, irq); + } + } +} + +static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) +{ + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + if (ioapic->redirtbl[i].fields.vector == vector) + return i; + return -1; +} + +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->vioapic; + union ioapic_redir_entry *ent; + int gsi; + + gsi = get_eoi_gsi(ioapic, vector); + if (gsi == -1) { + printk(KERN_WARNING "Can't find redir item for %d EOI\n", + vector); + return; + } + + ent = &ioapic->redirtbl[gsi]; + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) + ioapic_deliver(ioapic, gsi); +} + +static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + + return ((addr >= ioapic->base_address && + (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); +} + +static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + u32 result; + + ioapic_debug("addr %lx", (unsigned long)addr); + ASSERT(!(addr & 0xf)); /* check alignment */ + + addr &= 0xff; + switch (addr) { + case IOAPIC_REG_SELECT: + result = ioapic->ioregsel; + break; + + case IOAPIC_REG_WINDOW: + result = ioapic_read_indirect(ioapic, addr, len); + break; + + default: + result = 0; + break; + } + switch (len) { + case 8: + *(u64 *) val = result; + break; + case 1: + case 2: + case 4: + memcpy(val, (char *)&result, len); + break; + default: + printk(KERN_WARNING "ioapic: wrong length %d\n", len); + } +} + +static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, + const void *val) +{ + struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; + u32 data; + + ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", + addr, len, val); + ASSERT(!(addr & 0xf)); /* check alignment */ + if (len == 4 || len == 8) + data = *(u32 *) val; + else { + printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); + return; + } + + addr &= 0xff; + switch (addr) { + case IOAPIC_REG_SELECT: + ioapic->ioregsel = data; + break; + + case IOAPIC_REG_WINDOW: + ioapic_write_indirect(ioapic, data); + break; + + default: + break; + } +} + +int kvm_ioapic_init(struct kvm *kvm) +{ + struct kvm_ioapic *ioapic; + int i; + + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + if (!ioapic) + return -ENOMEM; + kvm->vioapic = ioapic; + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + ioapic->dev.read = ioapic_mmio_read; + ioapic->dev.write = ioapic_mmio_write; + ioapic->dev.in_range = ioapic_in_range; + ioapic->dev.private = ioapic; + ioapic->kvm = kvm; + kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); + return 0; +} diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c new file mode 100644 index 000000000000..7628c7ff628f --- /dev/null +++ b/drivers/kvm/irq.c @@ -0,0 +1,98 @@ +/* + * irq.c: API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#include <linux/module.h> + +#include "kvm.h" +#include "irq.h" + +/* + * check if there is pending interrupt without + * intack. + */ +int kvm_cpu_has_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); /* PIC */ + return s->output; + } else + return 0; + } + return 1; +} +EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); + +/* + * Read pending interrupt vector and intack. + */ +int kvm_cpu_get_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + int vector; + + vector = kvm_get_apic_interrupt(v); /* APIC */ + if (vector == -1) { + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); + s->output = 0; /* PIC */ + vector = kvm_pic_read_irq(s); + } + } + return vector; +} +EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); + +static void vcpu_kick_intr(void *info) +{ +#ifdef DEBUG + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; + printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); +#endif +} + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int ipi_pcpu = vcpu->cpu; + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + if (vcpu->guest_mode) + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); +} + +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) +{ + kvm_inject_apic_timer_irqs(vcpu); + /* TODO: PIT, RTC etc. */ +} +EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); + +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) +{ + kvm_apic_timer_intr_post(vcpu, vec); + /* TODO: PIT, RTC etc. */ +} +EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h new file mode 100644 index 000000000000..11fc014e2b30 --- /dev/null +++ b/drivers/kvm/irq.h @@ -0,0 +1,165 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include "kvm.h" + +typedef void irq_request_func(void *opaque, int level); + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; + int output; /* intr from master PIC */ + struct kvm_io_device dev; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +int kvm_cpu_has_interrupt(struct kvm_vcpu *v); +void kvm_pic_update_irq(struct kvm_pic *s); + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. */ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union ioapic_redir_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; + } redirtbl[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; +}; + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct { + atomic_t pending; + s64 period; /* unit: ns */ + u32 divide_count; + ktime_t last_update; + struct hrtimer dev; + } timer; + struct kvm_vcpu *vcpu; + struct page *regs_page; + void *regs; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_free_apic(struct kvm_lapic *apic); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, + unsigned long bitmap); +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h index 336be86c6f5a..ad0813843adc 100644 --- a/drivers/kvm/kvm.h +++ b/drivers/kvm/kvm.h @@ -13,60 +13,38 @@ #include <linux/signal.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/preempt.h> #include <asm/signal.h> -#include "vmx.h" #include <linux/kvm.h> #include <linux/kvm_para.h> -#define CR0_PE_MASK (1ULL << 0) -#define CR0_MP_MASK (1ULL << 1) -#define CR0_TS_MASK (1ULL << 3) -#define CR0_NE_MASK (1ULL << 5) -#define CR0_WP_MASK (1ULL << 16) -#define CR0_NW_MASK (1ULL << 29) -#define CR0_CD_MASK (1ULL << 30) -#define CR0_PG_MASK (1ULL << 31) - -#define CR3_WPT_MASK (1ULL << 3) -#define CR3_PCD_MASK (1ULL << 4) - -#define CR3_RESEVED_BITS 0x07ULL -#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL) -#define CR3_FLAGS_MASK ((1ULL << 5) - 1) - -#define CR4_VME_MASK (1ULL << 0) -#define CR4_PSE_MASK (1ULL << 4) -#define CR4_PAE_MASK (1ULL << 5) -#define CR4_PGE_MASK (1ULL << 7) -#define CR4_VMXE_MASK (1ULL << 13) +#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) +#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL) #define KVM_GUEST_CR0_MASK \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ - | CR0_NW_MASK | CR0_CD_MASK) + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ + | X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON \ - (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ - | CR0_MP_MASK) + (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ + | X86_CR0_MP) #define KVM_GUEST_CR4_MASK \ - (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) #define KVM_MAX_VCPUS 4 #define KVM_ALIAS_SLOTS 4 -#define KVM_MEMORY_SLOTS 4 +#define KVM_MEMORY_SLOTS 8 #define KVM_NUM_MMU_PAGES 1024 #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 -#define FX_IMAGE_SIZE 512 -#define FX_IMAGE_ALIGN 16 -#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) - #define DE_VECTOR 0 #define NM_VECTOR 7 #define DF_VECTOR 8 @@ -158,15 +136,8 @@ struct kvm_mmu_page { }; }; -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -#define vmx_msr_entry kvm_msr_entry - struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level @@ -260,6 +231,7 @@ struct kvm_stat { u32 signal_exits; u32 irq_window_exits; u32 halt_exits; + u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; u32 light_exits; @@ -328,21 +300,17 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_vcpu { struct kvm *kvm; - union { - struct vmcs *vmcs; - struct vcpu_svm *svm; - }; + struct preempt_notifier preempt_notifier; + int vcpu_id; struct mutex mutex; int cpu; - int launched; u64 host_tsc; struct kvm_run *run; int interrupt_window_open; int guest_mode; unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ -#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) - unsigned long irq_pending[NR_IRQ_WORDS]; + DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ @@ -357,15 +325,15 @@ struct kvm_vcpu { u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; + struct kvm_lapic *apic; /* kernel irqchip context */ +#define VCPU_MP_STATE_RUNNABLE 0 +#define VCPU_MP_STATE_UNINITIALIZED 1 +#define VCPU_MP_STATE_INIT_RECEIVED 2 +#define VCPU_MP_STATE_SIPI_RECEIVED 3 +#define VCPU_MP_STATE_HALTED 4 + int mp_state; + int sipi_vector; u64 ia32_misc_enable_msr; - int nmsrs; - int save_nmsrs; - int msr_offset_efer; -#ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; -#endif - struct vmx_msr_entry *guest_msrs; - struct vmx_msr_entry *host_msrs; struct kvm_mmu mmu; @@ -379,16 +347,10 @@ struct kvm_vcpu { struct kvm_guest_debug guest_debug; - char fx_buf[FX_BUF_SIZE]; - char *host_fx_image; - char *guest_fx_image; + struct i387_fxsave_struct host_fx_image; + struct i387_fxsave_struct guest_fx_image; int fpu_active; int guest_fpu_loaded; - struct vmx_host_state { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; - int fs_gs_ldt_reload_needed; - } vmx_host_state; int mmio_needed; int mmio_read_completed; @@ -399,6 +361,7 @@ struct kvm_vcpu { gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; + wait_queue_head_t wq; int sigset_active; sigset_t sigset; @@ -436,7 +399,7 @@ struct kvm_memory_slot { }; struct kvm { - spinlock_t lock; /* protects everything except vcpus */ + struct mutex lock; /* protects everything except vcpus */ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; int nmemslots; @@ -447,39 +410,59 @@ struct kvm { struct list_head active_mmu_pages; int n_free_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; - int nvcpus; - struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; - int memory_config_version; - int busy; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; unsigned long rmap_overflow; struct list_head vm_list; struct file *filp; struct kvm_io_bus mmio_bus; struct kvm_io_bus pio_bus; + struct kvm_pic *vpic; + struct kvm_ioapic *vioapic; + int round_robin_prev_vcpu; }; +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->vpic; +} + +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->vioapic; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return pic_irqchip(kvm) != 0; +} + struct descriptor_table { u16 limit; unsigned long base; } __attribute__((packed)); -struct kvm_arch_ops { +struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ void (*hardware_enable)(void *dummy); /* __init */ void (*hardware_disable)(void *dummy); + void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ - int (*vcpu_create)(struct kvm_vcpu *vcpu); + /* Create, but do not attach this VCPU */ + struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu); - void (*vcpu_load)(struct kvm_vcpu *vcpu); + void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); + void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); void (*vcpu_decache)(struct kvm_vcpu *vcpu); int (*set_guest_debug)(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg); + void (*guest_debug_pre)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); @@ -505,27 +488,43 @@ struct kvm_arch_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr); void (*tlb_flush)(struct kvm_vcpu *vcpu); void (*inject_page_fault)(struct kvm_vcpu *vcpu, unsigned long addr, u32 err_code); void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); - int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); + int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); + int (*get_irq)(struct kvm_vcpu *vcpu); + void (*set_irq)(struct kvm_vcpu *vcpu, int vec); + void (*inject_pending_irq)(struct kvm_vcpu *vcpu); + void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, + struct kvm_run *run); }; -extern struct kvm_arch_ops *kvm_arch_ops; +extern struct kvm_x86_ops *kvm_x86_ops; + +/* The guest did something we don't support. */ +#define pr_unimpl(vcpu, fmt, ...) \ + do { \ + if (printk_ratelimit()) \ + printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ + } while(0) #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) -int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); -void kvm_exit_arch(void); +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module); +void kvm_exit_x86(void); int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -545,8 +544,6 @@ static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_emulator_want_group7_invlpg(void); - extern hpa_t bad_page_address; struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); @@ -561,6 +558,7 @@ enum emulation_result { int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, u16 error_code); +void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, @@ -574,9 +572,11 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned long count, int string, int down, - gva_t address, int rep, unsigned port); +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned port); +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); @@ -590,34 +590,33 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); +unsigned long get_cr8(struct kvm_vcpu *vcpu); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); void fx_init(struct kvm_vcpu *vcpu); -void load_msrs(struct vmx_msr_entry *e, int n); -void save_msrs(struct vmx_msr_entry *e, int n); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); -int kvm_read_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *dest); - -int kvm_write_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *data); +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); unsigned long segment_base(u16 selector); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *old, const u8 *new, int bytes); + const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); @@ -656,17 +655,17 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->cr4 & CR4_PAE_MASK; + return vcpu->cr4 & X86_CR4_PAE; } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->cr4 & CR4_PSE_MASK; + return vcpu->cr4 & X86_CR4_PSE; } static inline int is_paging(struct kvm_vcpu *vcpu) { - return vcpu->cr0 & CR0_PG_MASK; + return vcpu->cr0 & X86_CR0_PG; } static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) @@ -746,12 +745,12 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline void fx_save(void *image) +static inline void fx_save(struct i387_fxsave_struct *image) { asm ("fxsave (%0)":: "r" (image)); } -static inline void fx_restore(void *image) +static inline void fx_restore(struct i387_fxsave_struct *image) { asm ("fxrstor (%0)":: "r" (image)); } diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c index cd0557954e50..353e58527d15 100644 --- a/drivers/kvm/kvm_main.c +++ b/drivers/kvm/kvm_main.c @@ -18,6 +18,7 @@ #include "kvm.h" #include "x86_emulate.h" #include "segment_descriptor.h" +#include "irq.h" #include <linux/kvm.h> #include <linux/module.h> @@ -37,6 +38,7 @@ #include <linux/cpumask.h> #include <linux/smp.h> #include <linux/anon_inodes.h> +#include <linux/profile.h> #include <asm/processor.h> #include <asm/msr.h> @@ -52,9 +54,11 @@ static LIST_HEAD(vm_list); static cpumask_t cpus_hardware_enabled; -struct kvm_arch_ops *kvm_arch_ops; +struct kvm_x86_ops *kvm_x86_ops; +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); -static void hardware_disable(void *ignored); +static __read_mostly struct preempt_ops kvm_preempt_ops; #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) @@ -73,6 +77,7 @@ static struct kvm_stats_debugfs_item { { "signal_exits", STAT_OFFSET(signal_exits) }, { "irq_window", STAT_OFFSET(irq_window_exits) }, { "halt_exits", STAT_OFFSET(halt_exits) }, + { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, { "request_irq", STAT_OFFSET(request_irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) }, { "light_exits", STAT_OFFSET(light_exits) }, @@ -84,10 +89,17 @@ static struct dentry *debugfs_dir; #define MAX_IO_MSRS 256 -#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL -#define LMSW_GUEST_MASK 0x0eULL -#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) -#define CR8_RESEVED_BITS (~0x0fULL) +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define EFER_RESERVED_BITS 0xfffffffffffff2fe #ifdef CONFIG_X86_64 @@ -139,82 +151,14 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, - void *dest) -{ - unsigned char *host_buf = dest; - unsigned long req_size = size; - - while (size) { - hpa_t paddr; - unsigned now; - unsigned offset; - hva_t guest_buf; - - paddr = gva_to_hpa(vcpu, addr); - - if (is_error_hpa(paddr)) - break; - - guest_buf = (hva_t)kmap_atomic( - pfn_to_page(paddr >> PAGE_SHIFT), - KM_USER0); - offset = addr & ~PAGE_MASK; - guest_buf |= offset; - now = min(size, PAGE_SIZE - offset); - memcpy(host_buf, (void*)guest_buf, now); - host_buf += now; - addr += now; - size -= now; - kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); - } - return req_size - size; -} -EXPORT_SYMBOL_GPL(kvm_read_guest); - -int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, - void *data) -{ - unsigned char *host_buf = data; - unsigned long req_size = size; - - while (size) { - hpa_t paddr; - unsigned now; - unsigned offset; - hva_t guest_buf; - gfn_t gfn; - - paddr = gva_to_hpa(vcpu, addr); - - if (is_error_hpa(paddr)) - break; - - gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT; - mark_page_dirty(vcpu->kvm, gfn); - guest_buf = (hva_t)kmap_atomic( - pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); - offset = addr & ~PAGE_MASK; - guest_buf |= offset; - now = min(size, PAGE_SIZE - offset); - memcpy((void*)guest_buf, host_buf, now); - host_buf += now; - addr += now; - size -= now; - kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); - } - return req_size - size; -} -EXPORT_SYMBOL_GPL(kvm_write_guest); - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); } EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); @@ -224,8 +168,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); @@ -234,13 +178,21 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); */ static void vcpu_load(struct kvm_vcpu *vcpu) { + int cpu; + mutex_lock(&vcpu->mutex); - kvm_arch_ops->vcpu_load(vcpu); + cpu = get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_x86_ops->vcpu_load(vcpu, cpu); + put_cpu(); } static void vcpu_put(struct kvm_vcpu *vcpu) { - kvm_arch_ops->vcpu_put(vcpu); + preempt_disable(); + kvm_x86_ops->vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); mutex_unlock(&vcpu->mutex); } @@ -261,8 +213,10 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) atomic_set(&completed, 0); cpus_clear(cpus); needed = 0; - for (i = 0; i < kvm->nvcpus; ++i) { - vcpu = &kvm->vcpus[i]; + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) continue; cpu = vcpu->cpu; @@ -286,37 +240,79 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) } } +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->mmu.root_hpa = INVALID_PAGE; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + if (!irqchip_in_kernel(kvm) || id == 0) + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + else + vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; + init_waitqueue_head(&vcpu->wq); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail_free_run; + } + vcpu->pio_data = page_address(page); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + return 0; + +fail_free_pio_data: + free_page((unsigned long)vcpu->pio_data); +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mmu_destroy(vcpu); + if (vcpu->apic) + hrtimer_cancel(&vcpu->apic->timer.dev); + kvm_free_apic(vcpu->apic); + free_page((unsigned long)vcpu->pio_data); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + static struct kvm *kvm_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - int i; if (!kvm) return ERR_PTR(-ENOMEM); kvm_io_bus_init(&kvm->pio_bus); - spin_lock_init(&kvm->lock); + mutex_init(&kvm->lock); INIT_LIST_HEAD(&kvm->active_mmu_pages); kvm_io_bus_init(&kvm->mmio_bus); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - struct kvm_vcpu *vcpu = &kvm->vcpus[i]; - - mutex_init(&vcpu->mutex); - vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->mmu.root_hpa = INVALID_PAGE; - } spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); return kvm; } -static int kvm_dev_open(struct inode *inode, struct file *filp) -{ - return 0; -} - /* * Free any memory in @free but not in @dont. */ @@ -353,7 +349,7 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < 2; ++i) + for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) if (vcpu->pio.guest_pages[i]) { __free_page(vcpu->pio.guest_pages[i]); vcpu->pio.guest_pages[i] = NULL; @@ -362,30 +358,11 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - if (!vcpu->vmcs) - return; - vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } -static void kvm_free_vcpu(struct kvm_vcpu *vcpu) -{ - if (!vcpu->vmcs) - return; - - vcpu_load(vcpu); - kvm_mmu_destroy(vcpu); - vcpu_put(vcpu); - kvm_arch_ops->vcpu_free(vcpu); - free_page((unsigned long)vcpu->run); - vcpu->run = NULL; - free_page((unsigned long)vcpu->pio_data); - vcpu->pio_data = NULL; - free_pio_guest_pages(vcpu); -} - static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; @@ -394,14 +371,15 @@ static void kvm_free_vcpus(struct kvm *kvm) * Unpin any mmu pages first. */ for (i = 0; i < KVM_MAX_VCPUS; ++i) - kvm_unload_vcpu_mmu(&kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) - kvm_free_vcpu(&kvm->vcpus[i]); -} + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_x86_ops->vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } -static int kvm_dev_release(struct inode *inode, struct file *filp) -{ - return 0; } static void kvm_destroy_vm(struct kvm *kvm) @@ -411,6 +389,8 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_unlock(&kvm_lock); kvm_io_bus_destroy(&kvm->pio_bus); kvm_io_bus_destroy(&kvm->mmio_bus); + kfree(kvm->vpic); + kfree(kvm->vioapic); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); kfree(kvm); @@ -426,7 +406,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) static void inject_gp(struct kvm_vcpu *vcpu) { - kvm_arch_ops->inject_gp(vcpu, 0); + kvm_x86_ops->inject_gp(vcpu, 0); } /* @@ -437,58 +417,60 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; - u64 pdpte; u64 *pdpt; int ret; struct page *page; + u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); page = gfn_to_page(vcpu->kvm, pdpt_gfn); - /* FIXME: !page - emulate? 0xff? */ + if (!page) { + ret = 0; + goto out; + } + pdpt = kmap_atomic(page, KM_USER0); + memcpy(pdpte, pdpt+offset, sizeof(pdpte)); + kunmap_atomic(pdpt, KM_USER0); - ret = 1; - for (i = 0; i < 4; ++i) { - pdpte = pdpt[offset + i]; - if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) { + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; goto out; } } + ret = 1; - for (i = 0; i < 4; ++i) - vcpu->pdptrs[i] = pdpt[offset + i]; - + memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); out: - kunmap_atomic(pdpt, KM_USER0); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return ret; } void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESEVED_BITS) { + if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", cr0, vcpu->cr0); inject_gp(vcpu); return; } - if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); inject_gp(vcpu); return; } - if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); inject_gp(vcpu); return; } - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->shadow_efer & EFER_LME)) { int cs_db, cs_l; @@ -499,7 +481,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) inject_gp(vcpu); return; } - kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while CS.L == 1\n"); @@ -518,12 +500,12 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } - kvm_arch_ops->set_cr0(vcpu, cr0); + kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->cr0 = cr0; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); @@ -536,62 +518,72 @@ EXPORT_SYMBOL_GPL(lmsw); void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & CR4_RESEVED_BITS) { + if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); inject_gp(vcpu); return; } if (is_long_mode(vcpu)) { - if (!(cr4 & CR4_PAE_MASK)) { + if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); inject_gp(vcpu); return; } - } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); inject_gp(vcpu); + return; } - if (cr4 & CR4_VMXE_MASK) { + if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); inject_gp(vcpu); return; } - kvm_arch_ops->set_cr4(vcpu, cr4); - spin_lock(&vcpu->kvm->lock); + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->cr4 = cr4; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESEVED_BITS) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; } } else { - if (cr3 & CR3_RESEVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - inject_gp(vcpu); - return; - } - if (is_paging(vcpu) && is_pae(vcpu) && - !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); - inject_gp(vcpu); - return; + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_NONPAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); + return; + } } } - vcpu->cr3 = cr3; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -603,46 +595,73 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) inject_gp(vcpu); - else + else { + vcpu->cr3 = cr3; vcpu->mmu.new_cr3(vcpu); - spin_unlock(&vcpu->kvm->lock); + } + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if ( cr8 & CR8_RESEVED_BITS) { + if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); inject_gp(vcpu); return; } - vcpu->cr8 = cr8; + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->cr8 = cr8; } EXPORT_SYMBOL_GPL(set_cr8); -void fx_init(struct kvm_vcpu *vcpu) +unsigned long get_cr8(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else + return vcpu->cr8; +} +EXPORT_SYMBOL_GPL(get_cr8); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - struct __attribute__ ((__packed__)) fx_image_s { - u16 control; //fcw - u16 status; //fsw - u16 tag; // ftw - u16 opcode; //fop - u64 ip; // fpu ip - u64 operand;// fpu dp - u32 mxcsr; - u32 mxcsr_mask; + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->apic_base; + else + return vcpu->apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); - } *fx_image; +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); + +void fx_init(struct kvm_vcpu *vcpu) +{ + unsigned after_mxcsr_mask; - fx_save(vcpu->host_fx_image); + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->host_fx_image); fpu_init(); - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); + preempt_enable(); - fx_image = (struct fx_image_s *)vcpu->guest_fx_image; - fx_image->mxcsr = 0x1f80; - memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), - 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); + vcpu->cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); @@ -661,7 +680,6 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, unsigned long i; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; - int memory_config_version; r = -EINVAL; /* General sanity checks */ @@ -681,10 +699,8 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, if (!npages) mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -raced: - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); - memory_config_version = kvm->memory_config_version; new = old = *memslot; new.base_gfn = base_gfn; @@ -707,11 +723,6 @@ raced: (base_gfn >= s->base_gfn + s->npages))) goto out_unlock; } - /* - * Do memory allocations outside lock. memory_config_version will - * detect any races. - */ - spin_unlock(&kvm->lock); /* Deallocate if slot is being removed */ if (!npages) @@ -728,14 +739,14 @@ raced: new.phys_mem = vmalloc(npages * sizeof(struct page *)); if (!new.phys_mem) - goto out_free; + goto out_unlock; memset(new.phys_mem, 0, npages * sizeof(struct page *)); for (i = 0; i < npages; ++i) { new.phys_mem[i] = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!new.phys_mem[i]) - goto out_free; + goto out_unlock; set_page_private(new.phys_mem[i],0); } } @@ -746,39 +757,25 @@ raced: new.dirty_bitmap = vmalloc(dirty_bytes); if (!new.dirty_bitmap) - goto out_free; + goto out_unlock; memset(new.dirty_bitmap, 0, dirty_bytes); } - spin_lock(&kvm->lock); - - if (memory_config_version != kvm->memory_config_version) { - spin_unlock(&kvm->lock); - kvm_free_physmem_slot(&new, &old); - goto raced; - } - - r = -EAGAIN; - if (kvm->busy) - goto out_unlock; - if (mem->slot >= kvm->nmemslots) kvm->nmemslots = mem->slot + 1; *memslot = new; - ++kvm->memory_config_version; kvm_mmu_slot_remove_write_access(kvm, mem->slot); kvm_flush_remote_tlbs(kvm); - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); kvm_free_physmem_slot(&old, &new); return 0; out_unlock: - spin_unlock(&kvm->lock); -out_free: + mutex_unlock(&kvm->lock); kvm_free_physmem_slot(&new, &old); out: return r; @@ -795,14 +792,8 @@ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, int n; unsigned long any = 0; - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); - /* - * Prevent changes to guest memory configuration even while the lock - * is not taken. - */ - ++kvm->busy; - spin_unlock(&kvm->lock); r = -EINVAL; if (log->slot >= KVM_MEMORY_SLOTS) goto out; @@ -821,18 +812,17 @@ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) goto out; - spin_lock(&kvm->lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memset(memslot->dirty_bitmap, 0, n); - spin_unlock(&kvm->lock); + /* If nothing is dirty, don't bother messing with page tables. */ + if (any) { + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memset(memslot->dirty_bitmap, 0, n); + } r = 0; out: - spin_lock(&kvm->lock); - --kvm->busy; - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); return r; } @@ -862,7 +852,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - spin_lock(&kvm->lock); + mutex_lock(&kvm->lock); p = &kvm->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -876,7 +866,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, kvm_mmu_zap_all(kvm); - spin_unlock(&kvm->lock); + mutex_unlock(&kvm->lock); return 0; @@ -884,6 +874,63 @@ out: return r; } +static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (&chip->chip.ioapic, + ioapic_irqchip(kvm), + sizeof(struct kvm_ioapic_state)); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&pic_irqchip(kvm)->pics[0], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&pic_irqchip(kvm)->pics[1], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (ioapic_irqchip(kvm), + &chip->chip.ioapic, + sizeof(struct kvm_ioapic_state)); + break; + default: + r = -EINVAL; + break; + } + kvm_pic_update_irq(pic_irqchip(kvm)); + return r; +} + static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; @@ -930,37 +977,26 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); +/* WARNING: Does not work on aliased pages. */ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - int i; struct kvm_memory_slot *memslot; - unsigned long rel_gfn; - for (i = 0; i < kvm->nmemslots; ++i) { - memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) { + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; - if (!memslot->dirty_bitmap) - return; - - rel_gfn = gfn - memslot->base_gfn; - - /* avoid RMW */ - if (!test_bit(rel_gfn, memslot->dirty_bitmap)) - set_bit(rel_gfn, memslot->dirty_bitmap); - return; - } + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); } } -static int emulator_read_std(unsigned long addr, +int emulator_read_std(unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; void *data = val; while (bytes) { @@ -990,26 +1026,42 @@ static int emulator_read_std(unsigned long addr, return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(emulator_read_std); static int emulator_write_std(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", - addr, bytes); + pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); return X86EMUL_UNHANDLEABLE; } +/* + * Only apic need an MMIO device hook, so shortcut now.. + */ +static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + struct kvm_io_device *dev; + + if (vcpu->apic) { + dev = &vcpu->apic->dev; + if (dev->in_range(dev, addr)) + return dev; + } + return NULL; +} + static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, gpa_t addr) { - /* - * Note that its important to have this wrapper function because - * in the very near future we will be checking for MMIOs against - * the LAPIC as well as the general MMIO bus - */ - return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + struct kvm_io_device *dev; + + dev = vcpu_find_pervcpu_dev(vcpu, addr); + if (dev == NULL) + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + return dev; } static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, @@ -1021,9 +1073,8 @@ static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_io_device *mmio_dev; gpa_t gpa; @@ -1031,7 +1082,7 @@ static int emulator_read_emulated(unsigned long addr, memcpy(val, vcpu->mmio_data, bytes); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; - } else if (emulator_read_std(addr, val, bytes, ctxt) + } else if (emulator_read_std(addr, val, bytes, vcpu) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -1061,7 +1112,6 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { struct page *page; void *virt; - unsigned offset = offset_in_page(gpa); if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) return 0; @@ -1070,7 +1120,7 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, return 0; mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); virt = kmap_atomic(page, KM_USER0); - kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); memcpy(virt + offset_in_page(gpa), val, bytes); kunmap_atomic(virt, KM_USER0); return 1; @@ -1079,14 +1129,13 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_io_device *mmio_dev; gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA) { - kvm_arch_ops->inject_page_fault(vcpu, addr, 2); + kvm_x86_ops->inject_page_fault(vcpu, addr, 2); return X86EMUL_PROPAGATE_FAULT; } @@ -1111,31 +1160,32 @@ static int emulator_write_emulated_onepage(unsigned long addr, return X86EMUL_CONTINUE; } -static int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, ctxt); + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, ctxt); + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } +EXPORT_SYMBOL_GPL(emulator_write_emulated); static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_emulate_ctxt *ctxt) + struct kvm_vcpu *vcpu) { static int reported; @@ -1143,12 +1193,12 @@ static int emulator_cmpxchg_emulated(unsigned long addr, reported = 1; printk(KERN_WARNING "kvm: emulating exchange as write\n"); } - return emulator_write_emulated(addr, new, bytes, ctxt); + return emulator_write_emulated(addr, new, bytes, vcpu); } static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_arch_ops->get_segment_base(vcpu, seg); + return kvm_x86_ops->get_segment_base(vcpu, seg); } int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) @@ -1158,10 +1208,8 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - unsigned long cr0; - - cr0 = vcpu->cr0 & ~CR0_TS_MASK; - kvm_arch_ops->set_cr0(vcpu, cr0); + vcpu->cr0 &= ~X86_CR0_TS; + kvm_x86_ops->set_cr0(vcpu, vcpu->cr0); return X86EMUL_CONTINUE; } @@ -1171,11 +1219,10 @@ int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) switch (dr) { case 0 ... 3: - *dest = kvm_arch_ops->get_dr(vcpu, dr); + *dest = kvm_x86_ops->get_dr(vcpu, dr); return X86EMUL_CONTINUE; default: - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __FUNCTION__, dr); + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); return X86EMUL_UNHANDLEABLE; } } @@ -1185,7 +1232,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; int exception; - kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); if (exception) { /* FIXME: better handling */ return X86EMUL_UNHANDLEABLE; @@ -1193,25 +1240,25 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) return X86EMUL_CONTINUE; } -static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { static int reported; u8 opcodes[4]; - unsigned long rip = ctxt->vcpu->rip; + unsigned long rip = vcpu->rip; unsigned long rip_linear; - rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); if (reported) return; - emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); - printk(KERN_ERR "emulation failed but !mmio_needed?" - " rip %lx %02x %02x %02x %02x\n", - rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); reported = 1; } +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); struct x86_emulate_ops emulate_ops = { .read_std = emulator_read_std, @@ -1231,12 +1278,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int cs_db, cs_l; vcpu->mmio_fault_cr2 = cr2; - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); - kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); emulate_ctxt.vcpu = vcpu; - emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); + emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); emulate_ctxt.cr2 = cr2; emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_REAL : cs_l @@ -1259,9 +1306,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu, emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); vcpu->mmio_is_write = 0; + vcpu->pio.string = 0; r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); + if (vcpu->pio.string) + return EMULATE_DO_MMIO; if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = vcpu->mmio_phys_addr; memcpy(run->mmio.data, vcpu->mmio_data, 8); run->mmio.len = vcpu->mmio_size; @@ -1272,14 +1323,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; if (!vcpu->mmio_needed) { - report_emulation_failure(&emulate_ctxt); + kvm_report_emulation_failure(vcpu, "mmio"); return EMULATE_FAIL; } return EMULATE_DO_MMIO; } - kvm_arch_ops->decache_regs(vcpu); - kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); if (vcpu->mmio_is_write) { vcpu->mmio_needed = 0; @@ -1290,14 +1341,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); -int kvm_emulate_halt(struct kvm_vcpu *vcpu) +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. + */ +static void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - if (vcpu->irq_summary) - return 1; + DECLARE_WAITQUEUE(wait, current); - vcpu->run->exit_reason = KVM_EXIT_HLT; + add_wait_queue(&vcpu->wq, &wait); + + /* + * We will block until either an interrupt or a signal wakes us up + */ + while (!kvm_cpu_has_interrupt(vcpu) + && !signal_pending(current) + && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE + && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { + set_current_state(TASK_INTERRUPTIBLE); + vcpu_put(vcpu); + schedule(); + vcpu_load(vcpu); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&vcpu->wq, &wait); +} + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ ++vcpu->stat.halt_exits; - return 0; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->mp_state = VCPU_MP_STATE_HALTED; + kvm_vcpu_block(vcpu); + if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) + return -EINTR; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; + } } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -1305,7 +1387,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); ret = -KVM_EINVAL; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1329,6 +1411,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) } switch (nr) { default: + run->hypercall.nr = nr; run->hypercall.args[0] = a0; run->hypercall.args[1] = a1; run->hypercall.args[2] = a2; @@ -1337,11 +1420,11 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) run->hypercall.args[5] = a5; run->hypercall.ret = ret; run->hypercall.longmode = is_long_mode(vcpu); - kvm_arch_ops->decache_regs(vcpu); + kvm_x86_ops->decache_regs(vcpu); return 0; } vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_x86_ops->decache_regs(vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_hypercall); @@ -1355,26 +1438,26 @@ void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { struct descriptor_table dt = { limit, base }; - kvm_arch_ops->set_gdt(vcpu, &dt); + kvm_x86_ops->set_gdt(vcpu, &dt); } void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { struct descriptor_table dt = { limit, base }; - kvm_arch_ops->set_idt(vcpu, &dt); + kvm_x86_ops->set_idt(vcpu, &dt); } void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long *rflags) { lmsw(vcpu, msw); - *rflags = kvm_arch_ops->get_rflags(vcpu); + *rflags = kvm_x86_ops->get_rflags(vcpu); } unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { - kvm_arch_ops->decache_cr4_guest_bits(vcpu); + kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: return vcpu->cr0; @@ -1396,7 +1479,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, switch (cr) { case 0: set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); - *rflags = kvm_arch_ops->get_rflags(vcpu); + *rflags = kvm_x86_ops->get_rflags(vcpu); break; case 2: vcpu->cr2 = val; @@ -1439,7 +1522,7 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); - para_state = kmap_atomic(para_state_page, KM_USER0); + para_state = kmap(para_state_page); printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); printk(KERN_DEBUG ".... size: %d\n", para_state->size); @@ -1470,12 +1553,12 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), KM_USER1) + (hypercall_hpa & ~PAGE_MASK); - kvm_arch_ops->patch_hypercall(vcpu, hypercall); + kvm_x86_ops->patch_hypercall(vcpu, hypercall); kunmap_atomic(hypercall, KM_USER1); para_state->ret = 0; err_kunmap_skip: - kunmap_atomic(para_state, KM_USER0); + kunmap(para_state_page); return 0; err_gp: return 1; @@ -1511,7 +1594,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = 3; break; case MSR_IA32_APICBASE: - data = vcpu->apic_base; + data = kvm_get_apic_base(vcpu); break; case MSR_IA32_MISC_ENABLE: data = vcpu->ia32_misc_enable_msr; @@ -1522,7 +1605,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) break; #endif default: - printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr); + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -1537,7 +1620,7 @@ EXPORT_SYMBOL_GPL(kvm_get_msr_common); */ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } #ifdef CONFIG_X86_64 @@ -1558,7 +1641,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) return; } - kvm_arch_ops->set_efer(vcpu, efer); + kvm_x86_ops->set_efer(vcpu, efer); efer &= ~EFER_LMA; efer |= vcpu->shadow_efer & EFER_LMA; @@ -1577,11 +1660,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; #endif case MSR_IA32_MC0_STATUS: - printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", __FUNCTION__, data); break; case MSR_IA32_MCG_STATUS: - printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", __FUNCTION__, data); break; case MSR_IA32_UCODE_REV: @@ -1589,7 +1672,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case 0x200 ... 0x2ff: /* MTRRs */ break; case MSR_IA32_APICBASE: - vcpu->apic_base = data; + kvm_set_apic_base(vcpu, data); break; case MSR_IA32_MISC_ENABLE: vcpu->ia32_misc_enable_msr = data; @@ -1601,7 +1684,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return vcpu_register_para(vcpu, data); default: - printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr); + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); return 1; } return 0; @@ -1615,44 +1698,24 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common); */ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - return kvm_arch_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr_index, data); } void kvm_resched(struct kvm_vcpu *vcpu) { if (!need_resched()) return; - vcpu_put(vcpu); cond_resched(); - vcpu_load(vcpu); } EXPORT_SYMBOL_GPL(kvm_resched); -void load_msrs(struct vmx_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} -EXPORT_SYMBOL_GPL(load_msrs); - -void save_msrs(struct vmx_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} -EXPORT_SYMBOL_GPL(save_msrs); - void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { int i; u32 function; struct kvm_cpuid_entry *e, *best; - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); function = vcpu->regs[VCPU_REGS_RAX]; vcpu->regs[VCPU_REGS_RAX] = 0; vcpu->regs[VCPU_REGS_RBX] = 0; @@ -1678,8 +1741,8 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) vcpu->regs[VCPU_REGS_RCX] = best->ecx; vcpu->regs[VCPU_REGS_RDX] = best->edx; } - kvm_arch_ops->decache_regs(vcpu); - kvm_arch_ops->skip_emulated_instruction(vcpu); + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -1690,11 +1753,9 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) unsigned bytes; int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; - kvm_arch_ops->vcpu_put(vcpu); q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL); if (!q) { - kvm_arch_ops->vcpu_load(vcpu); free_pio_guest_pages(vcpu); return -ENOMEM; } @@ -1706,7 +1767,6 @@ static int pio_copy_data(struct kvm_vcpu *vcpu) memcpy(p, q, bytes); q -= vcpu->pio.guest_page_offset; vunmap(q); - kvm_arch_ops->vcpu_load(vcpu); free_pio_guest_pages(vcpu); return 0; } @@ -1717,7 +1777,7 @@ static int complete_pio(struct kvm_vcpu *vcpu) long delta; int r; - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); if (!io->string) { if (io->in) @@ -1727,7 +1787,7 @@ static int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) { - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); return r; } } @@ -1750,79 +1810,109 @@ static int complete_pio(struct kvm_vcpu *vcpu) vcpu->regs[VCPU_REGS_RSI] += delta; } - kvm_arch_ops->decache_regs(vcpu); + kvm_x86_ops->decache_regs(vcpu); io->count -= io->cur_count; io->cur_count = 0; - if (!io->count) - kvm_arch_ops->skip_emulated_instruction(vcpu); return 0; } -void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) +static void kernel_pio(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu, + void *pd) { /* TODO: String I/O for in kernel device */ + mutex_lock(&vcpu->kvm->lock); if (vcpu->pio.in) kvm_iodevice_read(pio_dev, vcpu->pio.port, vcpu->pio.size, - vcpu->pio_data); + pd); else kvm_iodevice_write(pio_dev, vcpu->pio.port, vcpu->pio.size, - vcpu->pio_data); + pd); + mutex_unlock(&vcpu->kvm->lock); +} + +static void pio_string_write(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->pio; + void *pd = vcpu->pio_data; + int i; + + mutex_lock(&vcpu->kvm->lock); + for (i = 0; i < io->cur_count; i++) { + kvm_iodevice_write(pio_dev, io->port, + io->size, + pd); + pd += io->size; + } + mutex_unlock(&vcpu->kvm->lock); } -int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned long count, int string, int down, +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned port) +{ + struct kvm_io_device *pio_dev; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; + vcpu->run->io.port = vcpu->pio.port = port; + vcpu->pio.in = in; + vcpu->pio.string = 0; + vcpu->pio.down = 0; + vcpu->pio.guest_page_offset = 0; + vcpu->pio.rep = 0; + + kvm_x86_ops->cache_regs(vcpu); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); + kvm_x86_ops->decache_regs(vcpu); + + kvm_x86_ops->skip_emulated_instruction(vcpu); + + pio_dev = vcpu_find_pio_dev(vcpu, port); + if (pio_dev) { + kernel_pio(pio_dev, vcpu, vcpu->pio_data); + complete_pio(vcpu); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio); + +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int down, gva_t address, int rep, unsigned port) { unsigned now, in_page; - int i; + int i, ret = 0; int nr_pages = 1; struct page *page; struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; + vcpu->run->io.size = vcpu->pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - vcpu->pio.count = count; - vcpu->pio.cur_count = count; - vcpu->pio.size = size; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; + vcpu->run->io.port = vcpu->pio.port = port; vcpu->pio.in = in; - vcpu->pio.port = port; - vcpu->pio.string = string; + vcpu->pio.string = 1; vcpu->pio.down = down; vcpu->pio.guest_page_offset = offset_in_page(address); vcpu->pio.rep = rep; - pio_dev = vcpu_find_pio_dev(vcpu, port); - if (!string) { - kvm_arch_ops->cache_regs(vcpu); - memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); - kvm_arch_ops->decache_regs(vcpu); - if (pio_dev) { - kernel_pio(pio_dev, vcpu); - complete_pio(vcpu); - return 1; - } - return 0; - } - /* TODO: String I/O for in kernel device */ - if (pio_dev) - printk(KERN_ERR "kvm_setup_pio: no string io support\n"); - if (!count) { - kvm_arch_ops->skip_emulated_instruction(vcpu); + kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; } - now = min(count, PAGE_SIZE / size); - if (!down) in_page = PAGE_SIZE - offset_in_page(address); else @@ -1841,20 +1931,23 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, /* * String I/O in reverse. Yuck. Kill the guest, fix later. */ - printk(KERN_ERR "kvm: guest string pio down\n"); + pr_unimpl(vcpu, "guest string pio down\n"); inject_gp(vcpu); return 1; } vcpu->run->io.count = now; vcpu->pio.cur_count = now; + if (vcpu->pio.cur_count == vcpu->pio.count) + kvm_x86_ops->skip_emulated_instruction(vcpu); + for (i = 0; i < nr_pages; ++i) { - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); if (page) get_page(page); vcpu->pio.guest_pages[i] = page; - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); if (!page) { inject_gp(vcpu); free_pio_guest_pages(vcpu); @@ -1862,11 +1955,145 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } } - if (!vcpu->pio.in) - return pio_copy_data(vcpu); - return 0; + pio_dev = vcpu_find_pio_dev(vcpu, port); + if (!vcpu->pio.in) { + /* string PIO write */ + ret = pio_copy_data(vcpu); + if (ret >= 0 && pio_dev) { + pio_string_write(pio_dev, vcpu); + complete_pio(vcpu); + if (vcpu->pio.count == 0) + ret = 1; + } + } else if (pio_dev) + pr_unimpl(vcpu, "no string pio read support yet, " + "port %x size %d count %ld\n", + port, size, count); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); + +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->irq_summary && + kvm_run->request_interrupt_window && + vcpu->interrupt_window_open && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; + else + kvm_run->ready_for_interrupt_injection = + (vcpu->interrupt_window_open && + vcpu->irq_summary == 0); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + printk("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->sipi_vector); + kvm_lapic_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu); + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + } + +preempted: + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); + +again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + preempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_load_guest_fpu(vcpu); + + local_irq_disable(); + + if (signal_pending(current)) { + local_irq_enable(); + preempt_enable(); + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; + } + + if (irqchip_in_kernel(vcpu->kvm)) + kvm_x86_ops->inject_pending_irq(vcpu); + else if (!vcpu->mmio_read_completed) + kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + + vcpu->guest_mode = 1; + + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + + kvm_x86_ops->run(vcpu, kvm_run); + + vcpu->guest_mode = 0; + local_irq_enable(); + + ++vcpu->stat.exits; + + preempt_enable(); + + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + kvm_x86_ops->cache_regs(vcpu); + profile_hit(KVM_PROFILING, (void *)vcpu->rip); + } + + r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) { + ++vcpu->stat.light_exits; + goto again; + } + } + +out: + if (r > 0) { + kvm_resched(vcpu); + goto preempted; + } + + post_kvm_run_save(vcpu, kvm_run); + + return r; } -EXPORT_SYMBOL_GPL(kvm_setup_pio); + static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { @@ -1875,11 +2102,18 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + kvm_vcpu_block(vcpu); + vcpu_put(vcpu); + return -EAGAIN; + } + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); /* re-sync apic's tpr */ - vcpu->cr8 = kvm_run->cr8; + if (!irqchip_in_kernel(vcpu->kvm)) + set_cr8(vcpu, kvm_run->cr8); if (vcpu->pio.cur_count) { r = complete_pio(vcpu); @@ -1897,19 +2131,18 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* * Read-modify-write. Back to userspace. */ - kvm_run->exit_reason = KVM_EXIT_MMIO; r = 0; goto out; } } if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_x86_ops->decache_regs(vcpu); } - r = kvm_arch_ops->run(vcpu, kvm_run); + r = __vcpu_run(vcpu, kvm_run); out: if (vcpu->sigset_active) @@ -1924,7 +2157,7 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, { vcpu_load(vcpu); - kvm_arch_ops->cache_regs(vcpu); + kvm_x86_ops->cache_regs(vcpu); regs->rax = vcpu->regs[VCPU_REGS_RAX]; regs->rbx = vcpu->regs[VCPU_REGS_RBX]; @@ -1946,7 +2179,7 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, #endif regs->rip = vcpu->rip; - regs->rflags = kvm_arch_ops->get_rflags(vcpu); + regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* * Don't leak debug flags in case they were set for guest debugging @@ -1984,9 +2217,9 @@ static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, #endif vcpu->rip = regs->rip; - kvm_arch_ops->set_rflags(vcpu, regs->rflags); + kvm_x86_ops->set_rflags(vcpu, regs->rflags); - kvm_arch_ops->decache_regs(vcpu); + kvm_x86_ops->decache_regs(vcpu); vcpu_put(vcpu); @@ -1996,13 +2229,14 @@ static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, static void get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - return kvm_arch_ops->get_segment(vcpu, var, seg); + return kvm_x86_ops->get_segment(vcpu, var, seg); } static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct descriptor_table dt; + int pending_vec; vcpu_load(vcpu); @@ -2016,24 +2250,31 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_arch_ops->get_idt(vcpu, &dt); + kvm_x86_ops->get_idt(vcpu, &dt); sregs->idt.limit = dt.limit; sregs->idt.base = dt.base; - kvm_arch_ops->get_gdt(vcpu, &dt); + kvm_x86_ops->get_gdt(vcpu, &dt); sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_arch_ops->decache_cr4_guest_bits(vcpu); + kvm_x86_ops->decache_cr4_guest_bits(vcpu); sregs->cr0 = vcpu->cr0; sregs->cr2 = vcpu->cr2; sregs->cr3 = vcpu->cr3; sregs->cr4 = vcpu->cr4; - sregs->cr8 = vcpu->cr8; + sregs->cr8 = get_cr8(vcpu); sregs->efer = vcpu->shadow_efer; - sregs->apic_base = vcpu->apic_base; - - memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, - sizeof sregs->interrupt_bitmap); + sregs->apic_base = kvm_get_apic_base(vcpu); + + if (irqchip_in_kernel(vcpu->kvm)) { + memset(sregs->interrupt_bitmap, 0, + sizeof sregs->interrupt_bitmap); + pending_vec = kvm_x86_ops->get_irq(vcpu); + if (pending_vec >= 0) + set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); + } else + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, + sizeof sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2043,56 +2284,69 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, static void set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - return kvm_arch_ops->set_segment(vcpu, var, seg); + return kvm_x86_ops->set_segment(vcpu, var, seg); } static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int i; + int i, pending_vec, max_bits; struct descriptor_table dt; vcpu_load(vcpu); dt.limit = sregs->idt.limit; dt.base = sregs->idt.base; - kvm_arch_ops->set_idt(vcpu, &dt); + kvm_x86_ops->set_idt(vcpu, &dt); dt.limit = sregs->gdt.limit; dt.base = sregs->gdt.base; - kvm_arch_ops->set_gdt(vcpu, &dt); + kvm_x86_ops->set_gdt(vcpu, &dt); vcpu->cr2 = sregs->cr2; mmu_reset_needed |= vcpu->cr3 != sregs->cr3; vcpu->cr3 = sregs->cr3; - vcpu->cr8 = sregs->cr8; + set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; #ifdef CONFIG_X86_64 - kvm_arch_ops->set_efer(vcpu, sregs->efer); + kvm_x86_ops->set_efer(vcpu, sregs->efer); #endif - vcpu->apic_base = sregs->apic_base; + kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_arch_ops->decache_cr4_guest_bits(vcpu); + kvm_x86_ops->decache_cr4_guest_bits(vcpu); mmu_reset_needed |= vcpu->cr0 != sregs->cr0; - kvm_arch_ops->set_cr0(vcpu, sregs->cr0); + vcpu->cr0 = sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); mmu_reset_needed |= vcpu->cr4 != sregs->cr4; - kvm_arch_ops->set_cr4(vcpu, sregs->cr4); + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) load_pdptrs(vcpu, vcpu->cr3); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); - memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->irq_pending); - vcpu->irq_summary = 0; - for (i = 0; i < NR_IRQ_WORDS; ++i) - if (vcpu->irq_pending[i]) - __set_bit(i, &vcpu->irq_summary); + if (!irqchip_in_kernel(vcpu->kvm)) { + memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->irq_pending); + vcpu->irq_summary = 0; + for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) + if (vcpu->irq_pending[i]) + __set_bit(i, &vcpu->irq_summary); + } else { + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, + max_bits); + /* Only pending external irq is handled here */ + if (pending_vec < max_bits) { + kvm_x86_ops->set_irq(vcpu, pending_vec); + printk("Set back pending irq %d\n", pending_vec); + } + } set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -2109,6 +2363,16 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. @@ -2236,13 +2500,13 @@ static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; vcpu_load(vcpu); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); vcpu_put(vcpu); return 0; @@ -2253,6 +2517,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq < 0 || irq->irq >= 256) return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; vcpu_load(vcpu); set_bit(irq->irq, vcpu->irq_pending); @@ -2270,7 +2536,7 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - r = kvm_arch_ops->set_guest_debug(vcpu, dbg); + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); vcpu_put(vcpu); @@ -2285,7 +2551,6 @@ static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, unsigned long pgoff; struct page *page; - *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; if (pgoff == 0) page = virt_to_page(vcpu->run); @@ -2294,6 +2559,9 @@ static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, else return NOPAGE_SIGBUS; get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + return page; } @@ -2346,74 +2614,52 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) { int r; struct kvm_vcpu *vcpu; - struct page *page; - r = -EINVAL; if (!valid_vcpu(n)) - goto out; - - vcpu = &kvm->vcpus[n]; - - mutex_lock(&vcpu->mutex); - - if (vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); - return -EEXIST; - } - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - r = -ENOMEM; - if (!page) - goto out_unlock; - vcpu->run = page_address(page); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - r = -ENOMEM; - if (!page) - goto out_free_run; - vcpu->pio_data = page_address(page); + return -EINVAL; - vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, - FX_IMAGE_ALIGN); - vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; - vcpu->cr0 = 0x10; + vcpu = kvm_x86_ops->vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); - r = kvm_arch_ops->vcpu_create(vcpu); - if (r < 0) - goto out_free_vcpus; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_mmu_create(vcpu); - if (r < 0) - goto out_free_vcpus; + /* We do fxsave: this must be aligned. */ + BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); - kvm_arch_ops->vcpu_load(vcpu); + vcpu_load(vcpu); r = kvm_mmu_setup(vcpu); - if (r >= 0) - r = kvm_arch_ops->vcpu_setup(vcpu); vcpu_put(vcpu); - if (r < 0) - goto out_free_vcpus; + goto free_vcpu; + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + mutex_unlock(&kvm->lock); + goto mmu_unload; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + + /* Now it's all set up, let userspace reach it */ r = create_vcpu_fd(vcpu); if (r < 0) - goto out_free_vcpus; + goto unlink; + return r; - spin_lock(&kvm_lock); - if (n >= kvm->nvcpus) - kvm->nvcpus = n + 1; - spin_unlock(&kvm_lock); +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; + mutex_unlock(&kvm->lock); - return r; +mmu_unload: + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); -out_free_vcpus: - kvm_free_vcpu(vcpu); -out_free_run: - free_page((unsigned long)vcpu->run); - vcpu->run = NULL; -out_unlock: - mutex_unlock(&vcpu->mutex); -out: +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); return r; } @@ -2493,7 +2739,7 @@ struct fxsave { static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; vcpu_load(vcpu); @@ -2513,7 +2759,7 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; vcpu_load(vcpu); @@ -2531,6 +2777,27 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2700,6 +2967,31 @@ static long kvm_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; + + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2753,6 +3045,75 @@ static long kvm_vm_ioctl(struct file *filp, goto out; break; } + case KVM_CREATE_IRQCHIP: + r = -ENOMEM; + kvm->vpic = kvm_create_pic(kvm); + if (kvm->vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + kfree(kvm->vpic); + kvm->vpic = NULL; + goto out; + } + } + else + goto out; + break; + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + if (irqchip_in_kernel(kvm)) { + mutex_lock(&kvm->lock); + if (irq_event.irq < 16) + kvm_pic_set_irq(pic_irqchip(kvm), + irq_event.irq, + irq_event.level); + kvm_ioapic_set_irq(kvm->vioapic, + irq_event.irq, + irq_event.level); + mutex_unlock(&kvm->lock); + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &chip, sizeof chip)) + goto out; + r = 0; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -2768,12 +3129,14 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma, unsigned long pgoff; struct page *page; - *type = VM_FAULT_MINOR; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = gfn_to_page(kvm, pgoff); if (!page) return NOPAGE_SIGBUS; get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + return page; } @@ -2861,12 +3224,20 @@ static long kvm_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_CHECK_EXTENSION: - /* - * No extensions defined at present. - */ - r = 0; + case KVM_CHECK_EXTENSION: { + int ext = (long)argp; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + r = 1; + break; + default: + r = 0; + break; + } break; + } case KVM_GET_VCPU_MMAP_SIZE: r = -EINVAL; if (arg) @@ -2881,8 +3252,6 @@ out: } static struct file_operations kvm_chardev_ops = { - .open = kvm_dev_open, - .release = kvm_dev_release, .unlocked_ioctl = kvm_dev_ioctl, .compat_ioctl = kvm_dev_ioctl, }; @@ -2893,25 +3262,6 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) -{ - if (val == SYS_RESTART) { - /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - */ - printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); - } - return NOTIFY_OK; -} - -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; - /* * Make sure that a cpu that is being hot-unplugged does not have any vcpus * cached on it. @@ -2925,7 +3275,9 @@ static void decache_vcpus_on_cpu(int cpu) spin_lock(&kvm_lock); list_for_each_entry(vm, &vm_list, vm_list) for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = &vm->vcpus[i]; + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; /* * If the vcpu is locked, then it is running on some * other cpu and therefore it is not cached on the @@ -2936,7 +3288,7 @@ static void decache_vcpus_on_cpu(int cpu) */ if (mutex_trylock(&vcpu->mutex)) { if (vcpu->cpu == cpu) { - kvm_arch_ops->vcpu_decache(vcpu); + kvm_x86_ops->vcpu_decache(vcpu); vcpu->cpu = -1; } mutex_unlock(&vcpu->mutex); @@ -2952,7 +3304,7 @@ static void hardware_enable(void *junk) if (cpu_isset(cpu, cpus_hardware_enabled)) return; cpu_set(cpu, cpus_hardware_enabled); - kvm_arch_ops->hardware_enable(NULL); + kvm_x86_ops->hardware_enable(NULL); } static void hardware_disable(void *junk) @@ -2963,7 +3315,7 @@ static void hardware_disable(void *junk) return; cpu_clear(cpu, cpus_hardware_enabled); decache_vcpus_on_cpu(cpu); - kvm_arch_ops->hardware_disable(NULL); + kvm_x86_ops->hardware_disable(NULL); } static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, @@ -2994,6 +3346,25 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, return NOTIFY_OK; } +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(hardware_disable, NULL, 0, 1); + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, +}; + void kvm_io_bus_init(struct kvm_io_bus *bus) { memset(bus, 0, sizeof(*bus)); @@ -3047,18 +3418,15 @@ static u64 stat_get(void *_offset) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = &kvm->vcpus[i]; - total += *(u32 *)((void *)vcpu + offset); + vcpu = kvm->vcpus[i]; + if (vcpu) + total += *(u32 *)((void *)vcpu + offset); } spin_unlock(&kvm_lock); return total; } -static void stat_set(void *offset, u64 val) -{ -} - -DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); static __init void kvm_init_debug(void) { @@ -3105,11 +3473,34 @@ static struct sys_device kvm_sysdev = { hpa_t bad_page_address; -int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +{ + return container_of(pn, struct kvm_vcpu, preempt_notifier); +} + +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_x86_ops->vcpu_load(vcpu, cpu); +} + +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) +{ + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + + kvm_x86_ops->vcpu_put(vcpu); +} + +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module) { int r; + int cpu; - if (kvm_arch_ops) { + if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); return -EEXIST; } @@ -3123,12 +3514,20 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) return -EOPNOTSUPP; } - kvm_arch_ops = ops; + kvm_x86_ops = ops; - r = kvm_arch_ops->hardware_setup(); + r = kvm_x86_ops->hardware_setup(); if (r < 0) goto out; + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_x86_ops->check_processor_compatibility, + &r, 0, 1); + if (r < 0) + goto out_free_0; + } + on_each_cpu(hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) @@ -3143,6 +3542,14 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) if (r) goto out_free_3; + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), 0, 0); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_4; + } + kvm_chardev_ops.owner = module; r = misc_register(&kvm_dev); @@ -3151,9 +3558,14 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) goto out_free; } + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; + return r; out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_4: sysdev_unregister(&kvm_sysdev); out_free_3: sysdev_class_unregister(&kvm_sysdev_class); @@ -3162,22 +3574,24 @@ out_free_2: unregister_cpu_notifier(&kvm_cpu_notifier); out_free_1: on_each_cpu(hardware_disable, NULL, 0, 1); - kvm_arch_ops->hardware_unsetup(); +out_free_0: + kvm_x86_ops->hardware_unsetup(); out: - kvm_arch_ops = NULL; + kvm_x86_ops = NULL; return r; } -void kvm_exit_arch(void) +void kvm_exit_x86(void) { misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); on_each_cpu(hardware_disable, NULL, 0, 1); - kvm_arch_ops->hardware_unsetup(); - kvm_arch_ops = NULL; + kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops = NULL; } static __init int kvm_init(void) @@ -3220,5 +3634,5 @@ static __exit void kvm_exit(void) module_init(kvm_init) module_exit(kvm_exit) -EXPORT_SYMBOL_GPL(kvm_init_arch); -EXPORT_SYMBOL_GPL(kvm_exit_arch); +EXPORT_SYMBOL_GPL(kvm_init_x86); +EXPORT_SYMBOL_GPL(kvm_exit_x86); diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h index a869983d683d..a0e415daef5b 100644 --- a/drivers/kvm/kvm_svm.h +++ b/drivers/kvm/kvm_svm.h @@ -20,7 +20,10 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) #define NUM_DB_REGS 4 +struct kvm_vcpu; + struct vcpu_svm { + struct kvm_vcpu vcpu; struct vmcb *vmcb; unsigned long vmcb_pa; struct svm_cpu_data *svm_data; diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c new file mode 100644 index 000000000000..a190587cf6a5 --- /dev/null +++ b/drivers/kvm/lapic.c @@ -0,0 +1,1064 @@ + +/* + * Local APIC virtualization + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2007 Novell + * Copyright (C) 2007 Intel + * + * Authors: + * Dor Laor <dor.laor@qumranet.com> + * Gregory Haskins <ghaskins@novell.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * + * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "kvm.h" +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/apicdef.h> +#include <asm/atomic.h> +#include <asm/div64.h> +#include "irq.h" + +#define PRId64 "d" +#define PRIx64 "llx" +#define PRIu64 "u" +#define PRIo64 "o" + +#define APIC_BUS_CYCLE_NS 1 + +/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ +#define apic_debug(fmt, arg...) + +#define APIC_LVT_NUM 6 +/* 14 is the version for Xeon and Pentium 8.4.8*/ +#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) +#define LAPIC_MMIO_LENGTH (1 << 12) +/* followed define is not in apicdef.h */ +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 +#define MAX_APIC_VECTOR 256 + +#define VEC_POS(v) ((v) & (32 - 1)) +#define REG_POS(v) (((v) >> 5) << 4) +static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + *((u32 *) (apic->regs + reg_off)) = val; +} + +static inline int apic_test_and_set_vector(int vec, void *bitmap) +{ + return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_test_and_clear_vector(int vec, void *bitmap) +{ + return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_set_vector(int vec, void *bitmap) +{ + set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_clear_vector(int vec, void *bitmap) +{ + clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_hw_enabled(struct kvm_lapic *apic) +{ + return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; +} + +static inline int apic_sw_enabled(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; +} + +static inline int apic_enabled(struct kvm_lapic *apic) +{ + return apic_sw_enabled(apic) && apic_hw_enabled(apic); +} + +#define LVT_MASK \ + (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) + +#define LINT_MASK \ + (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ + APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) + +static inline int kvm_apic_id(struct kvm_lapic *apic) +{ + return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + +static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) +{ + return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); +} + +static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) +{ + return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; +} + +static inline int apic_lvtt_period(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; +} + +static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { + LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ + LVT_MASK | APIC_MODE_MASK, /* LVTPC */ + LINT_MASK, LINT_MASK, /* LVT0-1 */ + LVT_MASK /* LVTERR */ +}; + +static int find_highest_vector(void *bitmap) +{ + u32 *word = bitmap; + int word_offset = MAX_APIC_VECTOR >> 5; + + while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) + continue; + + if (likely(!word_offset && !word[0])) + return -1; + else + return fls(word[word_offset << 2]) - 1 + (word_offset << 5); +} + +static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +{ + return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); +} + +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic_clear_vector(vec, apic->regs + APIC_IRR); +} + +static inline int apic_find_highest_irr(struct kvm_lapic *apic) +{ + int result; + + result = find_highest_vector(apic->regs + APIC_IRR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + int highest_irr; + + if (!apic) + return 0; + highest_irr = apic_find_highest_irr(apic); + + return highest_irr; +} +EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); + +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) +{ + if (!apic_test_and_set_irr(vec, apic)) { + /* a new pending irq is set in IRR */ + if (trig) + apic_set_vector(vec, apic->regs + APIC_TMR); + else + apic_clear_vector(vec, apic->regs + APIC_TMR); + kvm_vcpu_kick(apic->vcpu); + return 1; + } + return 0; +} + +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 tpr, isrv, ppr; + int isr; + + tpr = apic_get_reg(apic, APIC_TASKPRI); + isr = apic_find_highest_isr(apic); + isrv = (isr != -1) ? isr : 0; + + if ((tpr & 0xf0) >= (isrv & 0xf0)) + ppr = tpr & 0xff; + else + ppr = isrv & 0xf0; + + apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", + apic, ppr, isr, isrv); + + apic_set_reg(apic, APIC_PROCPRI, ppr); +} + +static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) +{ + apic_set_reg(apic, APIC_TASKPRI, tpr); + apic_update_ppr(apic); +} + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) +{ + return kvm_apic_id(apic) == dest; +} + +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) +{ + int result = 0; + u8 logical_id; + + logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + + switch (apic_get_reg(apic, APIC_DFR)) { + case APIC_DFR_FLAT: + if (logical_id & mda) + result = 1; + break; + case APIC_DFR_CLUSTER: + if (((logical_id >> 4) == (mda >> 0x4)) + && (logical_id & mda & 0xf)) + result = 1; + break; + default: + printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + break; + } + + return result; +} + +static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode) +{ + int result = 0; + struct kvm_lapic *target = vcpu->apic; + + apic_debug("target %p, source %p, dest 0x%x, " + "dest_mode 0x%x, short_hand 0x%x", + target, source, dest, dest_mode, short_hand); + + ASSERT(!target); + switch (short_hand) { + case APIC_DEST_NOSHORT: + if (dest_mode == 0) { + /* Physical mode. */ + if ((dest == 0xFF) || (dest == kvm_apic_id(target))) + result = 1; + } else + /* Logical mode. */ + result = kvm_apic_match_logical_addr(target, dest); + break; + case APIC_DEST_SELF: + if (target == source) + result = 1; + break; + case APIC_DEST_ALLINC: + result = 1; + break; + case APIC_DEST_ALLBUT: + if (target != source) + result = 1; + break; + default: + printk(KERN_WARNING "Bad dest shorthand value %x\n", + short_hand); + break; + } + + return result; +} + +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. + */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode) +{ + int orig_irr, result = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + + switch (delivery_mode) { + case APIC_DM_FIXED: + case APIC_DM_LOWEST: + /* FIXME add logic for vcpu on reset */ + if (unlikely(!apic_enabled(apic))) + break; + + orig_irr = apic_test_and_set_irr(vector, apic); + if (orig_irr && trig_mode) { + apic_debug("level trig mode repeatedly for vector %d", + vector); + break; + } + + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) + kvm_vcpu_kick(vcpu); + else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + } + + result = (orig_irr == 0); + break; + + case APIC_DM_REMRD: + printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + break; + + case APIC_DM_SMI: + printk(KERN_DEBUG "Ignoring guest SMI\n"); + break; + case APIC_DM_NMI: + printk(KERN_DEBUG "Ignoring guest NMI\n"); + break; + + case APIC_DM_INIT: + if (level) { + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) + printk(KERN_DEBUG + "INIT on a runnable vcpu %d\n", + vcpu->vcpu_id); + vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; + kvm_vcpu_kick(vcpu); + } else { + printk(KERN_DEBUG + "Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); + } + + break; + + case APIC_DM_STARTUP: + printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); + if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + vcpu->sipi_vector = vector; + vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + } + break; + + default: + printk(KERN_ERR "TODO: unsupported delivery mode %x\n", + delivery_mode); + break; + } + return result; +} + +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, + unsigned long bitmap) +{ + int vcpu_id; + int last; + int next; + struct kvm_lapic *apic; + + last = kvm->round_robin_prev_vcpu; + next = last; + + do { + if (++next == KVM_MAX_VCPUS) + next = 0; + if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) + continue; + apic = kvm->vcpus[next]->apic; + if (apic && apic_enabled(apic)) + break; + apic = NULL; + } while (next != last); + kvm->round_robin_prev_vcpu = next; + + if (!apic) { + vcpu_id = ffs(bitmap) - 1; + if (vcpu_id < 0) { + vcpu_id = 0; + printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + } + apic = kvm->vcpus[vcpu_id]->apic; + } + + return apic; +} + +static void apic_set_eoi(struct kvm_lapic *apic) +{ + int vector = apic_find_highest_isr(apic); + + /* + * Not every write EOI will has corresponding ISR, + * one example is when Kernel check timer on setup_IO_APIC + */ + if (vector == -1) + return; + + apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + + if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); +} + +static void apic_send_ipi(struct kvm_lapic *apic) +{ + u32 icr_low = apic_get_reg(apic, APIC_ICR); + u32 icr_high = apic_get_reg(apic, APIC_ICR2); + + unsigned int dest = GET_APIC_DEST_FIELD(icr_high); + unsigned int short_hand = icr_low & APIC_SHORT_MASK; + unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; + unsigned int level = icr_low & APIC_INT_ASSERT; + unsigned int dest_mode = icr_low & APIC_DEST_MASK; + unsigned int delivery_mode = icr_low & APIC_MODE_MASK; + unsigned int vector = icr_low & APIC_VECTOR_MASK; + + struct kvm_lapic *target; + struct kvm_vcpu *vcpu; + unsigned long lpr_map = 0; + int i; + + apic_debug("icr_high 0x%x, icr_low 0x%x, " + "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + icr_high, icr_low, short_hand, dest, + trig_mode, level, dest_mode, delivery_mode, vector); + + for (i = 0; i < KVM_MAX_VCPUS; i++) { + vcpu = apic->vcpu->kvm->vcpus[i]; + if (!vcpu) + continue; + + if (vcpu->apic && + apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { + if (delivery_mode == APIC_DM_LOWEST) + set_bit(vcpu->vcpu_id, &lpr_map); + else + __apic_accept_irq(vcpu->apic, delivery_mode, + vector, level, trig_mode); + } + } + + if (delivery_mode == APIC_DM_LOWEST) { + target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); + if (target != NULL) + __apic_accept_irq(target, delivery_mode, + vector, level, trig_mode); + } +} + +static u32 apic_get_tmcct(struct kvm_lapic *apic) +{ + u32 counter_passed; + ktime_t passed, now = apic->timer.dev.base->get_time(); + u32 tmcct = apic_get_reg(apic, APIC_TMICT); + + ASSERT(apic != NULL); + + if (unlikely(ktime_to_ns(now) <= + ktime_to_ns(apic->timer.last_update))) { + /* Wrap around */ + passed = ktime_add(( { + (ktime_t) { + .tv64 = KTIME_MAX - + (apic->timer.last_update).tv64}; } + ), now); + apic_debug("time elapsed\n"); + } else + passed = ktime_sub(now, apic->timer.last_update); + + counter_passed = div64_64(ktime_to_ns(passed), + (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); + tmcct -= counter_passed; + + if (tmcct <= 0) { + if (unlikely(!apic_lvtt_period(apic))) + tmcct = 0; + else + do { + tmcct += apic_get_reg(apic, APIC_TMICT); + } while (tmcct <= 0); + } + + return tmcct; +} + +static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) +{ + u32 val = 0; + + if (offset >= LAPIC_MMIO_LENGTH) + return 0; + + switch (offset) { + case APIC_ARBPRI: + printk(KERN_WARNING "Access APIC ARBPRI register " + "which is for P6\n"); + break; + + case APIC_TMCCT: /* Timer CCR */ + val = apic_get_tmcct(apic); + break; + + default: + apic_update_ppr(apic); + val = apic_get_reg(apic, offset); + break; + } + + return val; +} + +static void apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0xf; + u32 result; + + if ((alignment + len) > 4) { + printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", + (unsigned long)address, len); + return; + } + result = __apic_read(apic, offset & ~0xf); + + switch (len) { + case 1: + case 2: + case 4: + memcpy(data, (char *)&result + alignment, len); + break; + default: + printk(KERN_ERR "Local APIC read with len = %x, " + "should be 1,2, or 4 instead\n", len); + break; + } +} + +static void update_divide_count(struct kvm_lapic *apic) +{ + u32 tmp1, tmp2, tdcr; + + tdcr = apic_get_reg(apic, APIC_TDCR); + tmp1 = tdcr & 0xf; + tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; + apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + + apic_debug("timer divide count is 0x%x\n", + apic->timer.divide_count); +} + +static void start_apic_timer(struct kvm_lapic *apic) +{ + ktime_t now = apic->timer.dev.base->get_time(); + + apic->timer.last_update = now; + + apic->timer.period = apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->timer.divide_count; + atomic_set(&apic->timer.pending, 0); + hrtimer_start(&apic->timer.dev, + ktime_add_ns(now, apic->timer.period), + HRTIMER_MODE_ABS); + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + apic_get_reg(apic, APIC_TMICT), + apic->timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->timer.period))); +} + +static void apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0xf; + u32 val; + + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || alignment) { + if (printk_ratelimit()) + printk(KERN_ERR "apic write: bad size=%d %lx\n", + len, (long)address); + return; + } + + val = *(u32 *) data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __FUNCTION__, offset, len, val); + + offset &= 0xff0; + + switch (offset) { + case APIC_ID: /* Local APIC ID */ + apic_set_reg(apic, APIC_ID, val); + break; + + case APIC_TASKPRI: + apic_set_tpr(apic, val & 0xff); + break; + + case APIC_EOI: + apic_set_eoi(apic); + break; + + case APIC_LDR: + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + break; + + case APIC_DFR: + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + break; + + case APIC_SPIV: + apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + if (!(val & APIC_SPIV_APIC_ENABLED)) { + int i; + u32 lvt_val; + + for (i = 0; i < APIC_LVT_NUM; i++) { + lvt_val = apic_get_reg(apic, + APIC_LVTT + 0x10 * i); + apic_set_reg(apic, APIC_LVTT + 0x10 * i, + lvt_val | APIC_LVT_MASKED); + } + atomic_set(&apic->timer.pending, 0); + + } + break; + + case APIC_ICR: + /* No delay here, so we always clear the pending bit */ + apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + apic_send_ipi(apic); + break; + + case APIC_ICR2: + apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + break; + + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + /* TODO: Check vector */ + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + + val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; + apic_set_reg(apic, offset, val); + + break; + + case APIC_TMICT: + hrtimer_cancel(&apic->timer.dev); + apic_set_reg(apic, APIC_TMICT, val); + start_apic_timer(apic); + return; + + case APIC_TDCR: + if (val & 4) + printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_set_reg(apic, APIC_TDCR, val); + update_divide_count(apic); + break; + + default: + apic_debug("Local APIC Write to read-only register %x\n", + offset); + break; + } + +} + +static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + int ret = 0; + + + if (apic_hw_enabled(apic) && + (addr >= apic->base_address) && + (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) + ret = 1; + + return ret; +} + +void kvm_free_apic(struct kvm_lapic *apic) +{ + if (!apic) + return; + + hrtimer_cancel(&apic->timer.dev); + + if (apic->regs_page) { + __free_page(apic->regs_page); + apic->regs_page = 0; + } + + kfree(apic); +} + +/* + *---------------------------------------------------------------------- + * LAPIC interface + *---------------------------------------------------------------------- + */ + +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + + if (!apic) + return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); +} + +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + u64 tpr; + + if (!apic) + return 0; + tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + return (tpr & 0xf0) >> 4; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); + +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + + if (!apic) { + value |= MSR_IA32_APICBASE_BSP; + vcpu->apic_base = value; + return; + } + if (apic->vcpu->vcpu_id) + value &= ~MSR_IA32_APICBASE_BSP; + + vcpu->apic_base = value; + apic->base_address = apic->vcpu->apic_base & + MSR_IA32_APICBASE_BASE; + + /* with FSB delivery interrupt, we can restart APIC functionality */ + apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " + "0x%lx.\n", apic->apic_base, apic->base_address); + +} + +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) +{ + return vcpu->apic_base; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_base); + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + int i; + + apic_debug("%s\n", __FUNCTION__); + + ASSERT(vcpu); + apic = vcpu->apic; + ASSERT(apic != NULL); + + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->timer.dev); + + apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + apic_set_reg(apic, APIC_LVR, APIC_VERSION); + + for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + for (i = 0; i < 8; i++) { + apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + } + apic->timer.divide_count = 0; + atomic_set(&apic->timer.pending, 0); + if (vcpu->vcpu_id == 0) + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + apic_update_ppr(apic); + + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, + vcpu, kvm_apic_id(apic), + vcpu->apic_base, apic->base_address); +} +EXPORT_SYMBOL_GPL(kvm_lapic_reset); + +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; + int ret = 0; + + if (!apic) + return 0; + ret = apic_enabled(apic); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_lapic_enabled); + +/* + *---------------------------------------------------------------------- + * timer interface + *---------------------------------------------------------------------- + */ + +/* TODO: make sure __apic_timer_fn runs in current pCPU */ +static int __apic_timer_fn(struct kvm_lapic *apic) +{ + int result = 0; + wait_queue_head_t *q = &apic->vcpu->wq; + + atomic_inc(&apic->timer.pending); + if (waitqueue_active(q)) + { + apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; + wake_up_interruptible(q); + } + if (apic_lvtt_period(apic)) { + result = 1; + apic->timer.dev.expires = ktime_add_ns( + apic->timer.dev.expires, + apic->timer.period); + } + return result; +} + +static int __inject_apic_timer_irq(struct kvm_lapic *apic) +{ + int vector; + + vector = apic_lvt_vector(apic, APIC_LVTT); + return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); +} + +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_lapic *apic; + int restart_timer = 0; + + apic = container_of(data, struct kvm_lapic, timer.dev); + + restart_timer = __apic_timer_fn(apic); + + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + +int kvm_create_lapic(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + + ASSERT(vcpu != NULL); + apic_debug("apic_init %d\n", vcpu->vcpu_id); + + apic = kzalloc(sizeof(*apic), GFP_KERNEL); + if (!apic) + goto nomem; + + vcpu->apic = apic; + + apic->regs_page = alloc_page(GFP_KERNEL); + if (apic->regs_page == NULL) { + printk(KERN_ERR "malloc apic regs error for vcpu %x\n", + vcpu->vcpu_id); + goto nomem; + } + apic->regs = page_address(apic->regs_page); + memset(apic->regs, 0, PAGE_SIZE); + apic->vcpu = vcpu; + + hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + apic->timer.dev.function = apic_timer_fn; + apic->base_address = APIC_DEFAULT_PHYS_BASE; + vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; + + kvm_lapic_reset(vcpu); + apic->dev.read = apic_mmio_read; + apic->dev.write = apic_mmio_write; + apic->dev.in_range = apic_mmio_range; + apic->dev.private = apic; + + return 0; +nomem: + kvm_free_apic(apic); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(kvm_create_lapic); + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->apic; + int highest_irr; + + if (!apic || !apic_enabled(apic)) + return -1; + + apic_update_ppr(apic); + highest_irr = apic_find_highest_irr(apic); + if ((highest_irr == -1) || + ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + return -1; + return highest_irr; +} + +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) +{ + u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); + int r = 0; + + if (vcpu->vcpu_id == 0) { + if (!apic_hw_enabled(vcpu->apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; + } + return r; +} + +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->apic; + + if (apic && apic_lvt_enabled(apic, APIC_LVTT) && + atomic_read(&apic->timer.pending) > 0) { + if (__inject_apic_timer_irq(apic)) + atomic_dec(&apic->timer.pending); + } +} + +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) +{ + struct kvm_lapic *apic = vcpu->apic; + + if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) + apic->timer.last_update = ktime_add_ns( + apic->timer.last_update, + apic->timer.period); +} + +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +{ + int vector = kvm_apic_has_interrupt(vcpu); + struct kvm_lapic *apic = vcpu->apic; + + if (vector == -1) + return -1; + + apic_set_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + apic_clear_irr(vector, apic); + return vector; +} + +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->apic; + + apic->base_address = vcpu->apic_base & + MSR_IA32_APICBASE_BASE; + apic_set_reg(apic, APIC_LVR, APIC_VERSION); + apic_update_ppr(apic); + hrtimer_cancel(&apic->timer.dev); + update_divide_count(apic); + start_apic_timer(apic); +} + +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->apic; + struct hrtimer *timer; + + if (!apic) + return; + + timer = &apic->timer.dev; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} +EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index 23965aa5ee78..6d84d30f5ed0 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c @@ -158,7 +158,7 @@ static struct kmem_cache *mmu_page_header_cache; static int is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->cr0 & CR0_WP_MASK; + return vcpu->cr0 & X86_CR0_WP; } static int is_cpuid_PSE36(void) @@ -202,15 +202,14 @@ static void set_shadow_pte(u64 *sptep, u64 spte) } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min, - gfp_t gfp_flags) + struct kmem_cache *base_cache, int min) { void *obj; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, gfp_flags); + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); if (!obj) return -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -225,14 +224,14 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min, gfp_t gfp_flags) + int min) { struct page *page; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(gfp_flags); + page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; set_page_private(page, 0); @@ -247,44 +246,28 @@ static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) free_page((unsigned long)mc->objects[--mc->nobjs]); } -static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) { int r; + kvm_mmu_free_some_pages(vcpu); r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, - pte_chain_cache, 4, gfp_flags); + pte_chain_cache, 4); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, - rmap_desc_cache, 1, gfp_flags); + rmap_desc_cache, 1); if (r) goto out; - r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4, gfp_flags); + r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); if (r) goto out; r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, - mmu_page_header_cache, 4, gfp_flags); + mmu_page_header_cache, 4); out: return r; } -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT); - kvm_mmu_free_some_pages(vcpu); - if (r < 0) { - spin_unlock(&vcpu->kvm->lock); - kvm_arch_ops->vcpu_put(vcpu); - r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL); - kvm_arch_ops->vcpu_load(vcpu); - spin_lock(&vcpu->kvm->lock); - kvm_mmu_free_some_pages(vcpu); - } - return r; -} - static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); @@ -969,7 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_arch_ops->tlb_flush(vcpu); + kvm_x86_ops->tlb_flush(vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) @@ -982,7 +965,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) { - kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); + kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); } static void paging_free(struct kvm_vcpu *vcpu) @@ -1071,15 +1054,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); r = mmu_topup_memory_caches(vcpu); if (r) goto out; mmu_alloc_roots(vcpu); - kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); out: - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return r; } EXPORT_SYMBOL_GPL(kvm_mmu_load); @@ -1124,7 +1107,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *old, const u8 *new, int bytes) + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h index 4b5391c717f8..6b094b44f8fb 100644 --- a/drivers/kvm/paging_tmpl.h +++ b/drivers/kvm/paging_tmpl.h @@ -58,7 +58,10 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t *table; + pt_element_t pte; pt_element_t *ptep; + struct page *page; + int index; pt_element_t inherited_ar; gfn_t gfn; u32 error_code; @@ -80,11 +83,14 @@ static int FNAME(walk_addr)(struct guest_walker *walker, pgprintk("%s: addr %lx\n", __FUNCTION__, addr); walker->level = vcpu->mmu.root_level; walker->table = NULL; + walker->page = NULL; + walker->ptep = NULL; root = vcpu->cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; root = *walker->ptep; + walker->pte = root; if (!(root & PT_PRESENT_MASK)) goto not_present; --walker->level; @@ -96,10 +102,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker, walker->level - 1, table_gfn); slot = gfn_to_memslot(vcpu->kvm, table_gfn); hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); - walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); + walker->page = pfn_to_page(hpa >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; @@ -108,6 +115,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, hpa_t paddr; ptep = &walker->table[index]; + walker->index = index; ASSERT(((unsigned long)walker->table & PAGE_MASK) == ((unsigned long)ptep & PAGE_MASK)); @@ -148,16 +156,20 @@ static int FNAME(walk_addr)(struct guest_walker *walker, walker->inherited_ar &= walker->table[index]; table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; - paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK); kunmap_atomic(walker->table, KM_USER0); - walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), - KM_USER0); + paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); + walker->page = pfn_to_page(paddr >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); --walker->level; walker->table_gfn[walker->level - 1 ] = table_gfn; pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, walker->level - 1, table_gfn); } - walker->ptep = ptep; + walker->pte = *ptep; + if (walker->page) + walker->ptep = NULL; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); return 1; @@ -175,13 +187,9 @@ err: walker->error_code |= PFERR_USER_MASK; if (fetch_fault) walker->error_code |= PFERR_FETCH_MASK; - return 0; -} - -static void FNAME(release_walker)(struct guest_walker *walker) -{ if (walker->table) kunmap_atomic(walker->table, KM_USER0); + return 0; } static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, @@ -193,7 +201,7 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, u64 *shadow_pte, gpa_t gaddr, - pt_element_t *gpte, + pt_element_t gpte, u64 access_bits, int user_fault, int write_fault, @@ -202,23 +210,34 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, gfn_t gfn) { hpa_t paddr; - int dirty = *gpte & PT_DIRTY_MASK; + int dirty = gpte & PT_DIRTY_MASK; u64 spte = *shadow_pte; int was_rmapped = is_rmap_pte(spte); pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" " user_fault %d gfn %lx\n", - __FUNCTION__, spte, (u64)*gpte, access_bits, + __FUNCTION__, spte, (u64)gpte, access_bits, write_fault, user_fault, gfn); if (write_fault && !dirty) { - *gpte |= PT_DIRTY_MASK; + pt_element_t *guest_ent, *tmp = NULL; + + if (walker->ptep) + guest_ent = walker->ptep; + else { + tmp = kmap_atomic(walker->page, KM_USER0); + guest_ent = &tmp[walker->index]; + } + + *guest_ent |= PT_DIRTY_MASK; + if (!walker->ptep) + kunmap_atomic(tmp, KM_USER0); dirty = 1; FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); } spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; - spte |= *gpte & PT64_NX_MASK; + spte |= gpte & PT64_NX_MASK; if (!dirty) access_bits &= ~PT_WRITABLE_MASK; @@ -255,7 +274,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, access_bits &= ~PT_WRITABLE_MASK; if (is_writeble_pte(spte)) { spte &= ~PT_WRITABLE_MASK; - kvm_arch_ops->tlb_flush(vcpu); + kvm_x86_ops->tlb_flush(vcpu); } if (write_fault) *ptwrite = 1; @@ -273,13 +292,13 @@ unshadowed: rmap_add(vcpu, shadow_pte); } -static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, u64 *shadow_pte, u64 access_bits, int user_fault, int write_fault, int *ptwrite, struct guest_walker *walker, gfn_t gfn) { - access_bits &= *gpte; - FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, + access_bits &= gpte; + FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, gpte, access_bits, user_fault, write_fault, ptwrite, walker, gfn); } @@ -295,22 +314,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) return; pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, 0, NULL, NULL, (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); } -static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, u64 *shadow_pte, u64 access_bits, int user_fault, int write_fault, int *ptwrite, struct guest_walker *walker, gfn_t gfn) { gpa_t gaddr; - access_bits &= *gpde; + access_bits &= gpde; gaddr = (gpa_t)gfn << PAGE_SHIFT; if (PTTYPE == 32 && is_cpuid_PSE36()) - gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << + gaddr |= (gpde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT); FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, gpde, access_bits, user_fault, write_fault, @@ -328,9 +347,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int level; u64 *shadow_ent; u64 *prev_shadow_ent = NULL; - pt_element_t *guest_ent = walker->ptep; - if (!is_present_pte(*guest_ent)) + if (!is_present_pte(walker->pte)) return NULL; shadow_addr = vcpu->mmu.root_hpa; @@ -364,12 +382,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; - hugepage_access = *guest_ent; + hugepage_access = walker->pte; hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; - if (*guest_ent & PT64_NX_MASK) + if (walker->pte & PT64_NX_MASK) hugepage_access |= (1 << 2); hugepage_access >>= PT_WRITABLE_SHIFT; - table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) + table_gfn = (walker->pte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; } else { metaphysical = 0; @@ -386,12 +404,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (walker->level == PT_DIRECTORY_LEVEL) { - FNAME(set_pde)(vcpu, guest_ent, shadow_ent, + FNAME(set_pde)(vcpu, walker->pte, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); } else { ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); - FNAME(set_pte)(vcpu, guest_ent, shadow_ent, + FNAME(set_pte)(vcpu, walker->pte, shadow_ent, walker->inherited_ar, user_fault, write_fault, ptwrite, walker, walker->gfn); } @@ -442,7 +460,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (!r) { pgprintk("%s: guest page fault\n", __FUNCTION__); inject_page_fault(vcpu, addr, walker.error_code); - FNAME(release_walker)(&walker); vcpu->last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -452,8 +469,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, shadow_pte, *shadow_pte, write_pt); - FNAME(release_walker)(&walker); - if (!write_pt) vcpu->last_pt_write_count = 0; /* reset fork detector */ @@ -482,7 +497,6 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) gpa |= vaddr & ~PAGE_MASK; } - FNAME(release_walker)(&walker); return gpa; } diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c index bc818cc126e3..729f1cd93606 100644 --- a/drivers/kvm/svm.c +++ b/drivers/kvm/svm.c @@ -16,12 +16,12 @@ #include "kvm_svm.h" #include "x86_emulate.h" +#include "irq.h" #include <linux/module.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> -#include <linux/profile.h> #include <linux/sched.h> #include <asm/desc.h> @@ -38,7 +38,6 @@ MODULE_LICENSE("GPL"); #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) -#define CR4_DE_MASK (1UL << 3) #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 @@ -50,6 +49,13 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +static void kvm_reput_irq(struct vcpu_svm *svm); + +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_svm, vcpu); +} + unsigned long iopm_base; unsigned long msrpm_base; @@ -94,20 +100,6 @@ static inline u32 svm_has(u32 feat) return svm_features & feat; } -static unsigned get_addr_size(struct kvm_vcpu *vcpu) -{ - struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; - u16 cs_attrib; - - if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM)) - return 2; - - cs_attrib = sa->cs.attrib; - - return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 : - (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2; -} - static inline u8 pop_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->irq_summary); @@ -182,7 +174,7 @@ static inline void write_dr7(unsigned long val) static inline void force_new_asid(struct kvm_vcpu *vcpu) { - vcpu->svm->asid_generation--; + to_svm(vcpu)->asid_generation--; } static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) @@ -195,22 +187,24 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!(efer & KVM_EFER_LMA)) efer &= ~KVM_EFER_LME; - vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; vcpu->shadow_efer = efer; } static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_TYPE_EXEPT | GP_VECTOR; - vcpu->svm->vmcb->control.event_inj_err = error_code; + svm->vmcb->control.event_inj_err = error_code; } static void inject_ud(struct kvm_vcpu *vcpu) { - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | UD_VECTOR; } @@ -229,19 +223,21 @@ static int is_external_interrupt(u32 info) static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { - if (!vcpu->svm->next_rip) { + struct vcpu_svm *svm = to_svm(vcpu); + + if (!svm->next_rip) { printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); return; } - if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", __FUNCTION__, - vcpu->svm->vmcb->save.rip, - vcpu->svm->next_rip); + svm->vmcb->save.rip, + svm->next_rip); } - vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; - vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + vcpu->rip = svm->vmcb->save.rip = svm->next_rip; + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->interrupt_window_open = 1; } @@ -351,8 +347,8 @@ err_1: } -static int set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) +static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) { int i; @@ -367,11 +363,10 @@ static int set_msr_interception(u32 *msrpm, unsigned msr, u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); *base = (*base & ~(0x3 << msr_shift)) | (mask << msr_shift); - return 1; + return; } } - printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); - return 0; + BUG(); } static __init int svm_hardware_setup(void) @@ -382,8 +377,6 @@ static __init int svm_hardware_setup(void) void *iopm_va, *msrpm_va; int r; - kvm_emulator_want_group7_invlpg(); - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); if (!iopm_pages) @@ -458,11 +451,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static int svm_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void init_vmcb(struct vmcb *vmcb) { struct vmcb_control_area *control = &vmcb->control; @@ -563,59 +551,83 @@ static void init_vmcb(struct vmcb *vmcb) * cr0 val on cpu init should be 0x60000010, we enable cpu * cache by default. the orderly way is to enable cache in bios. */ - save->cr0 = 0x00000010 | CR0_PG_MASK | CR0_WP_MASK; - save->cr4 = CR4_PAE_MASK; + save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ } -static int svm_create_vcpu(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + init_vmcb(svm->vmcb); +} + +static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { + struct vcpu_svm *svm; struct page *page; - int r; + int err; + + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!svm) { + err = -ENOMEM; + goto out; + } + + err = kvm_vcpu_init(&svm->vcpu, kvm, id); + if (err) + goto free_svm; + + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&svm->vcpu); + if (err < 0) + goto free_svm; + } - r = -ENOMEM; - vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL); - if (!vcpu->svm) - goto out1; page = alloc_page(GFP_KERNEL); - if (!page) - goto out2; - - vcpu->svm->vmcb = page_address(page); - clear_page(vcpu->svm->vmcb); - vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - vcpu->svm->asid_generation = 0; - memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); - init_vmcb(vcpu->svm->vmcb); - - fx_init(vcpu); - vcpu->fpu_active = 1; - vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vcpu == &vcpu->kvm->vcpus[0]) - vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + if (!page) { + err = -ENOMEM; + goto uninit; + } - return 0; + svm->vmcb = page_address(page); + clear_page(svm->vmcb); + svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->asid_generation = 0; + memset(svm->db_regs, 0, sizeof(svm->db_regs)); + init_vmcb(svm->vmcb); -out2: - kfree(vcpu->svm); -out1: - return r; + fx_init(&svm->vcpu); + svm->vcpu.fpu_active = 1; + svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (svm->vcpu.vcpu_id == 0) + svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; + + return &svm->vcpu; + +uninit: + kvm_vcpu_uninit(&svm->vcpu); +free_svm: + kmem_cache_free(kvm_vcpu_cache, svm); +out: + return ERR_PTR(err); } static void svm_free_vcpu(struct kvm_vcpu *vcpu) { - if (!vcpu->svm) - return; - if (vcpu->svm->vmcb) - __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); - kfree(vcpu->svm); + struct vcpu_svm *svm = to_svm(vcpu); + + __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); } -static void svm_vcpu_load(struct kvm_vcpu *vcpu) +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - int cpu, i; + struct vcpu_svm *svm = to_svm(vcpu); + int i; - cpu = get_cpu(); if (unlikely(cpu != vcpu->cpu)) { u64 tsc_this, delta; @@ -625,23 +637,24 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu) */ rdtscll(tsc_this); delta = vcpu->host_tsc - tsc_this; - vcpu->svm->vmcb->control.tsc_offset += delta; + svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; + kvm_migrate_apic_timer(vcpu); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); + rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); int i; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); rdtscll(vcpu->host_tsc); - put_cpu(); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -650,31 +663,34 @@ static void svm_vcpu_decache(struct kvm_vcpu *vcpu) static void svm_cache_regs(struct kvm_vcpu *vcpu) { - vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; - vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; - vcpu->rip = vcpu->svm->vmcb->save.rip; + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->rip = svm->vmcb->save.rip; } static void svm_decache_regs(struct kvm_vcpu *vcpu) { - vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; - vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; - vcpu->svm->vmcb->save.rip = vcpu->rip; + struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->rip; } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return vcpu->svm->vmcb->save.rflags; + return to_svm(vcpu)->vmcb->save.rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - vcpu->svm->vmcb->save.rflags = rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) { - struct vmcb_save_area *save = &vcpu->svm->vmcb->save; + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; switch (seg) { case VCPU_SREG_CS: return &save->cs; @@ -716,36 +732,36 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present; } -static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS); - - *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; -} - static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->limit = vcpu->svm->vmcb->save.idtr.limit; - dt->base = vcpu->svm->vmcb->save.idtr.base; + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.idtr.limit; + dt->base = svm->vmcb->save.idtr.base; } static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vcpu->svm->vmcb->save.idtr.limit = dt->limit; - vcpu->svm->vmcb->save.idtr.base = dt->base ; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.idtr.limit = dt->limit; + svm->vmcb->save.idtr.base = dt->base ; } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - dt->limit = vcpu->svm->vmcb->save.gdtr.limit; - dt->base = vcpu->svm->vmcb->save.gdtr.base; + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.gdtr.limit; + dt->base = svm->vmcb->save.gdtr.base; } static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { - vcpu->svm->vmcb->save.gdtr.limit = dt->limit; - vcpu->svm->vmcb->save.gdtr.base = dt->base ; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.gdtr.limit = dt->limit; + svm->vmcb->save.gdtr.base = dt->base ; } static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) @@ -754,39 +770,42 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + struct vcpu_svm *svm = to_svm(vcpu); + #ifdef CONFIG_X86_64 if (vcpu->shadow_efer & KVM_EFER_LME) { - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { vcpu->shadow_efer |= KVM_EFER_LMA; - vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; + svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; } - if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { vcpu->shadow_efer &= ~KVM_EFER_LMA; - vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); + svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); } } #endif - if ((vcpu->cr0 & CR0_TS_MASK) && !(cr0 & CR0_TS_MASK)) { - vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } vcpu->cr0 = cr0; - cr0 |= CR0_PG_MASK | CR0_WP_MASK; - cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); - vcpu->svm->vmcb->save.cr0 = cr0; + cr0 |= X86_CR0_PG | X86_CR0_WP; + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vcpu->cr4 = cr4; - vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; + to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; } static void svm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_seg *s = svm_seg(vcpu, seg); s->base = var->base; @@ -805,16 +824,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } if (seg == VCPU_SREG_CS) - vcpu->svm->vmcb->save.cpl - = (vcpu->svm->vmcb->save.cs.attrib + svm->vmcb->save.cpl + = (svm->vmcb->save.cs.attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; } /* FIXME: - vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; - vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); + svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; + svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); */ @@ -823,61 +842,68 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) return -EOPNOTSUPP; } +static int svm_get_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 exit_int_info = svm->vmcb->control.exit_int_info; + + if (is_external_interrupt(exit_int_info)) + return exit_int_info & SVM_EVTINJ_VEC_MASK; + return -1; +} + static void load_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); + wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); #endif } static void save_host_msrs(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); #endif } -static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) { if (svm_data->next_asid > svm_data->max_asid) { ++svm_data->asid_generation; svm_data->next_asid = 1; - vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } - vcpu->cpu = svm_data->cpu; - vcpu->svm->asid_generation = svm_data->asid_generation; - vcpu->svm->vmcb->control.asid = svm_data->next_asid++; -} - -static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address) -{ - invlpga(address, vcpu->svm->vmcb->control.asid); // is needed? + svm->vcpu.cpu = svm_data->cpu; + svm->asid_generation = svm_data->asid_generation; + svm->vmcb->control.asid = svm_data->next_asid++; } static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - return vcpu->svm->db_regs[dr]; + return to_svm(vcpu)->db_regs[dr]; } static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, int *exception) { + struct vcpu_svm *svm = to_svm(vcpu); + *exception = 0; - if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { - vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; - vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; + if (svm->vmcb->save.dr7 & DR7_GD_MASK) { + svm->vmcb->save.dr7 &= ~DR7_GD_MASK; + svm->vmcb->save.dr6 |= DR6_BD_MASK; *exception = DB_VECTOR; return; } switch (dr) { case 0 ... 3: - vcpu->svm->db_regs[dr] = value; + svm->db_regs[dr] = value; return; case 4 ... 5: - if (vcpu->cr4 & CR4_DE_MASK) { + if (vcpu->cr4 & X86_CR4_DE) { *exception = UD_VECTOR; return; } @@ -886,7 +912,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, *exception = GP_VECTOR; return; } - vcpu->svm->vmcb->save.dr7 = value; + svm->vmcb->save.dr7 = value; return; } default: @@ -897,42 +923,44 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, } } -static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + u32 exit_int_info = svm->vmcb->control.exit_int_info; + struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; enum emulation_result er; int r; - if (is_external_interrupt(exit_int_info)) - push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + if (!irqchip_in_kernel(kvm) && + is_external_interrupt(exit_int_info)) + push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&kvm->lock); - fault_address = vcpu->svm->vmcb->control.exit_info_2; - error_code = vcpu->svm->vmcb->control.exit_info_1; - r = kvm_mmu_page_fault(vcpu, fault_address, error_code); + fault_address = svm->vmcb->control.exit_info_2; + error_code = svm->vmcb->control.exit_info_1; + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); if (r < 0) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&kvm->lock); return r; } if (!r) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&kvm->lock); return 1; } - er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); - spin_unlock(&vcpu->kvm->lock); + er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, + error_code); + mutex_unlock(&kvm->lock); switch (er) { case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - kvm_run->exit_reason = KVM_EXIT_MMIO; + ++svm->vcpu.stat.mmio_exits; return 0; case EMULATE_FAIL: - vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + kvm_report_emulation_failure(&svm->vcpu, "pagetable"); break; default: BUG(); @@ -942,252 +970,142 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int nm_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(vcpu->cr0 & CR0_TS_MASK)) - vcpu->svm->vmcb->save.cr0 &= ~CR0_TS_MASK; - vcpu->fpu_active = 1; + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if (!(svm->vcpu.cr0 & X86_CR0_TS)) + svm->vmcb->save.cr0 &= ~X86_CR0_TS; + svm->vcpu.fpu_active = 1; - return 1; + return 1; } -static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. */ - clear_page(vcpu->svm->vmcb); - init_vmcb(vcpu->svm->vmcb); + clear_page(svm->vmcb); + init_vmcb(svm->vmcb); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int io_get_override(struct kvm_vcpu *vcpu, - struct vmcb_seg **seg, - int *addr_override) -{ - u8 inst[MAX_INST_SIZE]; - unsigned ins_length; - gva_t rip; - int i; - - rip = vcpu->svm->vmcb->save.rip; - ins_length = vcpu->svm->next_rip - rip; - rip += vcpu->svm->vmcb->save.cs.base; - - if (ins_length > MAX_INST_SIZE) - printk(KERN_DEBUG - "%s: inst length err, cs base 0x%llx rip 0x%llx " - "next rip 0x%llx ins_length %u\n", - __FUNCTION__, - vcpu->svm->vmcb->save.cs.base, - vcpu->svm->vmcb->save.rip, - vcpu->svm->vmcb->control.exit_info_2, - ins_length); - - if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length) - /* #PF */ - return 0; - - *addr_override = 0; - *seg = NULL; - for (i = 0; i < ins_length; i++) - switch (inst[i]) { - case 0xf0: - case 0xf2: - case 0xf3: - case 0x66: - continue; - case 0x67: - *addr_override = 1; - continue; - case 0x2e: - *seg = &vcpu->svm->vmcb->save.cs; - continue; - case 0x36: - *seg = &vcpu->svm->vmcb->save.ss; - continue; - case 0x3e: - *seg = &vcpu->svm->vmcb->save.ds; - continue; - case 0x26: - *seg = &vcpu->svm->vmcb->save.es; - continue; - case 0x64: - *seg = &vcpu->svm->vmcb->save.fs; - continue; - case 0x65: - *seg = &vcpu->svm->vmcb->save.gs; - continue; - default: - return 1; - } - printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__); - return 0; -} - -static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address) +static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - unsigned long addr_mask; - unsigned long *reg; - struct vmcb_seg *seg; - int addr_override; - struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save; - u16 cs_attrib = save_area->cs.attrib; - unsigned addr_size = get_addr_size(vcpu); - - if (!io_get_override(vcpu, &seg, &addr_override)) - return 0; - - if (addr_override) - addr_size = (addr_size == 2) ? 4: (addr_size >> 1); + u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? + int size, down, in, string, rep; + unsigned port; - if (ins) { - reg = &vcpu->regs[VCPU_REGS_RDI]; - seg = &vcpu->svm->vmcb->save.es; - } else { - reg = &vcpu->regs[VCPU_REGS_RSI]; - seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds; - } + ++svm->vcpu.stat.io_exits; - addr_mask = ~0ULL >> (64 - (addr_size * 8)); + svm->next_rip = svm->vmcb->control.exit_info_2; - if ((cs_attrib & SVM_SELECTOR_L_MASK) && - !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) { - *address = (*reg & addr_mask); - return addr_mask; - } + string = (io_info & SVM_IOIO_STR_MASK) != 0; - if (!(seg->attrib & SVM_SELECTOR_P_SHIFT)) { - svm_inject_gp(vcpu, 0); - return 0; + if (string) { + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; } - *address = (*reg & addr_mask) + seg->base; - return addr_mask; -} - -static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug? - int size, down, in, string, rep; - unsigned port; - unsigned long count; - gva_t address = 0; - - ++vcpu->stat.io_exits; - - vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - string = (io_info & SVM_IOIO_STR_MASK) != 0; rep = (io_info & SVM_IOIO_REP_MASK) != 0; - count = 1; - down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; - if (string) { - unsigned addr_mask; - - addr_mask = io_adress(vcpu, in, &address); - if (!addr_mask) { - printk(KERN_DEBUG "%s: get io address failed\n", - __FUNCTION__); - return 1; - } - - if (rep) - count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; - } - return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, - address, rep, port); + return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } -static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { return 1; } -static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; - skip_emulated_instruction(vcpu); - return kvm_emulate_halt(vcpu); + svm->next_rip = svm->vmcb->save.rip + 1; + skip_emulated_instruction(&svm->vcpu); + return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; - skip_emulated_instruction(vcpu); - return kvm_hypercall(vcpu, kvm_run); + svm->next_rip = svm->vmcb->save.rip + 3; + skip_emulated_instruction(&svm->vcpu); + return kvm_hypercall(&svm->vcpu, kvm_run); } -static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int invalid_op_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { - inject_ud(vcpu); + inject_ud(&svm->vcpu); return 1; } -static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int task_switch_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { - printk(KERN_DEBUG "%s: task swiche is unsupported\n", __FUNCTION__); + pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; return 0; } -static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - kvm_emulate_cpuid(vcpu); + svm->next_rip = svm->vmcb->save.rip + 2; + kvm_emulate_cpuid(&svm->vcpu); return 1; } -static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int emulate_on_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) { - if (emulate_instruction(vcpu, NULL, 0, 0) != EMULATE_DONE) - printk(KERN_ERR "%s: failed\n", __FUNCTION__); + if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); return 1; } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { + struct vcpu_svm *svm = to_svm(vcpu); + switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: { u64 tsc; rdtscll(tsc); - *data = vcpu->svm->vmcb->control.tsc_offset + tsc; + *data = svm->vmcb->control.tsc_offset + tsc; break; } case MSR_K6_STAR: - *data = vcpu->svm->vmcb->save.star; + *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = vcpu->svm->vmcb->save.lstar; + *data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = vcpu->svm->vmcb->save.cstar; + *data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = vcpu->svm->vmcb->save.kernel_gs_base; + *data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = vcpu->svm->vmcb->save.sfmask; + *data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = vcpu->svm->vmcb->save.sysenter_cs; + *data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = vcpu->svm->vmcb->save.sysenter_eip; + *data = svm->vmcb->save.sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = vcpu->svm->vmcb->save.sysenter_esp; + *data = svm->vmcb->save.sysenter_esp; break; default: return kvm_get_msr_common(vcpu, ecx, data); @@ -1195,57 +1113,59 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) return 0; } -static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(vcpu, ecx, &data)) - svm_inject_gp(vcpu, 0); + if (svm_get_msr(&svm->vcpu, ecx, &data)) + svm_inject_gp(&svm->vcpu, 0); else { - vcpu->svm->vmcb->save.rax = data & 0xffffffff; - vcpu->regs[VCPU_REGS_RDX] = data >> 32; - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - skip_emulated_instruction(vcpu); + svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; + svm->next_rip = svm->vmcb->save.rip + 2; + skip_emulated_instruction(&svm->vcpu); } return 1; } static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) { + struct vcpu_svm *svm = to_svm(vcpu); + switch (ecx) { case MSR_IA32_TIME_STAMP_COUNTER: { u64 tsc; rdtscll(tsc); - vcpu->svm->vmcb->control.tsc_offset = data - tsc; + svm->vmcb->control.tsc_offset = data - tsc; break; } case MSR_K6_STAR: - vcpu->svm->vmcb->save.star = data; + svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - vcpu->svm->vmcb->save.lstar = data; + svm->vmcb->save.lstar = data; break; case MSR_CSTAR: - vcpu->svm->vmcb->save.cstar = data; + svm->vmcb->save.cstar = data; break; case MSR_KERNEL_GS_BASE: - vcpu->svm->vmcb->save.kernel_gs_base = data; + svm->vmcb->save.kernel_gs_base = data; break; case MSR_SYSCALL_MASK: - vcpu->svm->vmcb->save.sfmask = data; + svm->vmcb->save.sfmask = data; break; #endif case MSR_IA32_SYSENTER_CS: - vcpu->svm->vmcb->save.sysenter_cs = data; + svm->vmcb->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: - vcpu->svm->vmcb->save.sysenter_eip = data; + svm->vmcb->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: - vcpu->svm->vmcb->save.sysenter_esp = data; + svm->vmcb->save.sysenter_esp = data; break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -1253,37 +1173,39 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) return 0; } -static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = vcpu->regs[VCPU_REGS_RCX]; - u64 data = (vcpu->svm->vmcb->save.rax & -1u) - | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); - vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; - if (svm_set_msr(vcpu, ecx, data)) - svm_inject_gp(vcpu, 0); + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; + u64 data = (svm->vmcb->save.rax & -1u) + | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); + svm->next_rip = svm->vmcb->save.rip + 2; + if (svm_set_msr(&svm->vcpu, ecx, data)) + svm_inject_gp(&svm->vcpu, 0); else - skip_emulated_instruction(vcpu); + skip_emulated_instruction(&svm->vcpu); return 1; } -static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - if (vcpu->svm->vmcb->control.exit_info_1) - return wrmsr_interception(vcpu, kvm_run); + if (svm->vmcb->control.exit_info_1) + return wrmsr_interception(svm, kvm_run); else - return rdmsr_interception(vcpu, kvm_run); + return rdmsr_interception(svm, kvm_run); } -static int interrupt_window_interception(struct kvm_vcpu *vcpu, +static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* * If the user space waits to inject interrupts, exit as soon as * possible */ if (kvm_run->request_interrupt_window && - !vcpu->irq_summary) { - ++vcpu->stat.irq_window_exits; + !svm->vcpu.irq_summary) { + ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } @@ -1291,7 +1213,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu, return 1; } -static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, +static int (*svm_exit_handlers[])(struct vcpu_svm *svm, struct kvm_run *kvm_run) = { [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, @@ -1338,15 +1260,25 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, }; -static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - u32 exit_code = vcpu->svm->vmcb->control.exit_code; + struct vcpu_svm *svm = to_svm(vcpu); + u32 exit_code = svm->vmcb->control.exit_code; + + kvm_reput_irq(svm); - if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && + if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = svm->vmcb->control.exit_code; + return 0; + } + + if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", - __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, + __FUNCTION__, svm->vmcb->control.exit_int_info, exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) @@ -1356,7 +1288,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } - return svm_exit_handlers[exit_code](vcpu, kvm_run); + return svm_exit_handlers[exit_code](svm, kvm_run); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -1368,93 +1300,126 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - if (vcpu->cpu != cpu || - vcpu->svm->asid_generation != svm_data->asid_generation) - new_asid(vcpu, svm_data); + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (svm->vcpu.cpu != cpu || + svm->asid_generation != svm_data->asid_generation) + new_asid(svm, svm_data); } -static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) +static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - control = &vcpu->svm->vmcb->control; - control->int_vector = pop_irq(vcpu); + control = &svm->vmcb->control; + control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void kvm_reput_irq(struct kvm_vcpu *vcpu) +static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm_inject_irq(svm, irq); +} + +static void svm_intr_assist(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + int intr_vector = -1; + + kvm_inject_pending_timer_irqs(vcpu); + if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && + ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { + intr_vector = vmcb->control.exit_int_info & + SVM_EVTINJ_VEC_MASK; + vmcb->control.exit_int_info = 0; + svm_inject_irq(svm, intr_vector); + return; + } + + if (vmcb->control.int_ctl & V_IRQ_MASK) + return; + + if (!kvm_cpu_has_interrupt(vcpu)) + return; + + if (!(vmcb->save.rflags & X86_EFLAGS_IF) || + (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { + /* unable to deliver irq, set pending irq */ + vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); + svm_inject_irq(svm, 0x0); + return; + } + /* Okay, we can deliver the interrupt: grab it and update PIC state. */ + intr_vector = kvm_cpu_get_interrupt(vcpu); + svm_inject_irq(svm, intr_vector); + kvm_timer_intr_post(vcpu, intr_vector); +} + +static void kvm_reput_irq(struct vcpu_svm *svm) { - struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + struct vmcb_control_area *control = &svm->vmcb->control; - if (control->int_ctl & V_IRQ_MASK) { + if ((control->int_ctl & V_IRQ_MASK) + && !irqchip_in_kernel(svm->vcpu.kvm)) { control->int_ctl &= ~V_IRQ_MASK; - push_irq(vcpu, control->int_vector); + push_irq(&svm->vcpu, control->int_vector); } - vcpu->interrupt_window_open = + svm->vcpu.interrupt_window_open = !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); } +static void svm_do_inject_vector(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); + svm_inject_irq(svm, irq); +} + static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct vmcb_control_area *control = &vcpu->svm->vmcb->control; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; - vcpu->interrupt_window_open = + svm->vcpu.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && - (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); + (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (vcpu->interrupt_window_open && vcpu->irq_summary) + if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ - kvm_do_inject_irq(vcpu); + svm_do_inject_vector(svm); /* * Interrupts blocked. Wait for unblock. */ - if (!vcpu->interrupt_window_open && - (vcpu->irq_summary || kvm_run->request_interrupt_window)) { + if (!svm->vcpu.interrupt_window_open && + (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { control->intercept |= 1ULL << INTERCEPT_VINTR; } else control->intercept &= ~(1ULL << INTERCEPT_VINTR); } -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); - kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = vcpu->cr8; - kvm_run->apic_base = vcpu->apic_base; -} - -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - return (!vcpu->irq_summary && - kvm_run->request_interrupt_window && - vcpu->interrupt_window_open && - (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); -} - static void save_db_regs(unsigned long *db_regs) { asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); @@ -1476,49 +1441,37 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu) force_new_asid(vcpu); } -static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +{ +} + +static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_svm *svm = to_svm(vcpu); u16 fs_selector; u16 gs_selector; u16 ldt_selector; - int r; - -again: - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return r; - - if (!vcpu->mmio_read_completed) - do_interrupt_requests(vcpu, kvm_run); - clgi(); - - vcpu->guest_mode = 1; - if (vcpu->requests) - if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) - svm_flush_tlb(vcpu); - - pre_svm_run(vcpu); + pre_svm_run(svm); save_host_msrs(vcpu); fs_selector = read_fs(); gs_selector = read_gs(); ldt_selector = read_ldt(); - vcpu->svm->host_cr2 = kvm_read_cr2(); - vcpu->svm->host_dr6 = read_dr6(); - vcpu->svm->host_dr7 = read_dr7(); - vcpu->svm->vmcb->save.cr2 = vcpu->cr2; + svm->host_cr2 = kvm_read_cr2(); + svm->host_dr6 = read_dr6(); + svm->host_dr7 = read_dr7(); + svm->vmcb->save.cr2 = vcpu->cr2; - if (vcpu->svm->vmcb->save.dr7 & 0xff) { + if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); - save_db_regs(vcpu->svm->host_db_regs); - load_db_regs(vcpu->svm->db_regs); + save_db_regs(svm->host_db_regs); + load_db_regs(svm->db_regs); } - if (vcpu->fpu_active) { - fx_save(vcpu->host_fx_image); - fx_restore(vcpu->guest_fx_image); - } + clgi(); + + local_irq_enable(); asm volatile ( #ifdef CONFIG_X86_64 @@ -1532,34 +1485,33 @@ again: #endif #ifdef CONFIG_X86_64 - "mov %c[rbx](%[vcpu]), %%rbx \n\t" - "mov %c[rcx](%[vcpu]), %%rcx \n\t" - "mov %c[rdx](%[vcpu]), %%rdx \n\t" - "mov %c[rsi](%[vcpu]), %%rsi \n\t" - "mov %c[rdi](%[vcpu]), %%rdi \n\t" - "mov %c[rbp](%[vcpu]), %%rbp \n\t" - "mov %c[r8](%[vcpu]), %%r8 \n\t" - "mov %c[r9](%[vcpu]), %%r9 \n\t" - "mov %c[r10](%[vcpu]), %%r10 \n\t" - "mov %c[r11](%[vcpu]), %%r11 \n\t" - "mov %c[r12](%[vcpu]), %%r12 \n\t" - "mov %c[r13](%[vcpu]), %%r13 \n\t" - "mov %c[r14](%[vcpu]), %%r14 \n\t" - "mov %c[r15](%[vcpu]), %%r15 \n\t" + "mov %c[rbx](%[svm]), %%rbx \n\t" + "mov %c[rcx](%[svm]), %%rcx \n\t" + "mov %c[rdx](%[svm]), %%rdx \n\t" + "mov %c[rsi](%[svm]), %%rsi \n\t" + "mov %c[rdi](%[svm]), %%rdi \n\t" + "mov %c[rbp](%[svm]), %%rbp \n\t" + "mov %c[r8](%[svm]), %%r8 \n\t" + "mov %c[r9](%[svm]), %%r9 \n\t" + "mov %c[r10](%[svm]), %%r10 \n\t" + "mov %c[r11](%[svm]), %%r11 \n\t" + "mov %c[r12](%[svm]), %%r12 \n\t" + "mov %c[r13](%[svm]), %%r13 \n\t" + "mov %c[r14](%[svm]), %%r14 \n\t" + "mov %c[r15](%[svm]), %%r15 \n\t" #else - "mov %c[rbx](%[vcpu]), %%ebx \n\t" - "mov %c[rcx](%[vcpu]), %%ecx \n\t" - "mov %c[rdx](%[vcpu]), %%edx \n\t" - "mov %c[rsi](%[vcpu]), %%esi \n\t" - "mov %c[rdi](%[vcpu]), %%edi \n\t" - "mov %c[rbp](%[vcpu]), %%ebp \n\t" + "mov %c[rbx](%[svm]), %%ebx \n\t" + "mov %c[rcx](%[svm]), %%ecx \n\t" + "mov %c[rdx](%[svm]), %%edx \n\t" + "mov %c[rsi](%[svm]), %%esi \n\t" + "mov %c[rdi](%[svm]), %%edi \n\t" + "mov %c[rbp](%[svm]), %%ebp \n\t" #endif #ifdef CONFIG_X86_64 /* Enter guest mode */ "push %%rax \n\t" - "mov %c[svm](%[vcpu]), %%rax \n\t" - "mov %c[vmcb](%%rax), %%rax \n\t" + "mov %c[vmcb](%[svm]), %%rax \n\t" SVM_VMLOAD "\n\t" SVM_VMRUN "\n\t" SVM_VMSAVE "\n\t" @@ -1567,8 +1519,7 @@ again: #else /* Enter guest mode */ "push %%eax \n\t" - "mov %c[svm](%[vcpu]), %%eax \n\t" - "mov %c[vmcb](%%eax), %%eax \n\t" + "mov %c[vmcb](%[svm]), %%eax \n\t" SVM_VMLOAD "\n\t" SVM_VMRUN "\n\t" SVM_VMSAVE "\n\t" @@ -1577,73 +1528,69 @@ again: /* Save guest registers, load host registers */ #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[vcpu]) \n\t" - "mov %%rcx, %c[rcx](%[vcpu]) \n\t" - "mov %%rdx, %c[rdx](%[vcpu]) \n\t" - "mov %%rsi, %c[rsi](%[vcpu]) \n\t" - "mov %%rdi, %c[rdi](%[vcpu]) \n\t" - "mov %%rbp, %c[rbp](%[vcpu]) \n\t" - "mov %%r8, %c[r8](%[vcpu]) \n\t" - "mov %%r9, %c[r9](%[vcpu]) \n\t" - "mov %%r10, %c[r10](%[vcpu]) \n\t" - "mov %%r11, %c[r11](%[vcpu]) \n\t" - "mov %%r12, %c[r12](%[vcpu]) \n\t" - "mov %%r13, %c[r13](%[vcpu]) \n\t" - "mov %%r14, %c[r14](%[vcpu]) \n\t" - "mov %%r15, %c[r15](%[vcpu]) \n\t" + "mov %%rbx, %c[rbx](%[svm]) \n\t" + "mov %%rcx, %c[rcx](%[svm]) \n\t" + "mov %%rdx, %c[rdx](%[svm]) \n\t" + "mov %%rsi, %c[rsi](%[svm]) \n\t" + "mov %%rdi, %c[rdi](%[svm]) \n\t" + "mov %%rbp, %c[rbp](%[svm]) \n\t" + "mov %%r8, %c[r8](%[svm]) \n\t" + "mov %%r9, %c[r9](%[svm]) \n\t" + "mov %%r10, %c[r10](%[svm]) \n\t" + "mov %%r11, %c[r11](%[svm]) \n\t" + "mov %%r12, %c[r12](%[svm]) \n\t" + "mov %%r13, %c[r13](%[svm]) \n\t" + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" "pop %%rbp; pop %%rdi; pop %%rsi;" "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" #else - "mov %%ebx, %c[rbx](%[vcpu]) \n\t" - "mov %%ecx, %c[rcx](%[vcpu]) \n\t" - "mov %%edx, %c[rdx](%[vcpu]) \n\t" - "mov %%esi, %c[rsi](%[vcpu]) \n\t" - "mov %%edi, %c[rdi](%[vcpu]) \n\t" - "mov %%ebp, %c[rbp](%[vcpu]) \n\t" + "mov %%ebx, %c[rbx](%[svm]) \n\t" + "mov %%ecx, %c[rcx](%[svm]) \n\t" + "mov %%edx, %c[rdx](%[svm]) \n\t" + "mov %%esi, %c[rsi](%[svm]) \n\t" + "mov %%edi, %c[rdi](%[svm]) \n\t" + "mov %%ebp, %c[rbp](%[svm]) \n\t" "pop %%ebp; pop %%edi; pop %%esi;" "pop %%edx; pop %%ecx; pop %%ebx; \n\t" #endif : - : [vcpu]"a"(vcpu), - [svm]"i"(offsetof(struct kvm_vcpu, svm)), + : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) + [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) #ifdef CONFIG_X86_64 - ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), - [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), - [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) + ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), + [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) #endif : "cc", "memory" ); - vcpu->guest_mode = 0; + local_irq_disable(); - if (vcpu->fpu_active) { - fx_save(vcpu->guest_fx_image); - fx_restore(vcpu->host_fx_image); - } + stgi(); - if ((vcpu->svm->vmcb->save.dr7 & 0xff)) - load_db_regs(vcpu->svm->host_db_regs); + if ((svm->vmcb->save.dr7 & 0xff)) + load_db_regs(svm->host_db_regs); - vcpu->cr2 = vcpu->svm->vmcb->save.cr2; + vcpu->cr2 = svm->vmcb->save.cr2; - write_dr6(vcpu->svm->host_dr6); - write_dr7(vcpu->svm->host_dr7); - kvm_write_cr2(vcpu->svm->host_cr2); + write_dr6(svm->host_dr6); + write_dr7(svm->host_dr7); + kvm_write_cr2(svm->host_cr2); load_fs(fs_selector); load_gs(gs_selector); @@ -1652,57 +1599,19 @@ again: reload_tss(vcpu); - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, - (void *)(unsigned long)vcpu->svm->vmcb->save.rip); - - stgi(); - - kvm_reput_irq(vcpu); - - vcpu->svm->next_rip = 0; - - if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason - = vcpu->svm->vmcb->control.exit_code; - post_kvm_run_save(vcpu, kvm_run); - return 0; - } - - r = handle_exit(vcpu, kvm_run); - if (r > 0) { - if (signal_pending(current)) { - ++vcpu->stat.signal_exits; - post_kvm_run_save(vcpu, kvm_run); - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - ++vcpu->stat.request_irq_exits; - post_kvm_run_save(vcpu, kvm_run); - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; - } - kvm_resched(vcpu); - goto again; - } - post_kvm_run_save(vcpu, kvm_run); - return r; + svm->next_rip = 0; } static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { - vcpu->svm->vmcb->save.cr3 = root; + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.cr3 = root; force_new_asid(vcpu); if (vcpu->fpu_active) { - vcpu->svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - vcpu->svm->vmcb->save.cr0 |= CR0_TS_MASK; + svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + svm->vmcb->save.cr0 |= X86_CR0_TS; vcpu->fpu_active = 0; } } @@ -1711,26 +1620,27 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, uint32_t err_code) { - uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; + struct vcpu_svm *svm = to_svm(vcpu); + uint32_t exit_int_info = svm->vmcb->control.exit_int_info; ++vcpu->stat.pf_guest; if (is_page_fault(exit_int_info)) { - vcpu->svm->vmcb->control.event_inj_err = 0; - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - DF_VECTOR; + svm->vmcb->control.event_inj_err = 0; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; return; } vcpu->cr2 = addr; - vcpu->svm->vmcb->save.cr2 = addr; - vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | - SVM_EVTINJ_VALID_ERR | - SVM_EVTINJ_TYPE_EXEPT | - PF_VECTOR; - vcpu->svm->vmcb->control.event_inj_err = err_code; + svm->vmcb->save.cr2 = addr; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + svm->vmcb->control.event_inj_err = err_code; } @@ -1757,17 +1667,25 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[3] = 0xc3; } -static struct kvm_arch_ops svm_arch_ops = { +static void svm_check_processor_compat(void *rtn) +{ + *(int *)rtn = 0; +} + +static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, .hardware_unsetup = svm_hardware_unsetup, + .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, + .vcpu_reset = svm_vcpu_reset, + .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, .vcpu_decache = svm_vcpu_decache, @@ -1778,7 +1696,7 @@ static struct kvm_arch_ops svm_arch_ops = { .get_segment_base = svm_get_segment_base, .get_segment = svm_get_segment, .set_segment = svm_set_segment, - .get_cs_db_l_bits = svm_get_cs_db_l_bits, + .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -1795,26 +1713,30 @@ static struct kvm_arch_ops svm_arch_ops = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .invlpg = svm_invlpg, .tlb_flush = svm_flush_tlb, .inject_page_fault = svm_inject_page_fault, .inject_gp = svm_inject_gp, .run = svm_vcpu_run, + .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, - .vcpu_setup = svm_vcpu_setup, .patch_hypercall = svm_patch_hypercall, + .get_irq = svm_get_irq, + .set_irq = svm_set_irq, + .inject_pending_irq = svm_intr_assist, + .inject_pending_vectors = do_interrupt_requests, }; static int __init svm_init(void) { - return kvm_init_arch(&svm_arch_ops, THIS_MODULE); + return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), + THIS_MODULE); } static void __exit svm_exit(void) { - kvm_exit_arch(); + kvm_exit_x86(); } module_init(svm_init) diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 80628f69916d..4f115a8e45ef 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -16,6 +16,8 @@ */ #include "kvm.h" +#include "x86_emulate.h" +#include "irq.h" #include "vmx.h" #include "segment_descriptor.h" @@ -23,7 +25,6 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/profile.h> #include <linux/sched.h> #include <asm/io.h> @@ -32,6 +33,39 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + int launched; + u8 fail; + struct kvm_msr_entry *guest_msrs; + struct kvm_msr_entry *host_msrs; + int nmsrs; + int save_nmsrs; + int msr_offset_efer; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif + struct vmcs *vmcs; + struct { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int gs_ldt_reload_needed; + int fs_reload_needed; + }host_state; + +}; + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + static int init_rmode_tss(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -40,18 +74,17 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; -#ifdef CONFIG_X86_64 -#define HOST_IS_64 1 -#else -#define HOST_IS_64 0 -#endif #define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) -static struct vmcs_descriptor { +static struct vmcs_config { int size; int order; u32 revision_id; -} vmcs_descriptor; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; +} vmcs_config; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -89,16 +122,32 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr) +static void load_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} + +static void save_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} + +static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) { return (u64)msr.data & EFER_SAVE_RESTORE_BITS; } -static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu) +static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) { - int efer_offset = vcpu->msr_offset_efer; - return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) != - msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]); + int efer_offset = vmx->msr_offset_efer; + return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); } static inline int is_page_fault(u32 intr_info) @@ -121,23 +170,33 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) +static inline int cpu_has_vmx_tpr_shadow(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); +} + +static inline int vm_need_tpr_shadow(struct kvm *kvm) +{ + return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); +} + +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vcpu->nmsrs; ++i) - if (vcpu->guest_msrs[i].index == msr) + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx->guest_msrs[i].index == msr) return i; return -1; } -static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) +static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vcpu, msr); + i = __find_msr_index(vmx, msr); if (i >= 0) - return &vcpu->guest_msrs[i]; + return &vmx->guest_msrs[i]; return NULL; } @@ -156,23 +215,24 @@ static void vmcs_clear(struct vmcs *vmcs) static void __vcpu_clear(void *arg) { - struct kvm_vcpu *vcpu = arg; + struct vcpu_vmx *vmx = arg; int cpu = raw_smp_processor_id(); - if (vcpu->cpu == cpu) - vmcs_clear(vcpu->vmcs); - if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) + if (vmx->vcpu.cpu == cpu) + vmcs_clear(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vcpu->host_tsc); + rdtscll(vmx->vcpu.host_tsc); } -static void vcpu_clear(struct kvm_vcpu *vcpu) +static void vcpu_clear(struct vcpu_vmx *vmx) { - if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1) - smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1); + if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, + vmx, 0, 1); else - __vcpu_clear(vcpu); - vcpu->launched = 0; + __vcpu_clear(vmx); + vmx->launched = 0; } static unsigned long vmcs_readl(unsigned long field) @@ -282,121 +342,122 @@ static void reload_tss(void) #endif } -static void load_transition_efer(struct kvm_vcpu *vcpu) +static void load_transition_efer(struct vcpu_vmx *vmx) { u64 trans_efer; - int efer_offset = vcpu->msr_offset_efer; + int efer_offset = vmx->msr_offset_efer; - trans_efer = vcpu->host_msrs[efer_offset].data; + trans_efer = vmx->host_msrs[efer_offset].data; trans_efer &= ~EFER_SAVE_RESTORE_BITS; - trans_efer |= msr_efer_save_restore_bits( - vcpu->guest_msrs[efer_offset]); + trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); wrmsrl(MSR_EFER, trans_efer); - vcpu->stat.efer_reload++; + vmx->vcpu.stat.efer_reload++; } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { - struct vmx_host_state *hs = &vcpu->vmx_host_state; + struct vcpu_vmx *vmx = to_vmx(vcpu); - if (hs->loaded) + if (vmx->host_state.loaded) return; - hs->loaded = 1; + vmx->host_state.loaded = 1; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - hs->ldt_sel = read_ldt(); - hs->fs_gs_ldt_reload_needed = hs->ldt_sel; - hs->fs_sel = read_fs(); - if (!(hs->fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); - else { + vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = read_fs(); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { vmcs_write16(HOST_FS_SELECTOR, 0); - hs->fs_gs_ldt_reload_needed = 1; + vmx->host_state.fs_reload_needed = 1; } - hs->gs_sel = read_gs(); - if (!(hs->gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); + vmx->host_state.gs_sel = read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { vmcs_write16(HOST_GS_SELECTOR, 0); - hs->fs_gs_ldt_reload_needed = 1; + vmx->host_state.gs_ldt_reload_needed = 1; } #ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); #else - vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { - save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); + if (is_long_mode(&vmx->vcpu)) { + save_msrs(vmx->host_msrs + + vmx->msr_offset_kernel_gs_base, 1); } #endif - load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); - if (msr_efer_need_save_restore(vcpu)) - load_transition_efer(vcpu); + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + if (msr_efer_need_save_restore(vmx)) + load_transition_efer(vmx); } -static void vmx_load_host_state(struct kvm_vcpu *vcpu) +static void vmx_load_host_state(struct vcpu_vmx *vmx) { - struct vmx_host_state *hs = &vcpu->vmx_host_state; + unsigned long flags; - if (!hs->loaded) + if (!vmx->host_state.loaded) return; - hs->loaded = 0; - if (hs->fs_gs_ldt_reload_needed) { - load_ldt(hs->ldt_sel); - load_fs(hs->fs_sel); + vmx->host_state.loaded = 0; + if (vmx->host_state.fs_reload_needed) + load_fs(vmx->host_state.fs_sel); + if (vmx->host_state.gs_ldt_reload_needed) { + load_ldt(vmx->host_state.ldt_sel); /* * If we have to reload gs, we must take care to * preserve our gs base. */ - local_irq_disable(); - load_gs(hs->gs_sel); + local_irq_save(flags); + load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif - local_irq_enable(); - - reload_tss(); + local_irq_restore(flags); } - save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); - load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); - if (msr_efer_need_save_restore(vcpu)) - load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1); + reload_tss(); + save_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_msrs(vmx->host_msrs, vmx->save_nmsrs); + if (msr_efer_need_save_restore(vmx)) + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); } /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu) +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - u64 phys_addr = __pa(vcpu->vmcs); - int cpu; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta; - cpu = get_cpu(); - - if (vcpu->cpu != cpu) - vcpu_clear(vcpu); + if (vcpu->cpu != cpu) { + vcpu_clear(vmx); + kvm_migrate_apic_timer(vcpu); + } - if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { + if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { u8 error; - per_cpu(current_vmcs, cpu) = vcpu->vmcs; + per_cpu(current_vmcs, cpu) = vmx->vmcs; asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc"); if (error) printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vcpu->vmcs, phys_addr); + vmx->vmcs, phys_addr); } if (vcpu->cpu != cpu) { @@ -426,9 +487,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(vcpu); + vmx_load_host_state(to_vmx(vcpu)); kvm_put_guest_fpu(vcpu); - put_cpu(); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -436,9 +496,9 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); - if (vcpu->cr0 & CR0_TS_MASK) - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); + if (vcpu->cr0 & X86_CR0_TS) + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -447,13 +507,13 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) return; vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) { - vcpu_clear(vcpu); + vcpu_clear(to_vmx(vcpu)); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -501,59 +561,62 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) /* * Swap MSR entry in host/guest MSR entry array. */ -void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) +#ifdef CONFIG_X86_64 +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct vmx_msr_entry tmp; - tmp = vcpu->guest_msrs[to]; - vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; - vcpu->guest_msrs[from] = tmp; - tmp = vcpu->host_msrs[to]; - vcpu->host_msrs[to] = vcpu->host_msrs[from]; - vcpu->host_msrs[from] = tmp; + struct kvm_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; + tmp = vmx->host_msrs[to]; + vmx->host_msrs[to] = vmx->host_msrs[from]; + vmx->host_msrs[from] = tmp; } +#endif /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy * mode, as fiddling with msrs is very expensive. */ -static void setup_msrs(struct kvm_vcpu *vcpu) +static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs; save_nmsrs = 0; #ifdef CONFIG_X86_64 - if (is_long_mode(vcpu)) { + if (is_long_mode(&vmx->vcpu)) { int index; - index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_LSTAR); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_CSTAR); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); - index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); if (index >= 0) - move_msr_up(vcpu, index, save_nmsrs++); + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vcpu, MSR_K6_STAR); - if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) - move_msr_up(vcpu, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); } #endif - vcpu->save_nmsrs = save_nmsrs; + vmx->save_nmsrs = save_nmsrs; #ifdef CONFIG_X86_64 - vcpu->msr_offset_kernel_gs_base = - __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); + vmx->msr_offset_kernel_gs_base = + __find_msr_index(vmx, MSR_KERNEL_GS_BASE); #endif - vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER); + vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); } /* @@ -589,7 +652,7 @@ static void guest_write_tsc(u64 guest_tsc) static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct vmx_msr_entry *msr; + struct kvm_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -620,7 +683,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: - msr = find_msr_entry(vcpu, msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; break; @@ -639,15 +702,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) */ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - struct vmx_msr_entry *msr; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr; int ret = 0; switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vcpu->vmx_host_state.loaded) - load_transition_efer(vcpu); + if (vmx->host_state.loaded) + load_transition_efer(vmx); break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -669,11 +733,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) guest_write_tsc(data); break; default: - msr = find_msr_entry(vcpu, msr_index); + msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vcpu->vmx_host_state.loaded) - load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); + if (vmx->host_state.loaded) + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); @@ -740,6 +804,20 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) return 0; } +static int vmx_get_irq(struct kvm_vcpu *vcpu) +{ + u32 idtv_info_field; + + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (idtv_info_field & INTR_INFO_VALID_MASK) { + if (is_external_interrupt(idtv_info_field)) + return idtv_info_field & VECTORING_INFO_VECTOR_MASK; + else + printk("pending exception: not handled yet\n"); + } + return -1; +} + static __init int cpu_has_kvm_support(void) { unsigned long ecx = cpuid_ecx(1); @@ -751,7 +829,10 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & 5) == 1; /* locked but not enabled */ + return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + == MSR_IA32_FEATURE_CONTROL_LOCKED; + /* locked but not enabled */ } static void hardware_enable(void *garbage) @@ -761,10 +842,15 @@ static void hardware_enable(void *garbage) u64 old; rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & 5) != 5) + if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + != (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); - write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | + MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); } @@ -774,14 +860,102 @@ static void hardware_disable(void *garbage) asm volatile (ASM_VMX_VMXOFF : : : "cc"); } -static __init void setup_vmcs_descriptor(void) +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32* result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; + u32 min, opt; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + min = CPU_BASED_HLT_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING; +#ifdef CONFIG_X86_64 + opt = CPU_BASED_TPR_SHADOW; +#else + opt = 0; +#endif + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; +#ifdef CONFIG_X86_64 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & + ~CPU_BASED_CR8_STORE_EXITING; +#endif + + min = 0; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - vmcs_descriptor.size = vmx_msr_high & 0x1fff; - vmcs_descriptor.order = get_order(vmcs_descriptor.size); - vmcs_descriptor.revision_id = vmx_msr_low; + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + return 0; } static struct vmcs *alloc_vmcs_cpu(int cpu) @@ -790,12 +964,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_descriptor.size); - vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ + memset(vmcs, 0, vmcs_config.size); + vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ return vmcs; } @@ -806,7 +980,7 @@ static struct vmcs *alloc_vmcs(void) static void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_descriptor.order); + free_pages((unsigned long)vmcs, vmcs_config.order); } static void free_kvm_area(void) @@ -817,8 +991,6 @@ static void free_kvm_area(void) free_vmcs(per_cpu(vmxarea, cpu)); } -extern struct vmcs *alloc_vmcs_cpu(int cpu); - static __init int alloc_kvm_area(void) { int cpu; @@ -839,7 +1011,8 @@ static __init int alloc_kvm_area(void) static __init int hardware_setup(void) { - setup_vmcs_descriptor(); + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; return alloc_kvm_area(); } @@ -879,8 +1052,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | - (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); update_exception_bitmap(vcpu); @@ -897,7 +1070,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); } -static int rmode_tss_base(struct kvm* kvm) +static gva_t rmode_tss_base(struct kvm* kvm) { gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; @@ -937,7 +1110,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) flags |= IOPL_MASK | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); @@ -975,10 +1148,10 @@ static void enter_lmode(struct kvm_vcpu *vcpu) vcpu->shadow_efer |= EFER_LMA; - find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; + find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) - | VM_ENTRY_CONTROLS_IA32E_MASK); + | VM_ENTRY_IA32E_MODE); } static void exit_lmode(struct kvm_vcpu *vcpu) @@ -987,7 +1160,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_CONTROLS_IA32E_MASK); + & ~VM_ENTRY_IA32E_MODE); } #endif @@ -1002,17 +1175,17 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { vmx_fpu_deactivate(vcpu); - if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) + if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 if (vcpu->shadow_efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif @@ -1022,14 +1195,14 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); vcpu->cr0 = cr0; - if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) + if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); } static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - if (vcpu->cr0 & CR0_PE_MASK) + if (vcpu->cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } @@ -1045,23 +1218,24 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->shadow_efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_CONTROLS_IA32E_MASK); + VM_ENTRY_IA32E_MODE); msr->data = efer; } else { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_CONTROLS_IA32E_MASK); + ~VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } - setup_msrs(vcpu); + setup_msrs(vmx); } #endif @@ -1210,17 +1384,6 @@ static int init_rmode_tss(struct kvm* kvm) return 1; } -static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) -{ - u32 msr_high, msr_low; - - rdmsr(msr, msr_low, msr_high); - - val &= msr_high; - val |= msr_low; - vmcs_write32(vmcs_field, val); -} - static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1234,7 +1397,7 @@ static void seg_setup(int seg) /* * Sets up the vmcs for emulated real mode. */ -static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) +static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs; u32 junk; @@ -1243,27 +1406,36 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) int i; int ret = 0; unsigned long kvm_vmx_return; + u64 msr; + u32 exec_control; - if (!init_rmode_tss(vcpu->kvm)) { + if (!init_rmode_tss(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; } - memset(vcpu->regs, 0, sizeof(vcpu->regs)); - vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); - vcpu->cr8 = 0; - vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vcpu == &vcpu->kvm->vcpus[0]) - vcpu->apic_base |= MSR_IA32_APICBASE_BSP; + vmx->vcpu.rmode.active = 0; - fx_init(vcpu); + vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + fx_init(&vmx->vcpu); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); + if (vmx->vcpu.vcpu_id == 0) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); + } vmcs_write32(GUEST_CS_LIMIT, 0xffff); vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); @@ -1288,7 +1460,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - vmcs_writel(GUEST_RIP, 0xfff0); + if (vmx->vcpu.vcpu_id == 0) + vmcs_writel(GUEST_RIP, 0xfff0); + else + vmcs_writel(GUEST_RIP, 0); vmcs_writel(GUEST_RSP, 0); //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 @@ -1316,20 +1491,18 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); /* Control */ - vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS, - PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ - | PIN_BASED_NMI_EXITING /* 20.6.1 */ - ); - vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS, - CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_HLT_EXITING /* 20.6.2 */ - | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ - | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ - | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ - | CPU_BASED_MOV_DR_EXITING - | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ - ); + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + + exec_control = vmcs_config.cpu_based_exec_ctrl; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); @@ -1377,46 +1550,48 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) u32 index = vmx_msr_index[i]; u32 data_low, data_high; u64 data; - int j = vcpu->nmsrs; + int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vcpu->host_msrs[j].index = index; - vcpu->host_msrs[j].reserved = 0; - vcpu->host_msrs[j].data = data; - vcpu->guest_msrs[j] = vcpu->host_msrs[j]; - ++vcpu->nmsrs; + vmx->host_msrs[j].index = index; + vmx->host_msrs[j].reserved = 0; + vmx->host_msrs[j].data = data; + vmx->guest_msrs[j] = vmx->host_msrs[j]; + ++vmx->nmsrs; } - setup_msrs(vcpu); + setup_msrs(vmx); - vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, - (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ - vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_CONTROLS, 0); + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ #ifdef CONFIG_X86_64 - vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); - vmcs_writel(TPR_THRESHOLD, 0); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); #endif vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vcpu->cr0 = 0x60000010; - vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode - vmx_set_cr4(vcpu, 0); + vmx->vcpu.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode + vmx_set_cr4(&vmx->vcpu, 0); #ifdef CONFIG_X86_64 - vmx_set_efer(vcpu, 0); + vmx_set_efer(&vmx->vcpu, 0); #endif - vmx_fpu_activate(vcpu); - update_exception_bitmap(vcpu); + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); return 0; @@ -1424,6 +1599,13 @@ out: return ret; } +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx_vcpu_setup(vmx); +} + static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) { u16 ent[2]; @@ -1443,8 +1625,8 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) return; } - if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != - sizeof(ent)) { + if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != + X86EMUL_CONTINUE) { vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); return; } @@ -1454,9 +1636,9 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) ip = vmcs_readl(GUEST_RIP); - if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || - kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || - kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { + if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); return; } @@ -1469,6 +1651,16 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); } +static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +{ + if (vcpu->rmode.active) { + inject_rmode_irq(vcpu, irq); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->irq_summary); @@ -1478,13 +1670,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->irq_pending[word_index]); if (!vcpu->irq_pending[word_index]) clear_bit(word_index, &vcpu->irq_summary); - - if (vcpu->rmode.active) { - inject_rmode_irq(vcpu, irq); - return; - } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); + vmx_inject_irq(vcpu, irq); } @@ -1568,7 +1754,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); } - if (is_external_interrupt(vect_info)) { + if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; set_bit(irq, vcpu->irq_pending); set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); @@ -1591,29 +1777,28 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); - spin_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->lock); r = kvm_mmu_page_fault(vcpu, cr2, error_code); if (r < 0) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return r; } if (!r) { - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); return 1; } er = emulate_instruction(vcpu, kvm_run, cr2, error_code); - spin_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->lock); switch (er) { case EMULATE_DONE: return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; case EMULATE_FAIL: - vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); + kvm_report_emulation_failure(vcpu, "pagetable"); break; default: BUG(); @@ -1653,80 +1838,29 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } -static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count) -{ - u64 inst; - gva_t rip; - int countr_size; - int i, n; - - if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { - countr_size = 2; - } else { - u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); - - countr_size = (cs_ar & AR_L_MASK) ? 8: - (cs_ar & AR_DB_MASK) ? 4: 2; - } - - rip = vmcs_readl(GUEST_RIP); - if (countr_size != 8) - rip += vmcs_readl(GUEST_CS_BASE); - - n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); - - for (i = 0; i < n; i++) { - switch (((u8*)&inst)[i]) { - case 0xf0: - case 0xf2: - case 0xf3: - case 0x2e: - case 0x36: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x66: - break; - case 0x67: - countr_size = (countr_size == 2) ? 4: (countr_size >> 1); - default: - goto done; - } - } - return 0; -done: - countr_size *= 8; - *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); - //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]); - return 1; -} - static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; int size, down, in, string, rep; unsigned port; - unsigned long count; - gva_t address; ++vcpu->stat.io_exits; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - in = (exit_qualification & 8) != 0; - size = (exit_qualification & 7) + 1; + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; + + if (string) { + if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; - count = 1; rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; - address = 0; - if (string) { - if (rep && !get_io_count(vcpu, &count)) - return 1; - address = vmcs_readl(GUEST_LINEAR_ADDRESS); - } - return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down, - address, rep, port); + + return kvm_emulate_pio(vcpu, kvm_run, in, size, port); } static void @@ -1743,11 +1877,11 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; int cr; int reg; - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { @@ -1772,13 +1906,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load_rsp_rip(vcpu); set_cr8(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); - return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; }; break; case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); - vcpu->cr0 &= ~CR0_TS_MASK; + vcpu->cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); @@ -1793,7 +1928,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->regs[reg] = vcpu->cr8; + vcpu->regs[reg] = get_cr8(vcpu); vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1808,14 +1943,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) break; } kvm_run->exit_reason = 0; - printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", + pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u64 exit_qualification; + unsigned long exit_qualification; unsigned long val; int dr, reg; @@ -1823,7 +1958,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * FIXME: this code assumes the host is debugging the guest. * need to deal with guest debugging itself too. */ - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; vcpu_load_rsp_rip(vcpu); @@ -1886,19 +2021,21 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) { - kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = vcpu->cr8; - kvm_run->apic_base = vcpu->apic_base; - kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open && - vcpu->irq_summary == 0); + return 1; } static int handle_interrupt_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + u32 cpu_based_vm_exec_control; + + /* clear pending irq */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -1943,6 +2080,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold }; static const int kvm_vmx_max_exit_handlers = @@ -1956,6 +2094,14 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (unlikely(vmx->fail)) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && exit_reason != EXIT_REASON_EXCEPTION_NMI ) @@ -1971,57 +2117,91 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - return (!vcpu->irq_summary && - kvm_run->request_interrupt_window && - vcpu->interrupt_window_open && - (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void update_tpr_threshold(struct kvm_vcpu *vcpu) { + int max_irr, tpr; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + if (!kvm_lapic_enabled(vcpu) || + ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; + vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); } -static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void enable_irq_window(struct kvm_vcpu *vcpu) { - u8 fail; - int r; + u32 cpu_based_vm_exec_control; -preempted: - if (vcpu->guest_debug.enabled) - kvm_guest_debug_pre(vcpu); + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} -again: - if (!vcpu->mmio_read_completed) - do_interrupt_requests(vcpu, kvm_run); +static void vmx_intr_assist(struct kvm_vcpu *vcpu) +{ + u32 idtv_info_field, intr_info_field; + int has_ext_irq, interrupt_window_open; + int vector; - vmx_save_host_state(vcpu); - kvm_load_guest_fpu(vcpu); + kvm_inject_pending_timer_irqs(vcpu); + update_tpr_threshold(vcpu); - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; + has_ext_irq = kvm_cpu_has_interrupt(vcpu); + intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (intr_info_field & INTR_INFO_VALID_MASK) { + if (idtv_info_field & INTR_INFO_VALID_MASK) { + /* TODO: fault when IDT_Vectoring */ + printk(KERN_ERR "Fault when IDT_Vectoring\n"); + } + if (has_ext_irq) + enable_irq_window(vcpu); + return; + } + if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + + if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + if (unlikely(has_ext_irq)) + enable_irq_window(vcpu); + return; + } + if (!has_ext_irq) + return; + interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + if (interrupt_window_open) { + vector = kvm_cpu_get_interrupt(vcpu); + vmx_inject_irq(vcpu, vector); + kvm_timer_intr_post(vcpu, vector); + } else + enable_irq_window(vcpu); +} + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Loading guest fpu may have cleared host cr0.ts */ vmcs_writel(HOST_CR0, read_cr0()); - local_irq_disable(); - - vcpu->guest_mode = 1; - if (vcpu->requests) - if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) - vmx_flush_tlb(vcpu); - asm ( /* Store host registers */ #ifdef CONFIG_X86_64 @@ -2115,8 +2295,8 @@ again: "pop %%ecx; popa \n\t" #endif "setbe %0 \n\t" - : "=q" (fail) - : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), + : "=q" (vmx->fail) + : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), "c"(vcpu), [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), @@ -2138,59 +2318,10 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); - vcpu->guest_mode = 0; - local_irq_enable(); - - ++vcpu->stat.exits; - vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - - if (unlikely(fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); - r = 0; - goto out; - } - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - - vcpu->launched = 1; - r = kvm_handle_exit(kvm_run, vcpu); - if (r > 0) { - /* Give scheduler a change to reschedule. */ - if (signal_pending(current)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) { - ++vcpu->stat.light_exits; - goto again; - } - } - -out: - if (r > 0) { - kvm_resched(vcpu); - goto preempted; - } - - post_kvm_run_save(vcpu, kvm_run); - return r; + vmx->launched = 1; } static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, @@ -2225,67 +2356,118 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { - if (vcpu->vmcs) { - on_each_cpu(__vcpu_clear, vcpu, 0, 1); - free_vmcs(vcpu->vmcs); - vcpu->vmcs = NULL; + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->vmcs) { + on_each_cpu(__vcpu_clear, vmx, 0, 1); + free_vmcs(vmx->vmcs); + vmx->vmcs = NULL; } } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx_free_vmcs(vcpu); + kfree(vmx->host_msrs); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vmx); } -static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { - struct vmcs *vmcs; + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + int cpu; - vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->guest_msrs) - return -ENOMEM; + if (!vmx) + return ERR_PTR(-ENOMEM); - vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vcpu->host_msrs) - goto out_free_guest_msrs; + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; - vmcs = alloc_vmcs(); - if (!vmcs) - goto out_free_msrs; + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&vmx->vcpu); + if (err < 0) + goto free_vcpu; + } - vmcs_clear(vmcs); - vcpu->vmcs = vmcs; - vcpu->launched = 0; + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->guest_msrs) { + err = -ENOMEM; + goto uninit_vcpu; + } - return 0; + vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->host_msrs) + goto free_guest_msrs; -out_free_msrs: - kfree(vcpu->host_msrs); - vcpu->host_msrs = NULL; + vmx->vmcs = alloc_vmcs(); + if (!vmx->vmcs) + goto free_msrs; -out_free_guest_msrs: - kfree(vcpu->guest_msrs); - vcpu->guest_msrs = NULL; + vmcs_clear(vmx->vmcs); - return -ENOMEM; + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + err = vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (err) + goto free_vmcs; + + return &vmx->vcpu; + +free_vmcs: + free_vmcs(vmx->vmcs); +free_msrs: + kfree(vmx->host_msrs); +free_guest_msrs: + kfree(vmx->guest_msrs); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vmx); + return ERR_PTR(err); +} + +static void __init vmx_check_processor_compat(void *rtn) +{ + struct vmcs_config vmcs_conf; + + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf) < 0) + *(int *)rtn = -EIO; + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; + } } -static struct kvm_arch_ops vmx_arch_ops = { +static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, .hardware_unsetup = hardware_unsetup, + .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + .prepare_guest_switch = vmx_save_host_state, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, .vcpu_decache = vmx_vcpu_decache, .set_guest_debug = set_guest_debug, + .guest_debug_pre = kvm_guest_debug_pre, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -2314,9 +2496,13 @@ static struct kvm_arch_ops vmx_arch_ops = { .inject_gp = vmx_inject_gp, .run = vmx_vcpu_run, + .handle_exit = kvm_handle_exit, .skip_emulated_instruction = skip_emulated_instruction, - .vcpu_setup = vmx_vcpu_setup, .patch_hypercall = vmx_patch_hypercall, + .get_irq = vmx_get_irq, + .set_irq = vmx_inject_irq, + .inject_pending_irq = vmx_intr_assist, + .inject_pending_vectors = do_interrupt_requests, }; static int __init vmx_init(void) @@ -2347,7 +2533,7 @@ static int __init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); - r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); + r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; @@ -2365,7 +2551,7 @@ static void __exit vmx_exit(void) __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a); - kvm_exit_arch(); + kvm_exit_x86(); } module_init(vmx_init) diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h index d0dc93df411b..fd4e14666088 100644 --- a/drivers/kvm/vmx.h +++ b/drivers/kvm/vmx.h @@ -25,29 +25,36 @@ * */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 -#define CPU_BASED_HLT_EXITING 0x00000080 -#define CPU_BASED_INVDPG_EXITING 0x00000200 -#define CPU_BASED_MWAIT_EXITING 0x00000400 -#define CPU_BASED_RDPMC_EXITING 0x00000800 -#define CPU_BASED_RDTSC_EXITING 0x00001000 -#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 -#define CPU_BASED_CR8_STORE_EXITING 0x00100000 -#define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_MOV_DR_EXITING 0x00800000 -#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 -#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 -#define CPU_BASED_MSR_BITMAPS 0x10000000 -#define CPU_BASED_MONITOR_EXITING 0x20000000 -#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 -#define PIN_BASED_EXT_INTR_MASK 0x1 -#define PIN_BASED_NMI_EXITING 0x8 +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 -#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 -#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 + +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 /* VMCS Encodings */ enum vmcs_field { @@ -206,6 +213,7 @@ enum vmcs_field { #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 /* * Interruption-information format @@ -261,9 +269,6 @@ enum vmcs_field { /* segment AR */ #define SEGMENT_AR_L_MASK (1 << 13) -/* entry controls */ -#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9) - #define AR_TYPE_ACCESSES_MASK 1 #define AR_TYPE_READABLE_MASK (1 << 1) #define AR_TYPE_WRITEABLE_MASK (1 << 2) @@ -285,13 +290,21 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define CR4_VMXE 0x2000 +#define MSR_IA32_VMX_BASIC 0x480 +#define MSR_IA32_VMX_PINBASED_CTLS 0x481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 +#define MSR_IA32_VMX_EXIT_CTLS 0x483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_VMX_MISC 0x485 +#define MSR_IA32_VMX_CR0_FIXED0 0x486 +#define MSR_IA32_VMX_CR0_FIXED1 0x487 +#define MSR_IA32_VMX_CR4_FIXED0 0x488 +#define MSR_IA32_VMX_CR4_FIXED1 0x489 +#define MSR_IA32_VMX_VMCS_ENUM 0x48a +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b -#define MSR_IA32_VMX_BASIC 0x480 -#define MSR_IA32_FEATURE_CONTROL 0x03a -#define MSR_IA32_VMX_PINBASED_CTLS 0x481 -#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 -#define MSR_IA32_VMX_EXIT_CTLS 0x483 -#define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_FEATURE_CONTROL 0x3a +#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 +#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 #endif diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c index 4b8a0cc9665e..9737c3b2f48c 100644 --- a/drivers/kvm/x86_emulate.c +++ b/drivers/kvm/x86_emulate.c @@ -6,7 +6,7 @@ * Copyright (c) 2005 Keir Fraser * * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privieged instructions: + * privileged instructions: * * Copyright (C) 2006 Qumranet * @@ -83,7 +83,7 @@ static u8 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + SrcImmByte, SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -99,15 +99,24 @@ static u8 opcode_table[256] = { /* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x57 */ - 0, 0, 0, 0, 0, 0, 0, 0, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x58 - 0x5F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x60 - 0x6F */ + /* 0x60 - 0x67 */ 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + 0, 0, ImplicitOps|Mov, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x80 - 0x87 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, @@ -116,9 +125,9 @@ static u8 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, 0, 0, DstMem | SrcNone | ModRM | Mov, + 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, @@ -142,8 +151,10 @@ static u8 opcode_table[256] = { 0, 0, 0, 0, /* 0xD8 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE8 - 0xEF */ + ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps, 0, @@ -181,7 +192,10 @@ static u16 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ @@ -207,19 +221,6 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -/* - * Tell the emulator that of the Group 7 instructions (sgdt, lidt, etc.) we - * are interested only in invlpg and not in any of the rest. - * - * invlpg is a special instruction in that the data it references may not - * be mapped. - */ -void kvm_emulator_want_group7_invlpg(void) -{ - twobyte_table[1] &= ~SrcMem; -} -EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg); - /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_IMM } type; @@ -420,7 +421,7 @@ struct operand { #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ - (_size), ctxt); \ + (_size), ctxt->vcpu); \ if ( rc != 0 ) \ goto done; \ (_eip) += (_size); \ @@ -428,10 +429,11 @@ struct operand { }) /* Access/update address held in a register, based on addressing mode. */ +#define address_mask(reg) \ + ((ad_bytes == sizeof(unsigned long)) ? \ + (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) #define register_address(base, reg) \ - ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ - ((reg) & ((1UL << (ad_bytes << 3)) - 1)))) - + ((base) + address_mask(reg)) #define register_address_increment(reg, inc) \ do { \ /* signed type ensures sign extension to long */ \ @@ -443,8 +445,19 @@ struct operand { (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ } while (0) -void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) +#define JMP_REL(rel) \ + do { \ + _eip += (int)(rel); \ + _eip = ((op_bytes == 2) ? (uint16_t)_eip : (uint32_t)_eip); \ + } while (0) + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) { void *p; @@ -464,13 +477,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); if (rc) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); return rc; } +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -771,11 +821,15 @@ done_prefixes: goto srcmem_common; case SrcMem: src.bytes = (d & ByteOp) ? 1 : op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (twobyte && b == 0x01 && modrm_reg == 7) + break; srcmem_common: src.type = OP_MEM; src.ptr = (unsigned long *)cr2; + src.val = 0; if ((rc = ops->read_emulated((unsigned long)src.ptr, - &src.val, src.bytes, ctxt)) != 0) + &src.val, src.bytes, ctxt->vcpu)) != 0) goto done; src.orig_val = src.val; break; @@ -814,7 +868,7 @@ done_prefixes: case DstReg: dst.type = OP_REG; if ((d & ByteOp) - && !(twobyte_table && (b == 0xb6 || b == 0xb7))) { + && !(twobyte && (b == 0xb6 || b == 0xb7))) { dst.ptr = decode_register(modrm_reg, _regs, (rex_prefix == 0)); dst.val = *(u8 *) dst.ptr; @@ -838,6 +892,7 @@ done_prefixes: dst.type = OP_MEM; dst.ptr = (unsigned long *)cr2; dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.val = 0; if (d & BitOp) { unsigned long mask = ~(dst.bytes * 8 - 1); @@ -845,7 +900,7 @@ done_prefixes: } if (!(d & Mov) && /* optimisation - avoid slow emulated read */ ((rc = ops->read_emulated((unsigned long)dst.ptr, - &dst.val, dst.bytes, ctxt)) != 0)) + &dst.val, dst.bytes, ctxt->vcpu)) != 0)) goto done; break; } @@ -871,10 +926,27 @@ done_prefixes: sbb: /* sbb */ emulate_2op_SrcV("sbb", src, dst, _eflags); break; - case 0x20 ... 0x25: + case 0x20 ... 0x23: and: /* and */ emulate_2op_SrcV("and", src, dst, _eflags); break; + case 0x24: /* and al imm8 */ + dst.type = OP_REG; + dst.ptr = &_regs[VCPU_REGS_RAX]; + dst.val = *(u8 *)dst.ptr; + dst.bytes = 1; + dst.orig_val = dst.val; + goto and; + case 0x25: /* and ax imm16, or eax imm32 */ + dst.type = OP_REG; + dst.bytes = op_bytes; + dst.ptr = &_regs[VCPU_REGS_RAX]; + if (op_bytes == 2) + dst.val = *(u16 *)dst.ptr; + else + dst.val = *(u32 *)dst.ptr; + dst.orig_val = dst.val; + goto and; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", src, dst, _eflags); @@ -892,6 +964,17 @@ done_prefixes: goto cannot_emulate; dst.val = (s32) src.val; break; + case 0x6a: /* push imm8 */ + src.val = 0L; + src.val = insn_fetch(s8, 1, _eip); +push: + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]); + break; case 0x80 ... 0x83: /* Grp1 */ switch (modrm_reg) { case 0: @@ -939,18 +1022,10 @@ done_prefixes: dst.val = src.val; lock_prefix = 1; break; - case 0xa0 ... 0xa1: /* mov */ - dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - dst.val = src.val; - _eip += ad_bytes; /* skip src displacement */ - break; - case 0xa2 ... 0xa3: /* mov */ - dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; - _eip += ad_bytes; /* skip dst displacement */ - break; case 0x88 ... 0x8b: /* mov */ - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - dst.val = src.val; + goto mov; + case 0x8d: /* lea r16/r32, m */ + dst.val = modrm_val; break; case 0x8f: /* pop (sole member of Grp1a) */ /* 64-bit mode: POP always pops a 64-bit operand. */ @@ -958,10 +1033,19 @@ done_prefixes: dst.bytes = 8; if ((rc = ops->read_std(register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt)) != 0) + &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); break; + case 0xa0 ... 0xa1: /* mov */ + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + dst.val = src.val; + _eip += ad_bytes; /* skip src displacement */ + break; + case 0xa2 ... 0xa3: /* mov */ + dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; + _eip += ad_bytes; /* skip dst displacement */ + break; case 0xc0 ... 0xc1: grp2: /* Grp2 */ switch (modrm_reg) { @@ -989,12 +1073,41 @@ done_prefixes: break; } break; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + dst.val = src.val; + break; case 0xd0 ... 0xd1: /* Grp2 */ src.val = 1; goto grp2; case 0xd2 ... 0xd3: /* Grp2 */ src.val = _regs[VCPU_REGS_RCX]; goto grp2; + case 0xe8: /* call (near) */ { + long int rel; + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + src.val = (unsigned long) _eip; + JMP_REL(rel); + goto push; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(src.val); + no_wb = 1; /* Disable writeback. */ + break; case 0xf6 ... 0xf7: /* Grp3 */ switch (modrm_reg) { case 0 ... 1: /* test */ @@ -1037,13 +1150,19 @@ done_prefixes: case 1: /* dec */ emulate_1op("dec", dst, _eflags); break; + case 4: /* jmp abs */ + if (b == 0xff) + _eip = dst.val; + else + goto cannot_emulate; + break; case 6: /* push */ /* 64-bit mode: PUSH always pushes a 64-bit operand. */ if (mode == X86EMUL_MODE_PROT64) { dst.bytes = 8; if ((rc = ops->read_std((unsigned long)dst.ptr, &dst.val, 8, - ctxt)) != 0) + ctxt->vcpu)) != 0) goto done; } register_address_increment(_regs[VCPU_REGS_RSP], @@ -1051,7 +1170,7 @@ done_prefixes: if ((rc = ops->write_std( register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP]), - &dst.val, dst.bytes, ctxt)) != 0) + &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; no_wb = 1; break; @@ -1086,11 +1205,11 @@ writeback: rc = ops->cmpxchg_emulated((unsigned long)dst. ptr, &dst.orig_val, &dst.val, dst.bytes, - ctxt); + ctxt->vcpu); else rc = ops->write_emulated((unsigned long)dst.ptr, &dst.val, dst.bytes, - ctxt); + ctxt->vcpu); if (rc != 0) goto done; default: @@ -1109,6 +1228,81 @@ done: special_insn: if (twobyte) goto twobyte_special_insn; + switch(b) { + case 0x50 ... 0x57: /* push reg */ + if (op_bytes == 2) + src.val = (u16) _regs[b & 0x7]; + else + src.val = (u32) _regs[b & 0x7]; + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address( + ctxt->ss_base, _regs[VCPU_REGS_RSP]); + break; + case 0x58 ... 0x5f: /* pop reg */ + dst.ptr = (unsigned long *)&_regs[b & 0x7]; + pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) + != 0) + goto done; + + register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); + no_wb = 1; /* Disable writeback. */ + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? + address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? + address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(override_base ? + *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch(s8, 1, _eip); + + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } + case 0x9c: /* pushf */ + src.val = (unsigned long) _eflags; + goto push; + case 0x9d: /* popf */ + dst.ptr = (unsigned long *) &_eflags; + goto pop_instruction; + case 0xc3: /* ret */ + dst.ptr = &_eip; + goto pop_instruction; + case 0xf4: /* hlt */ + ctxt->vcpu->halt_request = 1; + goto done; + } if (rep_prefix) { if (_regs[VCPU_REGS_RCX] == 0) { ctxt->vcpu->rip = _eip; @@ -1125,7 +1319,7 @@ special_insn: _regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated(register_address( override_base ? *override_base : ctxt->ds_base, - _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSI], (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); @@ -1147,7 +1341,8 @@ special_insn: dst.type = OP_REG; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, + ctxt->vcpu)) != 0) goto done; register_address_increment(_regs[VCPU_REGS_RSI], (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); @@ -1155,23 +1350,7 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; - case 0xf4: /* hlt */ - ctxt->vcpu->halt_request = 1; - goto done; - case 0xc3: /* ret */ - dst.ptr = &_eip; - goto pop_instruction; - case 0x58 ... 0x5f: /* pop reg */ - dst.ptr = (unsigned long *)&_regs[b & 0x7]; -pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, - _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0) - goto done; - - register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); - no_wb = 1; /* Disable writeback. */ - break; } goto writeback; @@ -1230,40 +1409,50 @@ twobyte_insn: break; case 0x40 ... 0x4f: /* cmov */ dst.val = dst.orig_val = src.val; - d &= ~Mov; /* default to no move */ + no_wb = 1; /* * First, assume we're decoding an even cmov opcode * (lsb == 0). */ switch ((b & 15) >> 1) { case 0: /* cmovo */ - d |= (_eflags & EFLG_OF) ? Mov : 0; + no_wb = (_eflags & EFLG_OF) ? 0 : 1; break; case 1: /* cmovb/cmovc/cmovnae */ - d |= (_eflags & EFLG_CF) ? Mov : 0; + no_wb = (_eflags & EFLG_CF) ? 0 : 1; break; case 2: /* cmovz/cmove */ - d |= (_eflags & EFLG_ZF) ? Mov : 0; + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; break; case 3: /* cmovbe/cmovna */ - d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; + no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; break; case 4: /* cmovs */ - d |= (_eflags & EFLG_SF) ? Mov : 0; + no_wb = (_eflags & EFLG_SF) ? 0 : 1; break; case 5: /* cmovp/cmovpe */ - d |= (_eflags & EFLG_PF) ? Mov : 0; + no_wb = (_eflags & EFLG_PF) ? 0 : 1; break; case 7: /* cmovle/cmovng */ - d |= (_eflags & EFLG_ZF) ? Mov : 0; + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; /* fall through */ case 6: /* cmovl/cmovnge */ - d |= (!(_eflags & EFLG_SF) != - !(_eflags & EFLG_OF)) ? Mov : 0; + no_wb &= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? 0 : 1; break; } /* Odd cmov opcodes (lsb == 1) have inverted sense. */ - d ^= (b & 1) ? Mov : 0; + no_wb ^= b & 1; + break; + case 0xa3: + bt: /* bt */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); + break; + case 0xab: + bts: /* bts */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); break; case 0xb0 ... 0xb1: /* cmpxchg */ /* @@ -1273,8 +1462,6 @@ twobyte_insn: src.orig_val = src.val; src.val = _regs[VCPU_REGS_RAX]; emulate_2op_SrcV("cmp", src, dst, _eflags); - /* Always write back. The question is: where to? */ - d |= Mov; if (_eflags & EFLG_ZF) { /* Success: write back to memory. */ dst.val = src.orig_val; @@ -1284,30 +1471,15 @@ twobyte_insn: dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; } break; - case 0xa3: - bt: /* bt */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); - break; case 0xb3: btr: /* btr */ src.val &= (dst.bytes << 3) - 1; /* only subword offset */ emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); break; - case 0xab: - bts: /* bts */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); - break; case 0xb6 ... 0xb7: /* movzx */ dst.bytes = op_bytes; dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; break; - case 0xbb: - btc: /* btc */ - src.val &= (dst.bytes << 3) - 1; /* only subword offset */ - emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); - break; case 0xba: /* Grp8 */ switch (modrm_reg & 3) { case 0: @@ -1320,6 +1492,11 @@ twobyte_insn: goto btc; } break; + case 0xbb: + btc: /* btc */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); + break; case 0xbe ... 0xbf: /* movsx */ dst.bytes = op_bytes; dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; @@ -1331,14 +1508,14 @@ twobyte_special_insn: /* Disable writeback. */ no_wb = 1; switch (b) { + case 0x06: + emulate_clts(ctxt->vcpu); + break; case 0x09: /* wbinvd */ break; case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; - case 0x06: - emulate_clts(ctxt->vcpu); - break; case 0x20: /* mov cr, reg */ if (modrm_mod != 3) goto cannot_emulate; @@ -1355,7 +1532,7 @@ twobyte_special_insn: | ((u64)_regs[VCPU_REGS_RDX] << 32); rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); if (rc) { - kvm_arch_ops->inject_gp(ctxt->vcpu, 0); + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); _eip = ctxt->vcpu->rip; } rc = X86EMUL_CONTINUE; @@ -1364,7 +1541,7 @@ twobyte_special_insn: /* rdmsr */ rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); if (rc) { - kvm_arch_ops->inject_gp(ctxt->vcpu, 0); + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); _eip = ctxt->vcpu->rip; } else { _regs[VCPU_REGS_RAX] = (u32)msr_data; @@ -1372,10 +1549,32 @@ twobyte_special_insn: } rc = X86EMUL_CONTINUE; break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ { + long int rel; + + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("jnz: Invalid op_bytes\n"); + goto cannot_emulate; + } + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } case 0xc7: /* Grp9 (cmpxchg8b) */ { u64 old, new; - if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) + != 0) goto done; if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { @@ -1386,7 +1585,7 @@ twobyte_special_insn: new = ((u64)_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; if ((rc = ops->cmpxchg_emulated(cr2, &old, - &new, 8, ctxt)) != 0) + &new, 8, ctxt->vcpu)) != 0) goto done; _eflags |= EFLG_ZF; } diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h index ea3407d7feee..92c73aa7f9ac 100644 --- a/drivers/kvm/x86_emulate.h +++ b/drivers/kvm/x86_emulate.h @@ -60,7 +60,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct x86_emulate_ctxt * ctxt); + unsigned int bytes, struct kvm_vcpu *vcpu); /* * write_std: Write bytes of standard (non-emulated/special) memory. @@ -71,7 +71,7 @@ struct x86_emulate_ops { * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_std)(unsigned long addr, const void *val, - unsigned int bytes, struct x86_emulate_ctxt * ctxt); + unsigned int bytes, struct kvm_vcpu *vcpu); /* * read_emulated: Read bytes from emulated/special memory area. @@ -82,7 +82,7 @@ struct x86_emulate_ops { int (*read_emulated) (unsigned long addr, void *val, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); /* * write_emulated: Read bytes from emulated/special memory area. @@ -94,7 +94,7 @@ struct x86_emulate_ops { int (*write_emulated) (unsigned long addr, const void *val, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -108,12 +108,10 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, - struct x86_emulate_ctxt * ctxt); + struct kvm_vcpu *vcpu); }; -struct cpu_user_regs; - struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; @@ -154,12 +152,4 @@ struct x86_emulate_ctxt { int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops); -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. - */ -void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs); - #endif /* __X86_EMULATE_H__ */ diff --git a/include/asm-x86/io_apic_32.h b/include/asm-x86/io_apic_32.h index dbe734ddf2af..3f087883ea48 100644 --- a/include/asm-x86/io_apic_32.h +++ b/include/asm-x86/io_apic_32.h @@ -11,8 +11,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -#ifdef CONFIG_X86_IO_APIC - /* * The structure of the IO-APIC: */ @@ -55,12 +53,6 @@ union IO_APIC_reg_03 { } __attribute__ ((packed)) bits; }; -/* - * # of IO-APICs and # of IRQ routing registers - */ -extern int nr_ioapics; -extern int nr_ioapic_registers[MAX_IO_APICS]; - enum ioapic_irq_destination_types { dest_Fixed = 0, dest_LowestPrio = 1, @@ -100,6 +92,14 @@ struct IO_APIC_route_entry { } __attribute__ ((packed)); +#ifdef CONFIG_X86_IO_APIC + +/* + * # of IO-APICs and # of IRQ routing registers + */ +extern int nr_ioapics; +extern int nr_ioapic_registers[MAX_IO_APICS]; + /* * MP-BIOS irq configuration table structures: */ diff --git a/include/asm-x86/processor-flags.h b/include/asm-x86/processor-flags.h index 5404e90edd57..199cab107d85 100644 --- a/include/asm-x86/processor-flags.h +++ b/include/asm-x86/processor-flags.h @@ -63,7 +63,7 @@ /* * x86-64 Task Priority Register, CR8 */ -#define X86_CR8_TPR 0x00000007 /* task priority register */ +#define X86_CR8_TPR 0x0000000F /* task priority register */ /* * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> diff --git a/include/linux/kvm.h b/include/linux/kvm.h index e6edca81ab84..057a7f34ee36 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -4,8 +4,7 @@ /* * Userspace interface for /dev/kvm - kernel based virtual machine * - * Note: this interface is considered experimental and may change without - * notice. + * Note: you must update KVM_API_VERSION if you change this interface. */ #include <asm/types.h> @@ -13,14 +12,8 @@ #define KVM_API_VERSION 12 -/* - * Architectural interrupt line count, and the size of the bitmap needed - * to hold them. - */ +/* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8) -#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type)) - /* for KVM_CREATE_MEMORY_REGION */ struct kvm_memory_region { @@ -41,20 +34,89 @@ struct kvm_memory_alias { __u64 target_phys_addr; }; -enum kvm_exit_reason { - KVM_EXIT_UNKNOWN = 0, - KVM_EXIT_EXCEPTION = 1, - KVM_EXIT_IO = 2, - KVM_EXIT_HYPERCALL = 3, - KVM_EXIT_DEBUG = 4, - KVM_EXIT_HLT = 5, - KVM_EXIT_MMIO = 6, - KVM_EXIT_IRQ_WINDOW_OPEN = 7, - KVM_EXIT_SHUTDOWN = 8, - KVM_EXIT_FAIL_ENTRY = 9, - KVM_EXIT_INTR = 10, +/* for KVM_IRQ_LINE */ +struct kvm_irq_level { + /* + * ACPI gsi notion of irq. + * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. + * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. + */ + __u32 irq; + __u32 level; +}; + +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; }; +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 + +struct kvm_irqchip { + __u32 chip_id; + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +#define KVM_EXIT_UNKNOWN 0 +#define KVM_EXIT_EXCEPTION 1 +#define KVM_EXIT_IO 2 +#define KVM_EXIT_HYPERCALL 3 +#define KVM_EXIT_DEBUG 4 +#define KVM_EXIT_HLT 5 +#define KVM_EXIT_MMIO 6 +#define KVM_EXIT_IRQ_WINDOW_OPEN 7 +#define KVM_EXIT_SHUTDOWN 8 +#define KVM_EXIT_FAIL_ENTRY 9 +#define KVM_EXIT_INTR 10 +#define KVM_EXIT_SET_TPR 11 + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ @@ -106,11 +168,14 @@ struct kvm_run { } mmio; /* KVM_EXIT_HYPERCALL */ struct { + __u64 nr; __u64 args[6]; __u64 ret; __u32 longmode; __u32 pad; } hypercall; + /* Fix the size of the union. */ + char padding[256]; }; }; @@ -139,6 +204,12 @@ struct kvm_fpu { __u32 pad2; }; +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + struct kvm_segment { __u64 base; __u32 limit; @@ -164,7 +235,7 @@ struct kvm_sregs { __u64 cr0, cr2, cr3, cr4, cr8; __u64 efer; __u64 apic_base; - __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)]; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; struct kvm_msr_entry { @@ -272,6 +343,12 @@ struct kvm_signal_mask { #define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ /* + * Extension capability list. + */ +#define KVM_CAP_IRQCHIP 0 +#define KVM_CAP_HLT 1 + +/* * ioctls for VM fds */ #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) @@ -282,6 +359,11 @@ struct kvm_signal_mask { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) +/* Device model IOC */ +#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) +#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) +#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip) +#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip) /* * ioctls for vcpu fds @@ -300,5 +382,7 @@ struct kvm_signal_mask { #define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) #define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) +#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) +#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) #endif |