diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 1551 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 31 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_timer.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 32 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 360 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 35 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 84 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 59 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 1147 | ||||
-rw-r--r-- | arch/x86/kvm/timer.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 224 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 739 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 2353 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 40 |
20 files changed, 4363 insertions, 2414 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4cd498332466..970bbd479516 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -29,6 +29,7 @@ config KVM select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE select USER_RETURN_NOTIFIER + select KVM_MMIO ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -65,6 +66,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e8faea4651e..5ac0bb465ed6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -32,7 +32,8 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> -#include "mmu.h" /* for is_long_mode() */ +#include "x86.h" +#include "tss.h" /* * Opcode effective-address decode tables. @@ -50,6 +51,8 @@ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ #define DstAcc (4<<1) /* Destination Accumulator */ +#define DstDI (5<<1) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<1) /* 64bit memory operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ @@ -63,6 +66,7 @@ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ +#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -76,6 +80,8 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) @@ -83,44 +89,48 @@ #define Src2ImmByte (2<<29) #define Src2One (3<<29) #define Src2Imm16 (4<<29) +#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be + in memory and second argument is located + immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -144,8 +154,8 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, /* 0x68 - 0x6F */ SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ + SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, @@ -156,7 +166,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -170,12 +180,12 @@ static u32 opcode_table[256] = { /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, + ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, + ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, + 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, + ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, @@ -201,16 +211,16 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - ByteOp | SrcImmUByte, SrcImmUByte, - ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, + ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, + SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -218,16 +228,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -257,21 +271,23 @@ static u32 twobyte_table[256] = { DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -283,25 +299,41 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = @@ -318,26 +350,42 @@ static u32 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, 0, + SrcMem16 | ModRM | Mov | Priv, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -522,7 +570,7 @@ static u32 group2_table[] = { #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ + if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ (_type)_x; \ @@ -598,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) + unsigned long eip, u8 *dest) { struct fetch_cache *fc = &ctxt->decode.fetch; int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) + int size, cur_size; + + if (eip == fc->end) { + cur_size = fc->end - fc->start; + size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, + size, ctxt->vcpu, NULL); + if (rc != X86EMUL_CONTINUE) return rc; - fc->start = linear; - fc->end = linear + size; + fc->end += size; } - *dest = fc->data[linear - fc->start]; - return 0; + *dest = fc->data[eip - fc->start]; + return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { - int rc = 0; + int rc; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->decode.eip_orig > 15) + if (eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; - eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; } - return 0; + return X86EMUL_CONTINUE; } /* @@ -661,11 +709,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) + ctxt->vcpu, NULL); + if (rc != X86EMUL_CONTINUE) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -742,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; - int rc = 0; + int rc = X86EMUL_CONTINUE; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -855,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; switch (c->ad_bytes) { case 2: @@ -876,19 +924,24 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, group; - /* Shadow copy of register state. Committed on successful emulation. */ + /* we cannot decode insn before we complete previous rep insn */ + WARN_ON(ctxt->restart); + + /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -974,11 +1027,6 @@ done_prefixes: } } - if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");; - return -1; - } - if (c->d & Group) { group = c->d & GroupMask; c->modrm = insn_fetch(u8, 1, c->eip); @@ -1005,7 +1053,7 @@ done_prefixes: rc = decode_modrm(ctxt, ops); else if (c->d & MemAbs) rc = decode_abs(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; if (!c->has_seg_override) @@ -1016,6 +1064,10 @@ done_prefixes: if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; + + if (c->rip_relative) + c->modrm_ea += c->eip; + /* * Decode and fetch the source operand: register, memory * or immediate. @@ -1050,6 +1102,8 @@ done_prefixes: break; } c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.val = 0; break; case SrcImm: case SrcImmU: @@ -1098,6 +1152,14 @@ done_prefixes: c->src.bytes = 1; c->src.val = 1; break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *) + register_address(c, seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; } /* @@ -1127,6 +1189,12 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = 1; break; + case Src2Mem16: + c->src2.type = OP_MEM; + c->src2.bytes = 2; + c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); + c->src2.val = 0; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1139,6 +1207,7 @@ done_prefixes: c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstMem: + case DstMem64: if ((c->d & ModRM) && c->modrm_mod == 3) { c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.type = OP_REG; @@ -1147,12 +1216,24 @@ done_prefixes: break; } c->dst.type = OP_MEM; + c->dst.ptr = (unsigned long *)c->modrm_ea; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } break; case DstAcc: c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->op_bytes) { + switch (c->dst.bytes) { case 1: c->dst.val = *(u8 *)c->dst.ptr; break; @@ -1162,18 +1243,248 @@ done_prefixes: case 4: c->dst.val = *(u32 *)c->dst.ptr; break; + case 8: + c->dst.val = *(u64 *)c->dst.ptr; + break; } c->dst.orig_val = c->dst.val; break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *) + register_address(c, es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; } - if (c->rip_relative) - c->modrm_ea += c->eip; - done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } +static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned int size, unsigned short port, + void *dest) +{ + struct read_cache *rc = &ctxt->decode.io_read; + + if (rc->pos == rc->end) { /* refill pio read ahead */ + struct decode_cache *c = &ctxt->decode; + unsigned int in_page, n; + unsigned int count = c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + in_page = (ctxt->eflags & EFLG_DF) ? + offset_in_page(c->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, + count); + if (n == 0) + n = 1; + rc->pos = rc->end = 0; + if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + return 0; + rc->end = n * size; + } + + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + return 1; +} + +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + +static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_ptr *dt) +{ + if (selector & 1 << 2) { + struct desc_struct desc; + memset (dt, 0, sizeof *dt); + if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + return; + + dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ + dt->address = get_desc_base(&desc); + } else + ops->get_gdt(dt, ctxt->vcpu); +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + int ret; + u32 err; + ulong addr; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) { + kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + addr = dt.address + index * 8; + ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); + + return ret; +} + +/* allowed just for 8 bytes segments */ +static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + u32 err; + ulong addr; + int ret; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) { + kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + + addr = dt.address + index * 8; + ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); + + return ret; +} + +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, int seg) +{ + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; + + memset(&seg_desc, 0, sizeof seg_desc); + + if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) + || ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + goto load; + } + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + if (null_selector) /* for NULL selector skip all following checks */ + goto load; + + ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !seg_desc.s) + goto exception; + + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = seg_desc.dpl; + cpl = ops->cpl(ctxt->vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(seg_desc.type & 8)) + goto exception; + + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (seg_desc.s || seg_desc.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((seg_desc.type & 0xa) == 0x8 || + (((seg_desc.type & 0xc) != 0xc) && + (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (seg_desc.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: + ops->set_segment_selector(selector, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; +} + static inline void emulate_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -1196,13 +1507,56 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), dest, len, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); return rc; } +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = ops->cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) { struct decode_cache *c = &ctxt->decode; @@ -1222,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int rc; rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, 1, seg); + rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); return rc; } @@ -1248,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; while (reg >= VCPU_REGS_RAX) { @@ -1259,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, } rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) break; --reg; } @@ -1270,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc; - rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); - if (rc != 0) - return rc; - return 0; + return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); } static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) @@ -1311,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1324,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op("neg", c->dst, ctxt->eflags); break; default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; + return 0; } - return rc; + return 1; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -1358,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, emulate_push(ctxt); break; } - return 0; + return X86EMUL_CONTINUE; } static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; + u64 old = c->dst.orig_val; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { @@ -1379,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) | (u32) c->regs[VCPU_REGS_RBX]; - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; ctxt->eflags |= EFLG_ZF; } - return 0; + return X86EMUL_CONTINUE; } static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, @@ -1400,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, unsigned long cs; rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1451,7 +1788,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; break; case OP_NONE: @@ -1460,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, default: break; } - return 0; + return X86EMUL_CONTINUE; } static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) @@ -1514,9 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1553,7 +1892,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1563,22 +1902,19 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + if (ctxt->mode == X86EMUL_MODE_PROT64) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1587,13 +1923,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1618,7 +1954,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1629,21 +1965,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1661,7 +1987,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1669,7 +1995,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1685,19 +2011,468 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return ops->cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt, ops)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; +} + +static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + int seg) +{ + struct desc_struct desc; + if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) + return get_desc_base(&desc); + else + return ~0; +} + +static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->ip = c->eip; + tss->flag = ctxt->eflags; + tss->ax = c->regs[VCPU_REGS_RAX]; + tss->cx = c->regs[VCPU_REGS_RCX]; + tss->dx = c->regs[VCPU_REGS_RDX]; + tss->bx = c->regs[VCPU_REGS_RBX]; + tss->sp = c->regs[VCPU_REGS_RSP]; + tss->bp = c->regs[VCPU_REGS_RBP]; + tss->si = c->regs[VCPU_REGS_RSI]; + tss->di = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + c->eip = tss->ip; + ctxt->eflags = tss->flag | 2; + c->regs[VCPU_REGS_RAX] = tss->ax; + c->regs[VCPU_REGS_RCX] = tss->cx; + c->regs[VCPU_REGS_RDX] = tss->dx; + c->regs[VCPU_REGS_RBX] = tss->bx; + c->regs[VCPU_REGS_RSP] = tss->sp; + c->regs[VCPU_REGS_RBP] = tss->bp; + c->regs[VCPU_REGS_RSI] = tss->si; + c->regs[VCPU_REGS_RDI] = tss->di; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct desc_struct *new_desc) +{ + struct tss_segment_16 tss_seg; + int ret; + u32 err, new_tss_base = get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + save_state_to_tss16(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + } + + return load_state_from_tss16(ctxt, ops, &tss_seg); +} + +static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->eip = c->eip; + tss->eflags = ctxt->eflags; + tss->eax = c->regs[VCPU_REGS_RAX]; + tss->ecx = c->regs[VCPU_REGS_RCX]; + tss->edx = c->regs[VCPU_REGS_RDX]; + tss->ebx = c->regs[VCPU_REGS_RBX]; + tss->esp = c->regs[VCPU_REGS_RSP]; + tss->ebp = c->regs[VCPU_REGS_RBP]; + tss->esi = c->regs[VCPU_REGS_RSI]; + tss->edi = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); + tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); + tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + ops->set_cr(3, tss->cr3, ctxt->vcpu); + c->eip = tss->eip; + ctxt->eflags = tss->eflags | 2; + c->regs[VCPU_REGS_RAX] = tss->eax; + c->regs[VCPU_REGS_RCX] = tss->ecx; + c->regs[VCPU_REGS_RDX] = tss->edx; + c->regs[VCPU_REGS_RBX] = tss->ebx; + c->regs[VCPU_REGS_RSP] = tss->esp; + c->regs[VCPU_REGS_RBP] = tss->ebp; + c->regs[VCPU_REGS_RSI] = tss->esi; + c->regs[VCPU_REGS_RDI] = tss->edi; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); + ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct desc_struct *new_desc) +{ + struct tss_segment_32 tss_seg; + int ret; + u32 err, new_tss_base = get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + save_state_to_tss32(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + } + + return load_state_from_tss32(ctxt, ops, &tss_seg); +} + +static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct desc_struct curr_tss_desc, next_tss_desc; + int ret; + u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + ulong old_tss_base = + get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + u32 desc_limit; + + /* FIXME: old_tss_base == ~0 ? */ + + ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + /* FIXME: check that next_tss_desc is tss */ + + if (reason != TASK_SWITCH_IRET) { + if ((tss_selector & 3) > next_tss_desc.dpl || + ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + } + + desc_limit = desc_limit_scaled(&next_tss_desc); + if (!next_tss_desc.p || + ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || + desc_limit < 0x2b)) { + kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, + tss_selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ + write_segment_descriptor(ctxt, ops, old_tss_sel, + &curr_tss_desc); + } + + if (reason == TASK_SWITCH_IRET) + ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (next_tss_desc.type & 8) + ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + else + ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) + ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; + + if (reason != TASK_SWITCH_IRET) { + next_tss_desc.type |= (1 << 1); /* set busy flag */ + write_segment_descriptor(ctxt, ops, tss_selector, + &next_tss_desc); + } + + ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + + if (has_error_code) { + struct decode_cache *c = &ctxt->decode; + + c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; + c->lock_prefix = 0; + c->src.val = (unsigned long) error_code; + emulate_push(ctxt); + } + + return ret; +} + +int emulator_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = ctxt->eip; + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + c->dst.type = OP_NONE; + + rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, + has_error_code, error_code); + + if (rc == X86EMUL_CONTINUE) { + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(ctxt->vcpu, c->eip); + rc = writeback(ctxt, ops); + } + + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, + int reg, struct operand *op) +{ + struct decode_cache *c = &ctxt->decode; + int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + + register_address_increment(c, &c->regs[reg], df * op->bytes); + op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { - unsigned long memop = 0; u64 msr_data; - unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; - unsigned int port; - int io_dir_in; - int rc = 0; + int rc = X86EMUL_CONTINUE; + int saved_dst_type = c->dst.type; ctxt->interruptibility = 0; @@ -1707,14 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) */ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; + if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } if (c->rep_prefix && (c->d & String)) { + ctxt->restart = true; /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { + string_done: + ctxt->restart = false; kvm_rip_write(ctxt->vcpu, c->eip); goto done; } @@ -1726,53 +2517,45 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) * - if REPNE/REPNZ and ZF = 1 then done */ if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { + (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } + ((ctxt->eflags & EFLG_ZF) == 0)) + goto string_done; if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) + goto string_done; } - c->regs[VCPU_REGS_RCX]--; - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; } if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; rc = ops->read_emulated((unsigned long)c->src.ptr, &c->src.val, c->src.bytes, ctxt->vcpu); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } + if (c->src2.type == OP_MEM) { + rc = ops->read_emulated((unsigned long)c->src2.ptr, + &c->src2.val, + c->src2.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if ((c->d & DstMask) == ImplicitOps) goto special_insn; - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) + if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { + /* optimisation - avoid slow emulated read if Mov */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) goto done; } c->dst.orig_val = c->dst.val; @@ -1792,7 +2575,7 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x08 ... 0x0d: @@ -1811,7 +2594,7 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x18 ... 0x1d: @@ -1823,7 +2606,7 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x20 ... 0x25: @@ -1854,7 +2637,7 @@ special_insn: case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x60: /* pusha */ @@ -1862,7 +2645,7 @@ special_insn: break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x63: /* movsxd */ @@ -1876,37 +2659,29 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, es_base(ctxt), - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + c->dst.bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; } - return 0; + if (!pio_in_emulated(ctxt, ops, c->dst.bytes, + c->regs[VCPU_REGS_RDX], &c->dst.val)) + goto done; /* IO is needed, skip writeback */ + break; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + c->src.bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; } - return 0; + ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], + &c->src.val, 1, ctxt->vcpu); + + c->dst.type = OP_NONE; /* nothing to writeback */ + break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); @@ -1963,12 +2738,11 @@ special_insn: case 0x8c: { /* mov r/m, sreg */ struct kvm_segment segreg; - if (c->modrm_reg <= 5) + if (c->modrm_reg <= VCPU_SREG_GS) kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); else { - printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } c->dst.val = segreg.selector; break; @@ -1978,32 +2752,26 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); + + rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x90: /* nop / xchg r8,rax */ @@ -2025,7 +2793,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2034,85 +2805,16 @@ special_insn: c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; break; case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; + goto mov; case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; + goto cmp; case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); break; case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; + goto mov; case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; @@ -2132,7 +2834,7 @@ special_insn: break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ @@ -2145,14 +2847,10 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = c->src.val; - io_dir_in = 1; - goto do_io; + goto do_io_in; case 0xe6: /* outb */ case 0xe7: /* out */ - port = c->src.val; - io_dir_in = 0; - goto do_io; + goto do_io_out; case 0xe8: /* call (near) */ { long int rel = c->src.val; c->src.val = (unsigned long) c->eip; @@ -2163,11 +2861,10 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + jump_far: + if (load_segment_descriptor(ctxt, ops, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2178,19 +2875,29 @@ special_insn: break; case 0xec: /* in al,dx */ case 0xed: /* in (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 1; - goto do_io; + c->src.val = c->regs[VCPU_REGS_RDX]; + do_io_in: + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, + &c->dst.val)) + goto done; /* IO is needed */ + break; case 0xee: /* out al,dx */ case 0xef: /* out (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, - (c->d & ByteOp) ? 1 : c->op_bytes, - port) != 0) { - c->eip = saved_eip; - goto cannot_emulate; + c->src.val = c->regs[VCPU_REGS_RDX]; + do_io_out: + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; } + ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, + ctxt->vcpu); + c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; @@ -2201,22 +2908,29 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; + if (!emulate_grp3(ctxt, ops)) + goto cannot_emulate; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt, ops)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt, ops)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2226,28 +2940,55 @@ special_insn: ctxt->eflags |= EFLG_DF; c->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ + case 0xfe: /* Grp4 */ + grp45: rc = emulate_grp45(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; + case 0xff: /* Grp5 */ + if (c->modrm_reg == 5) + goto jump_far; + goto grp45; } writeback: rc = writeback(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; + /* + * restore dst type in case the decoding will be reused + * (happens for string instruction ) + */ + c->dst.type = saved_dst_type; + + if ((c->d & SrcMask) == SrcSI) + string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, + &c->src); + + if ((c->d & DstMask) == DstDI) + string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + + if (c->rep_prefix && (c->d & String)) { + struct read_cache *rc = &ctxt->decode.io_read; + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + /* + * Re-enter guest when pio read ahead buffer is empty or, + * if it is not used, after each 1024 iteration. + */ + if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (rc->end != 0 && rc->end == rc->pos)) + ctxt->restart = false; + } + /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(ctxt->vcpu, c->eip); + ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; twobyte_insn: switch (c->b) { @@ -2261,18 +3002,18 @@ twobyte_insn: goto cannot_emulate; rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; /* Disable writeback. */ c->dst.type = OP_NONE; break; case 2: /* lgdt */ rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lgdt(ctxt->vcpu, size, address); /* Disable writeback. */ @@ -2283,7 +3024,7 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; default: @@ -2293,7 +3034,7 @@ twobyte_insn: rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lidt(ctxt->vcpu, size, address); } @@ -2302,15 +3043,18 @@ twobyte_insn: break; case 4: /* smsw */ c->dst.bytes = 2; - c->dst.val = realmode_get_cr(ctxt->vcpu, 0); + c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - realmode_lmsw(ctxt->vcpu, (u16)c->src.val, - &ctxt->eflags); + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; + case 5: /* not defined */ + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); + emulate_invlpg(ctxt->vcpu, c->modrm_ea); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -2319,8 +3063,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2335,54 +3080,54 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); + switch (c->modrm_reg) { + case 1: + case 5 ... 7: + case 9 ... 15: + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); + ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { + if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); + goto done; } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { + if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); + goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; @@ -2391,14 +3136,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2417,7 +3164,7 @@ twobyte_insn: break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xa3: @@ -2436,7 +3183,7 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xab: @@ -2508,16 +3255,14 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) + rc = emulate_grp9(ctxt, ops); + if (rc != X86EMUL_CONTINUE) goto done; - c->dst.type = OP_NONE; break; } goto writeback; cannot_emulate: DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; return -1; } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 296aba49472a..0150affad25d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) "pit: " fmt #include <linux/kvm_host.h> +#include <linux/slab.h> #include "irq.h" #include "i8254.h" @@ -242,11 +243,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -467,6 +468,9 @@ static int pit_ioport_read(struct kvm_io_device *this, return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); @@ -602,7 +606,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -621,7 +625,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); + raw_spin_lock_init(&pit->pit_state.inject_lock); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -642,13 +646,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -657,11 +661,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: - if (pit->irq_source_id >= 0) - kvm_free_irq_source_id(kvm, pit->irq_source_id); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); + kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; @@ -720,12 +725,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) /* Try to inject pending interrupts when * last one has been acked. */ - spin_lock(&ps->inject_lock); + raw_spin_lock(&ps->inject_lock); if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { ps->irq_ack = 0; inject = 1; } - spin_unlock(&ps->inject_lock); + raw_spin_unlock(&ps->inject_lock); if (inject) __inject_pit_timer_intr(kvm); } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index d4c1c7ffdc09..900d6b0ba7c2 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - spinlock_t inject_lock; + raw_spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index d057c0cbd245..93825ff3338f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,12 +26,36 @@ * Port from Qemu. */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/bitops.h> #include "irq.h" #include <linux/kvm_host.h> #include "trace.h" +static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) +{ + raw_spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) +{ + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->wakeup_needed = false; + + raw_spin_unlock(&s->lock); + + if (wakeup) { + vcpu = s->kvm->bsp_vcpu; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -44,18 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - spin_unlock(&s->pics_state->lock); + pic_unlock(s->pics_state); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - spin_lock(&s->pics_state->lock); + pic_lock(s->pics_state); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + + pic_lock(s); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + pic_unlock(s); } /* @@ -156,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + pic_lock(s); pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -166,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + pic_unlock(s); return ret; } @@ -203,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -228,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); return intno; } @@ -442,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -455,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -472,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -486,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -503,7 +528,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -520,7 +545,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - spin_lock_init(&s->lock); + raw_spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; @@ -533,7 +558,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; @@ -541,3 +568,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) return s; } + +void kvm_destroy_pic(struct kvm *kvm) +{ + struct kvm_pic *vpic = kvm->arch.vpic; + + if (vpic) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm->arch.vpic = NULL; + kfree(vpic); + } +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index be399e207d57..cd1f362f413d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -62,7 +62,8 @@ struct kvm_kpic_state { }; struct kvm_pic { - spinlock_t lock; + raw_spinlock_t lock; + bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -75,6 +76,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6a4403..cff851cf5322 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,6 +1,11 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS +#define KVM_POSSIBLE_CR4_GUEST_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; + if (tmask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; + if (tmask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 55c7524dda54..64bc6ea78d90 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -10,9 +10,7 @@ struct kvm_timer { }; struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); + bool (*is_periodic)(struct kvm_timer *); }; - enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); - diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba8c045da782..1eb7a4ae0c9c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/module.h> #include <linux/math64.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/page.h> @@ -1246,3 +1247,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) return 0; } + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + /* if this is ICR write vector before command */ + if (reg == APIC_ICR) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (reg == APIC_ICR) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 40010b09c4aa..f5fe32c5edad 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + +static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; +} #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89a49fb46a27..81563e76e28f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "x86.h" #include "kvm_cache_regs.h" #include <linux/kvm_host.h> @@ -29,6 +30,8 @@ #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> +#include <linux/srcu.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -136,16 +139,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -153,6 +146,8 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#include <trace/events/kvm.h> + #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -178,12 +173,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) - -struct kvm_unsync_walk { - int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); -}; - -typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); +typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -227,9 +217,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static int is_write_protection(struct kvm_vcpu *vcpu) +static bool is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -239,7 +229,7 @@ static int is_cpuid_PSE36(void) static int is_nx(struct kvm_vcpu *vcpu) { - return vcpu->arch.shadow_efer & EFER_NX; + return vcpu->arch.efer & EFER_NX; } static int is_shadow_present_pte(u64 pte) @@ -253,7 +243,7 @@ static int is_large_pte(u64 pte) return pte & PT_PAGE_SIZE_MASK; } -static int is_writeble_pte(unsigned long pte) +static int is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } @@ -331,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; - set_page_private(page, 0); cache->objects[cache->nobjs++] = page_address(page); } return 0; @@ -442,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int i; gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - slot = gfn_to_memslot_unaliased(kvm, gfn); write_count = slot_largepage_idx(gfn, slot, i); *write_count -= 1; WARN_ON(*write_count < 0); @@ -470,24 +459,10 @@ static int has_wrprotected_page(struct kvm *kvm, static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return PT_PAGE_TABLE_LEVEL; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +478,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,7 +489,10 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; @@ -633,7 +610,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pfn = spte_to_pfn(*spte); if (*spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { @@ -662,6 +639,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } + pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); BUG(); } } @@ -669,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; u64 *prev_spte; int i; @@ -681,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; prev_spte = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { @@ -708,7 +684,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } @@ -732,7 +708,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { + if (is_writable_pte(*spte)) { rmap_remove(kvm, spte); --kvm->stat.lpages; __set_spte(spte, shadow_trap_nonpresent_pte); @@ -787,7 +763,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writeble_pte(*spte)) + if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); __set_spte(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); @@ -805,35 +781,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, unsigned long data)) { int i, j; + int ret; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = kvm_memslots(kvm); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - retval |= handler(kvm, &memslot->rmap[gfn_offset], - data); + ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { int idx = gfn_offset; idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); - retval |= handler(kvm, + ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); } + trace_kvm_age_page(hva, memslot, ret); + retval |= ret; } } @@ -856,9 +829,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * Emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -937,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&sp->oos_link); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; @@ -1021,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, } -static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - mmu_parent_walk_fn fn) +static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; @@ -1031,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(vcpu, parent_sp); - mmu_parent_walk(vcpu, parent_sp, fn); + fn(parent_sp); + mmu_parent_walk(parent_sp, fn); return; } hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) @@ -1040,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!pte_chain->parent_ptes[i]) break; parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(vcpu, parent_sp); - mmu_parent_walk(vcpu, parent_sp, fn); + fn(parent_sp); + mmu_parent_walk(parent_sp, fn); } } @@ -1078,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) } } -static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int unsync_walk_fn(struct kvm_mmu_page *sp) { kvm_mmu_update_parents_unsync(sp); return 1; } -static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - mmu_parent_walk(vcpu, sp, unsync_walk_fn); + mmu_parent_walk(sp, unsync_walk_fn); kvm_mmu_update_parents_unsync(sp); } @@ -1213,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); + trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } @@ -1221,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - if (sp->role.glevels != vcpu->arch.mmu.root_level) { + if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - trace_kvm_mmu_sync_page(sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1343,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu.base_role; role.level = level; role.direct = direct; + if (role.direct) + role.cr4_pae = 0; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -1363,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(sp); } trace_kvm_mmu_get_page(sp, false); return sp; @@ -1502,8 +1480,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1554,14 +1532,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1583,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) r = 0; index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; +restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; if (kvm_mmu_zap_page(kvm, sp)) - n = bucket->first; + goto restart; } return r; } @@ -1603,19 +1584,21 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; +restart: hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; } } } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1635,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) } } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - struct page *page; - - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return page; -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -1761,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) struct kvm_mmu_page *s; struct hlist_node *node, *n; - trace_kvm_mmu_unsync_page(sp); index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ @@ -1771,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } + trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(sp); mmu_convert_notrap(sp); return 0; @@ -1852,7 +1821,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*sptep)) + if (!can_unsync && is_writable_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1860,7 +1829,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) + if (is_writable_pte(spte)) spte &= ~PT_WRITABLE_MASK; } } @@ -1881,7 +1850,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*sptep); + int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1932,7 +1901,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); } else { - if (was_writeble) + if (was_writable) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); @@ -2090,21 +2059,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (tdp_enabled) - direct = 1; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (tdp_enabled) { + direct = 1; + root_gfn = 0; + } + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } direct = !is_paging(vcpu); - if (tdp_enabled) - direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2120,11 +2091,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (tdp_enabled) { + direct = 1; + root_gfn = i << 30; + } + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); @@ -2162,8 +2140,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2305,13 +2286,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; context->rsvd_bits_mask[0][0] = 0; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + if (!is_pse(vcpu)) { + context->rsvd_bits_mask[1][1] = 0; + break; + } + if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = @@ -2324,7 +2311,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | @@ -2342,7 +2329,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; } } @@ -2444,7 +2431,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) else r = paging32_init_context(vcpu); - vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return r; } @@ -2484,7 +2472,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); if (r) @@ -2533,7 +2523,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (sp->role.glevels == PT32_ROOT_LEVEL) + if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); else paging64_update_pte(vcpu, sp, spte, new); @@ -2568,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + u64 gpte) { gfn_t gfn; - int r; - u64 gpte = 0; pfn_t pfn; - if (bytes != 4 && bytes != 8) - return; - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). - */ - if (is_pae(vcpu)) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if ((bytes == 4) && (gpa % 4 == 0)) { - r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); - if (r) - return; - memcpy((void *)&gpte + (gpa % 8), new, 4); - } else if ((bytes == 8) && (gpa % 8 == 0)) { - memcpy((void *)&gpte, new, 8); - } - } else { - if ((bytes == 4) && (gpa % 4 == 0)) - memcpy((void *)&gpte, new, 4); - } if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -2646,10 +2611,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int flooded = 0; int npte; int r; + int invlpg_counter; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); + + invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if ((is_pae(vcpu) && bytes == 4) || !new) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if (is_pae(vcpu)) { + gpa &= ~(gpa_t)7; + bytes = 8; + } + r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); + if (r) + gentry = 0; + new = (const u8 *)&gentry; + } + + switch (bytes) { + case 4: + gentry = *(const u32 *)new; + break; + case 8: + gentry = *(const u64 *)new; + break; + default: + gentry = 0; + break; + } + + mmu_guess_page_from_pte_write(vcpu, gpa, gentry); spin_lock(&vcpu->kvm->mmu_lock); + if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) + gentry = 0; kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; @@ -2668,10 +2669,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + +restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; - pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; if (misaligned || flooded) { @@ -2688,14 +2691,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); if (kvm_mmu_zap_page(vcpu->kvm, sp)) - n = bucket->first; + goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; } page_offset = offset; level = sp->role.level; npte = 1; - if (sp->role.glevels == PT32_ROOT_LEVEL) { + if (!sp->role.cr4_pae) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map @@ -2713,20 +2716,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, continue; } spte = &sp->spt[page_offset / sizeof(*spte)]; - if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { - gentry = 0; - r = kvm_read_guest_atomic(vcpu->kvm, - gpa & ~(u64)(pte_size - 1), - &gentry, pte_size); - new = (const void *)&gentry; - if (r < 0) - new = NULL; - } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (new) - mmu_pte_write_new_pte(vcpu, sp, spte, new); + if (gentry) + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } @@ -2747,7 +2741,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2847,16 +2841,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) */ page = alloc_page(GFP_KERNEL | __GFP_DMA32); if (!page) - goto error_1; + return -ENOMEM; + vcpu->arch.mmu.pae_root = page_address(page); for (i = 0; i < 4; ++i) vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; } int kvm_mmu_create(struct kvm_vcpu *vcpu) @@ -2909,22 +2900,23 @@ void kvm_mmu_zap_all(struct kvm *kvm) struct kvm_mmu_page *sp, *node; spin_lock(&kvm->mmu_lock); +restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) if (kvm_mmu_zap_page(kvm, sp)) - node = container_of(kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); + goto restart; + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + return kvm_mmu_zap_page(kvm, page) + 1; } static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) @@ -2936,23 +2928,22 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int npages, idx, freed_pages; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - kvm_mmu_remove_one_alloc_mmu_page(kvm); - cache_count--; + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); @@ -3019,9 +3010,12 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; + + slots = kvm_memslots(kvm); - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3182,8 +3176,7 @@ static gva_t canonicalize(gva_t gva) } -typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *sptep); +typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, inspect_spte_fn fn) @@ -3199,7 +3192,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, child = page_header(ent & PT64_BASE_ADDR_MASK); __mmu_spte_walk(kvm, child, fn); } else - fn(kvm, sp, &sp->spt[i]); + fn(kvm, &sp->spt[i]); } } } @@ -3246,7 +3239,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3290,11 +3283,15 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_memslots *slots; int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3317,10 +3314,11 @@ static int count_rmaps(struct kvm_vcpu *vcpu) } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } -void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; struct kvm_mmu_page *rev_sp; @@ -3336,14 +3334,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) printk(KERN_ERR "%s: no memslot for gfn %ld\n", audit_msg, gfn); printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", - audit_msg, sptep - rev_sp->spt, + audit_msg, (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); return; } rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - is_large_pte(*sptep)); + rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; @@ -3378,7 +3376,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } } return; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b3884b49..be66759321a5 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,6 +38,16 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) @@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3e4a5c6ca2a9..42f07b1bfbc9 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -6,14 +6,12 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE mmutrace #define KVM_MMU_PAGE_FIELDS \ __field(__u64, gfn) \ __field(__u32, role) \ __field(__u32, root_count) \ - __field(__u32, unsync) + __field(bool, unsync) #define KVM_MMU_PAGE_ASSIGN(sp) \ __entry->gfn = sp->gfn; \ @@ -30,14 +28,14 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ + trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ " %snxe root %u %s%c", \ - __entry->gfn, role.level, role.glevels, \ + __entry->gfn, role.level, \ + role.cr4_pae ? " pae" : "", \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ - role.cr4_pge ? "" : "!", \ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ @@ -94,15 +92,15 @@ TRACE_EVENT( TP_printk("pte %llx level %u", __entry->pte, __entry->level) ); -/* We set a pte accessed bit */ -TRACE_EVENT( - kvm_mmu_set_accessed_bit, +DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), TP_STRUCT__entry( __field(__u64, gpa) - ), + ), TP_fast_assign( __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) @@ -112,22 +110,20 @@ TRACE_EVENT( TP_printk("gpa %llx", __entry->gpa) ); -/* We set a pte dirty bit */ -TRACE_EVENT( - kvm_mmu_set_dirty_bit, +/* We set a pte accessed bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - TP_ARGS(table_gfn, index, size), - TP_STRUCT__entry( - __field(__u64, gpa) - ), + TP_ARGS(table_gfn, index, size) +); - TP_fast_assign( - __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) - + index * size; - ), +/* We set a pte dirty bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, - TP_printk("gpa %llx", __entry->gpa) + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) ); TRACE_EVENT( @@ -166,55 +162,45 @@ TRACE_EVENT( __entry->created ? "new" : "existing") ); -TRACE_EVENT( - kvm_mmu_sync_page, +DECLARE_EVENT_CLASS(kvm_mmu_page_class, + TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp), TP_STRUCT__entry( KVM_MMU_PAGE_FIELDS - ), + ), TP_fast_assign( KVM_MMU_PAGE_ASSIGN(sp) - ), + ), TP_printk("%s", KVM_MMU_PAGE_PRINTK()) ); -TRACE_EVENT( - kvm_mmu_unsync_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), - - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) + TP_ARGS(sp) ); -TRACE_EVENT( - kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), + TP_ARGS(sp) +); - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) + TP_ARGS(sp) ); - #endif /* _TRACE_KVMMMU_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mmutrace + /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ede2131a9225..89d66ca4d87c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -162,7 +162,7 @@ walk: if (rsvd_fault) goto access_error; - if (write_fault && !is_writeble_pte(pte)) + if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) goto access_error; @@ -170,7 +170,7 @@ walk: goto access_error; #if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) + if (fetch_fault && (pte & PT64_NX_MASK)) goto access_error; #endif @@ -190,10 +190,10 @@ walk: if ((walker->level == PT_PAGE_TABLE_LEVEL) || ((walker->level == PT_DIRECTORY_LEVEL) && - (pte & PT_PAGE_SIZE_MASK) && + is_large_pte(pte) && (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && - (pte & PT_PAGE_SIZE_MASK) && + is_large_pte(pte) && is_long_mode(vcpu))) { int lvl = walker->level; @@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; + u64 new_spte; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!is_present_gpte(gpte)) - __set_spte(spte, shadow_notrap_nonpresent_pte); + if (!is_present_gpte(gpte)) { + if (page->unsync) + new_spte = shadow_trap_nonpresent_pte; + else + new_spte = shadow_notrap_nonpresent_pte; + __set_spte(spte, new_spte); + } return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -457,6 +463,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || - ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { + if (is_last_spte(*sptep, level)) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + int offset, shift; + + shift = PAGE_SHIFT - + (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; + offset = sp->role.quadrant << shift; + + pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -487,21 +501,36 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); + + atomic_inc(&vcpu->kvm->arch.invlpg_counter); + spin_unlock(&vcpu->kvm->mmu_lock); + + if (pte_gpa == -1) + return; + + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } @@ -546,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; bool reset_host_protection; + gpa_t first_pte_gpa; offset = nr_present = 0; if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; + first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { unsigned pte_access; pt_element_t gpte; @@ -561,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (!is_shadow_present_pte(sp->spt[i])) continue; - pte_gpa = gfn_to_gpa(sp->gfn); - pte_gpa += (i+offset) * sizeof(pt_element_t); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d9b33843c80..96dc232bfc56 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -26,6 +26,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> #include <asm/desc.h> @@ -43,10 +44,11 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ @@ -69,6 +71,7 @@ struct kvm_vcpu; struct nested_state { struct vmcb *hsave; u64 hsave_msr; + u64 vm_cr_msr; u64 vmcb; /* These are the merged vectors */ @@ -76,6 +79,7 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + u64 vmcb_iopm; /* A VMEXIT is required but not yet emulated */ bool exit_required; @@ -90,6 +94,9 @@ struct nested_state { }; +#define MSRPM_OFFSETS 16 +static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -109,13 +116,39 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + + unsigned int3_injected; + unsigned long int3_rip; +}; + +#define MSR_INVALID 0xffffffffU + +static struct svm_direct_access_msrs { + u32 index; /* Index of the MSR */ + bool always; /* True if intercept is always on */ +} direct_access_msrs[] = { + { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_IA32_SYSENTER_CS, .always = true }, +#ifdef CONFIG_X86_64 + { .index = MSR_GS_BASE, .always = true }, + { .index = MSR_FS_BASE, .always = true }, + { .index = MSR_KERNEL_GS_BASE, .always = true }, + { .index = MSR_LSTAR, .always = true }, + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, +#endif + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, + { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_INVALID, .always = false }, }; /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; #else -static bool npt_enabled = false; +static bool npt_enabled; #endif static int npt = 1; @@ -128,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); +static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -162,8 +196,8 @@ static unsigned long iopm_base; struct kvm_ldttss_desc { u16 limit0; u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + unsigned base1:8, type:5, dpl:2, p:1; + unsigned limit1:4, zero0:3, g:1, base2:8; u32 base3; u32 zero1; } __attribute__((packed)); @@ -193,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) +static u32 svm_msrpm_offset(u32 msr) +{ + u32 offset; + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr < msrpm_ranges[i] || + msr >= msrpm_ranges[i] + MSRS_IN_RANGE) + continue; + + offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ + offset += (i * MSRS_RANGE_SIZE); /* add range offset */ + + /* Now we have the u8 offset - but need the u32 offset */ + return offset / 4; + } + + /* MSR not in any range */ + return MSR_INVALID; +} + #define MAX_INST_SIZE 15 static inline u32 svm_has(u32 feat) @@ -212,7 +267,7 @@ static inline void stgi(void) static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); + asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); } static inline void force_new_asid(struct kvm_vcpu *vcpu) @@ -231,24 +286,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.shadow_efer = efer; -} - -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* If we are within a nested VM we'd better #VMEXIT and let the - guest handle the exception */ - if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; + vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) @@ -263,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -282,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->vmcb->control.next_rip != 0) + svm->next_rip = svm->vmcb->control.next_rip; + if (!svm->next_rip) { if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != EMULATE_DONE) @@ -296,6 +337,43 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code, + bool reinject) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If we are within a nested VM we'd better #VMEXIT and let the guest + * handle the exception + */ + if (!reinject && + nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + + if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { + unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + + /* + * For guest debugging where we have to reinject #BP if some + * INT3 is guest-owned: + * Emulate nRIP by moving RIP forward. Will fail if injection + * raises a fault that is not intercepted. Still better than + * failing in all cases. + */ + skip_emulated_instruction(&svm->vcpu); + rip = kvm_rip_read(&svm->vcpu); + svm->int3_rip = rip + svm->vmcb->save.cs.base; + svm->int3_injected = rip - old_rip; + } + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; +} + static int has_svm(void) { const char *msg; @@ -318,7 +396,7 @@ static int svm_hardware_enable(void *garbage) struct svm_cpu_data *sd; uint64_t efer; - struct descriptor_table gdt_descr; + struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -343,8 +421,8 @@ static int svm_hardware_enable(void *garbage) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - kvm_get_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.base; + native_store_gdt(&gdt_descr); + gdt = (struct desc_struct *)gdt_descr.address; sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); @@ -390,42 +468,98 @@ err_1: } +static bool valid_msr_intercept(u32 index) +{ + int i; + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) + if (direct_access_msrs[i].index == index) + return true; + + return false; +} + static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { + u8 bit_read, bit_write; + unsigned long tmp; + u32 offset; + + /* + * If this warning triggers extend the direct_access_msrs list at the + * beginning of the file + */ + WARN_ON(!valid_msr_intercept(msr)); + + offset = svm_msrpm_offset(msr); + bit_read = 2 * (msr & 0x0f); + bit_write = 2 * (msr & 0x0f) + 1; + tmp = msrpm[offset]; + + BUG_ON(offset == MSR_INVALID); + + read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); + write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + + msrpm[offset] = tmp; +} + +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ int i; - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr >= msrpm_ranges[i] && - msr < msrpm_ranges[i] + MSRS_IN_RANGE) { - u32 msr_offset = (i * MSRS_IN_RANGE + msr - - msrpm_ranges[i]) * 2; - - u32 *base = msrpm + (msr_offset / 32); - u32 msr_shift = msr_offset % 32; - u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); - *base = (*base & ~(0x3 << msr_shift)) | - (mask << msr_shift); + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + if (!direct_access_msrs[i].always) + continue; + + set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + } +} + +static void add_msr_offset(u32 offset) +{ + int i; + + for (i = 0; i < MSRPM_OFFSETS; ++i) { + + /* Offset already in list? */ + if (msrpm_offsets[i] == offset) return; - } + + /* Slot used by another offset? */ + if (msrpm_offsets[i] != MSR_INVALID) + continue; + + /* Add offset to list */ + msrpm_offsets[i] = offset; + + return; } + + /* + * If this BUG triggers the msrpm_offsets table has an overflow. Just + * increase MSRPM_OFFSETS in this case. + */ BUG(); } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static void init_msrpm_offsets(void) { - memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + int i; -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); + memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 offset; + + offset = svm_msrpm_offset(direct_access_msrs[i].index); + BUG_ON(offset == MSR_INVALID); + + add_msr_offset(offset); + } } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -466,6 +600,8 @@ static __init int svm_hardware_setup(void) memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + init_msrpm_offsets(); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -522,7 +658,7 @@ static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | - SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ seg->limit = 0xffff; seg->base = 0; } @@ -540,25 +676,33 @@ static void init_vmcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - control->intercept_cr_read = INTERCEPT_CR0_MASK | + svm->vcpu.fpu_active = 1; + + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; - control->intercept_cr_write = INTERCEPT_CR0_MASK | + control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK | INTERCEPT_CR8_MASK; - control->intercept_dr_read = INTERCEPT_DR0_MASK | + control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | - INTERCEPT_DR3_MASK; + INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | + INTERCEPT_DR7_MASK; - control->intercept_dr_write = INTERCEPT_DR0_MASK | + control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | + INTERCEPT_DR4_MASK | INTERCEPT_DR5_MASK | + INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | @@ -566,9 +710,10 @@ static void init_vmcb(struct vcpu_svm *svm) (1 << MC_VECTOR); - control->intercept = (1ULL << INTERCEPT_INTR) | + control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_SELECTIVE_CR0) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -626,7 +771,8 @@ static void init_vmcb(struct vcpu_svm *svm) save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* This is the guest-visible cr0 value. + /* + * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; @@ -641,10 +787,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); - control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); - control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| - INTERCEPT_CR3_MASK); + control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; + control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; save->g_pat = 0x0007040600070406ULL; save->cr3 = 0; save->cr4 = 0; @@ -698,30 +842,30 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); + svm_vcpu_init_msrpm(svm->nested.msrpm); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -730,13 +874,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: @@ -765,14 +914,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { u64 delta; - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; + if (check_tsc_unstable()) { + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + delta = vcpu->arch.host_tsc - native_read_tsc(); + svm->vmcb->control.tsc_offset += delta; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += delta; + } vcpu->cpu = cpu; kvm_migrate_timers(vcpu); svm->asid_generation = 0; @@ -868,7 +1019,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* AMD's VMCB does not have an explicit unusable field, so emulate it + /* + * AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ var->unusable = !var->present || (var->type == 0); @@ -904,7 +1056,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->type |= 0x1; break; case VCPU_SREG_SS: - /* On AMD CPUs sometimes the DB bit in the segment + /* + * On AMD CPUs sometimes the DB bit in the segment * descriptor is left as 1, although the whole segment has * been made unusable. Clear it here to pass an Intel VMX * entry check when cross vendor migrating. @@ -922,74 +1075,127 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } -static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.idtr.limit; - dt->base = svm->vmcb->save.idtr.base; + dt->size = svm->vmcb->save.idtr.limit; + dt->address = svm->vmcb->save.idtr.base; } -static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.idtr.limit = dt->limit; - svm->vmcb->save.idtr.base = dt->base ; + svm->vmcb->save.idtr.limit = dt->size; + svm->vmcb->save.idtr.base = dt->address ; } -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.gdtr.limit; - dt->base = svm->vmcb->save.gdtr.base; + dt->size = svm->vmcb->save.gdtr.limit; + dt->address = svm->vmcb->save.gdtr.base; } -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.gdtr.limit = dt->limit; - svm->vmcb->save.gdtr.base = dt->base ; + svm->vmcb->save.gdtr.limit = dt->size; + svm->vmcb->save.gdtr.base = dt->address ; +} + +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ } static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } +static void update_cr0_intercept(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + ulong gcr0 = svm->vcpu.arch.cr0; + u64 *hcr0 = &svm->vmcb->save.cr0; + + if (!svm->vcpu.fpu_active) + *hcr0 |= SVM_CR0_SELECTIVE_MASK; + else + *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) + | (gcr0 & SVM_CR0_SELECTIVE_MASK); + + + if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { + vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; + vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; + } + } else { + svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } + } +} + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm)) { + /* + * We are here because we run in nested mode, the host kvm + * intercepts cr0 writes but the l1 hypervisor does not. + * But the L1 hypervisor may intercept selective cr0 writes. + * This needs to be checked here. + */ + unsigned long old, new; + + /* Remove bits that would trigger a real cr0 write intercept */ + old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; + new = cr0 & SVM_CR0_SELECTIVE_MASK; + + if (old == new) { + /* cr0 write with ts and mp unchanged */ + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + return; + } + } + #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->arch.efer |= EFER_LMA; svm->vmcb->save.efer |= EFER_LMA | EFER_LME; } if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); } } #endif - if (npt_enabled) - goto set; + vcpu->arch.cr0 = cr0; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } + if (!npt_enabled) + cr0 |= X86_CR0_PG | X86_CR0_WP; - vcpu->arch.cr0 = cr0; - cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } -set: /* * re-enable caching here because the QEMU bios * does not do it - this results in some delay at @@ -997,6 +1203,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + update_cr0_intercept(svm); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1102,76 +1309,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) -{ - struct vcpu_svm *svm = to_svm(vcpu); - unsigned long val; - - switch (dr) { - case 0 ... 3: - val = vcpu->arch.db[dr]; - break; - case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr6; - else - val = svm->vmcb->save.dr6; - break; - case 7: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - val = vcpu->arch.dr7; - else - val = svm->vmcb->save.dr7; - break; - default: - val = 0; - } - - return val; -} - -static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, - int *exception) +static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - *exception = 0; - - switch (dr) { - case 0 ... 3: - vcpu->arch.db[dr] = value; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = value; - return; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - *exception = UD_VECTOR; - return; - case 6: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } - vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - return; - case 7: - if (value & 0xffffffff00000000ULL) { - *exception = GP_VECTOR; - return; - } - vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - svm->vmcb->save.dr7 = vcpu->arch.dr7; - vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); - } - return; - default: - /* FIXME: Possible case? */ - printk(KERN_DEBUG "%s: unexpected dr %u\n", - __func__, dr); - *exception = UD_VECTOR; - return; - } + svm->vmcb->save.dr7 = value; } static int pf_interception(struct vcpu_svm *svm) @@ -1208,7 +1350,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; @@ -1239,13 +1381,32 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } -static int nm_interception(struct vcpu_svm *svm) +static void svm_fpu_activate(struct kvm_vcpu *vcpu) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) - svm->vmcb->save.cr0 &= ~X86_CR0_TS; + struct vcpu_svm *svm = to_svm(vcpu); + u32 excp; + + if (is_nested(svm)) { + u32 h_excp, n_excp; + + h_excp = svm->nested.hsave->control.intercept_exceptions; + n_excp = svm->nested.intercept_exceptions; + h_excp &= ~(1 << NM_VECTOR); + excp = h_excp | n_excp; + } else { + excp = svm->vmcb->control.intercept_exceptions; + excp &= ~(1 << NM_VECTOR); + } + + svm->vmcb->control.intercept_exceptions = excp; + svm->vcpu.fpu_active = 1; + update_cr0_intercept(svm); +} +static int nm_interception(struct vcpu_svm *svm) +{ + svm_fpu_activate(&svm->vcpu); return 1; } @@ -1279,29 +1440,23 @@ static int shutdown_interception(struct vcpu_svm *svm) static int io_interception(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; - - svm->next_rip = svm->vmcb->control.exit_info_2; - string = (io_info & SVM_IOIO_STR_MASK) != 0; - - if (string) { - if (emulate_instruction(&svm->vcpu, - 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + if (string || in) + return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - + svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_pio(&svm->vcpu, in, size, port); + + return kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -1337,7 +1492,7 @@ static int vmmcall_interception(struct vcpu_svm *svm) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) + if (!(svm->vcpu.arch.efer & EFER_SVME) || !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; @@ -1354,6 +1509,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { + int vmexit; + if (!is_nested(svm)) return 0; @@ -1362,21 +1519,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - return nested_svm_exit_handled(svm); + vmexit = nested_svm_intercept(svm); + if (vmexit == NESTED_EXIT_DONE) + svm->nested.exit_required = true; + + return vmexit; } -static inline int nested_svm_intr(struct vcpu_svm *svm) +/* This function returns true if it is save to enable the irq window */ +static inline bool nested_svm_intr(struct vcpu_svm *svm) { if (!is_nested(svm)) - return 0; + return true; if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + return true; if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + return false; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; if (svm->nested.intercept & 1ULL) { /* @@ -1387,21 +1551,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) */ svm->nested.exit_required = true; trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return 1; + return false; } - return 0; + return true; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) +/* This function returns true if it is save to enable the nmi window */ +static inline bool nested_svm_nmi(struct vcpu_svm *svm) +{ + if (!is_nested(svm)) + return true; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) + return true; + + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->nested.exit_required = true; + + return false; +} + +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) { struct page *page; + might_sleep(); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; - return kmap_atomic(page, idx); + *_page = page; + + return kmap(page); error: kvm_release_page_clean(page); @@ -1410,61 +1593,55 @@ error: return NULL; } -static void nested_svm_unmap(void *addr, enum km_type idx) +static void nested_svm_unmap(struct page *page) { - struct page *page; + kunmap(page); + kvm_release_page_dirty(page); +} - if (!addr) - return; +static int nested_svm_intercept_ioio(struct vcpu_svm *svm) +{ + unsigned port; + u8 val, bit; + u64 gpa; - page = kmap_atomic_to_page(addr); + if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + return NESTED_EXIT_HOST; - kunmap_atomic(addr, idx); - kvm_release_page_dirty(page); + port = svm->vmcb->control.exit_info_1 >> 16; + gpa = svm->nested.vmcb_iopm + (port / 8); + bit = port % 8; + val = 0; + + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) + val &= (1 << bit); + + return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } -static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - u32 param = svm->vmcb->control.exit_info_1 & 1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - bool ret = false; - u32 t0, t1; - u8 *msrpm; + u32 offset, msr, value; + int write, mask; if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return false; + return NESTED_EXIT_HOST; - msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + offset = svm_msrpm_offset(msr); + write = svm->vmcb->control.exit_info_1 & 1; + mask = 1 << ((2 * (msr & 0xf)) + write); - if (!msrpm) - goto out; - - switch (msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc0000000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc0000000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc0010000 ... 0xc0011fff: - t0 = (16384 + msr - 0xc0010000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - ret = true; - goto out; - } + if (offset == MSR_INVALID) + return NESTED_EXIT_DONE; - ret = msrpm[t1] & ((1 << param) << t0); + /* Offset is in 32 bit units but need in 8 bit units */ + offset *= 4; -out: - nested_svm_unmap(msrpm, KM_USER0); + if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + return NESTED_EXIT_DONE; - return ret; + return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_special(struct vcpu_svm *svm) @@ -1474,17 +1651,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: + case SVM_EXIT_EXCP_BASE + MC_VECTOR: return NESTED_EXIT_HOST; - /* For now we are always handling NPFs when using them */ case SVM_EXIT_NPF: + /* For now we are always handling NPFs when using them */ if (npt_enabled) return NESTED_EXIT_HOST; break; - /* When we're shadowing, trap PFs */ case SVM_EXIT_EXCP_BASE + PF_VECTOR: + /* When we're shadowing, trap PFs */ if (!npt_enabled) return NESTED_EXIT_HOST; break; + case SVM_EXIT_EXCP_BASE + NM_VECTOR: + nm_interception(svm); + break; default: break; } @@ -1495,7 +1676,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) /* * If this function returns true, this #vmexit was already handled */ -static int nested_svm_exit_handled(struct vcpu_svm *svm) +static int nested_svm_intercept(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; @@ -1504,6 +1685,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); break; + case SVM_EXIT_IOIO: + vmexit = nested_svm_intercept_ioio(svm); + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr_read & cr_bits) @@ -1534,6 +1718,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) vmexit = NESTED_EXIT_DONE; break; } + case SVM_EXIT_ERR: { + vmexit = NESTED_EXIT_DONE; + break; + } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); if (svm->nested.intercept & exit_bits) @@ -1541,9 +1729,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) } } - if (vmexit == NESTED_EXIT_DONE) { + return vmexit; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + int vmexit; + + vmexit = nested_svm_intercept(svm); + + if (vmexit == NESTED_EXIT_DONE) nested_svm_vmexit(svm); - } return vmexit; } @@ -1585,6 +1781,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, vmcb->control.exit_info_1, @@ -1592,10 +1789,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info, vmcb->control.exit_int_info_err); - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); if (!nested_vmcb) return 1; + /* Exit nested SVM mode */ + svm->nested.vmcb = 0; + /* Give the current vmcb to the guest */ disable_gif(svm); @@ -1605,9 +1805,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; - if (npt_enabled) - nested_vmcb->save.cr3 = vmcb->save.cr3; + nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); + nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = vmcb->save.rflags; nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; @@ -1679,10 +1880,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - /* Exit nested SVM mode */ - svm->nested.vmcb = 0; - - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -1692,19 +1890,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - u32 *nested_msrpm; + /* + * This function merges the msr permission bitmaps of kvm and the + * nested vmcb. It is omptimized in that it only merges the parts where + * the kvm msr permission bitmap may contain zero bits + */ int i; - nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); - if (!nested_msrpm) - return false; + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return true; - for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) - svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + for (i = 0; i < MSRPM_OFFSETS; i++) { + u32 value, p; + u64 offset; - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + if (msrpm_offsets[i] == 0xffffffff) + break; + + p = msrpm_offsets[i]; + offset = svm->nested.vmcb_msrpm + (p * 4); - nested_svm_unmap(nested_msrpm, KM_USER0); + if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + return false; + + svm->nested.msrpm[p] = svm->msrpm[p] | value; + } + + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); return true; } @@ -1714,34 +1926,42 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return false; - /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested.vmcb = svm->vmcb->save.rax; - - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb, + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, nested_vmcb->control.nested_ctl); + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, + nested_vmcb->control.intercept_cr_write, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); - /* Save the old vmcb, so we don't need to pick what we save, but - can restore everything when a VMEXIT occurs */ + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ hsave->save.es = vmcb->save.es; hsave->save.cs = vmcb->save.cs; hsave->save.ss = vmcb->save.ss; hsave->save.ds = vmcb->save.ds; hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -1773,14 +1993,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (npt_enabled) { svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else { + } else kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - kvm_mmu_reset_context(&svm->vcpu); - } + + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + /* In case we don't even reach vcpu_run, the fields are not updated */ svm->vmcb->save.rax = nested_vmcb->save.rax; svm->vmcb->save.rsp = nested_vmcb->save.rsp; @@ -1789,22 +2012,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; - /* We don't want a nested guest to be more powerful than the guest, - so all intercepts are ORed */ - svm->vmcb->control.intercept_cr_read |= - nested_vmcb->control.intercept_cr_read; - svm->vmcb->control.intercept_cr_write |= - nested_vmcb->control.intercept_cr_write; - svm->vmcb->control.intercept_dr_read |= - nested_vmcb->control.intercept_dr_read; - svm->vmcb->control.intercept_dr_write |= - nested_vmcb->control.intercept_dr_write; - svm->vmcb->control.intercept_exceptions |= - nested_vmcb->control.intercept_exceptions; - - svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; + svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; /* cache intercepts */ svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; @@ -1821,13 +2030,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of the guest */ + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + } + + /* We don't want to see VMMCALLs from a nested guest */ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); + + /* + * We don't want a nested guest to be more powerful than the guest, so + * all intercepts are ORed + */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested.vmcb = vmcb_gpa; enable_gif(svm); @@ -1853,6 +2092,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; + struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -1860,12 +2100,12 @@ static int vmload_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); return 1; } @@ -1873,6 +2113,7 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; + struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -1880,12 +2121,12 @@ static int vmsave_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); return 1; } @@ -1988,6 +2229,8 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; uint32_t idt_v = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; + bool has_error_code = false; + u32 error_code = 0; tss_selector = (u16)svm->vmcb->control.exit_info_1; @@ -2008,6 +2251,12 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { + has_error_code = true; + error_code = + (u32)svm->vmcb->control.exit_info_2; + } kvm_clear_exception_queue(&svm->vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: @@ -2024,7 +2273,14 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - return kvm_task_switch(&svm->vcpu, tss_selector, reason); + if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->internal.ndata = 0; + return 0; + } + return 1; } static int cpuid_interception(struct vcpu_svm *svm) @@ -2037,7 +2293,7 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; return 1; } @@ -2115,9 +2371,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_SYSENTER_ESP: *data = svm->sysenter_esp; break; - /* Nobody will change the following 5 values in the VMCB so - we can safely return them on rdmsr. They will always be 0 - until LBRV is implemented. */ + /* + * Nobody will change the following 5 values in the VMCB so we can + * safely return them on rdmsr. They will always be 0 until LBRV is + * implemented. + */ case MSR_IA32_DEBUGCTLMSR: *data = svm->vmcb->save.dbgctl; break; @@ -2137,7 +2395,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = 0; + *data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: *data = 0x01000065; @@ -2153,9 +2411,10 @@ static int rdmsr_interception(struct vcpu_svm *svm) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; - if (svm_get_msr(&svm->vcpu, ecx, &data)) + if (svm_get_msr(&svm->vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); - else { + } else { trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; @@ -2166,6 +2425,31 @@ static int rdmsr_interception(struct vcpu_svm *svm) return 1; } +static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int svm_dis, chg_mask; + + if (data & ~SVM_VM_CR_VALID_MASK) + return 1; + + chg_mask = SVM_VM_CR_VALID_MASK; + + if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) + chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); + + svm->nested.vm_cr_msr &= ~chg_mask; + svm->nested.vm_cr_msr |= (data & chg_mask); + + svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; + + /* check for svm_disable while efer.svme is set */ + if (svm_dis && (vcpu->arch.efer & EFER_SVME)) + return 1; + + return 0; +} + static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2232,6 +2516,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->nested.hsave_msr = data; break; case MSR_VM_CR: + return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; @@ -2247,13 +2532,15 @@ static int wrmsr_interception(struct vcpu_svm *svm) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); - else + } else { + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(&svm->vcpu); + } return 1; } @@ -2293,37 +2580,42 @@ static int pause_interception(struct vcpu_svm *svm) } static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { - [SVM_EXIT_READ_CR0] = emulate_on_interception, - [SVM_EXIT_READ_CR3] = emulate_on_interception, - [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, - /* for now: */ - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, - [SVM_EXIT_WRITE_CR3] = emulate_on_interception, - [SVM_EXIT_WRITE_CR4] = emulate_on_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + [SVM_EXIT_READ_CR8] = emulate_on_interception, + [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_READ_DR4] = emulate_on_interception, + [SVM_EXIT_READ_DR5] = emulate_on_interception, + [SVM_EXIT_READ_DR6] = emulate_on_interception, + [SVM_EXIT_READ_DR7] = emulate_on_interception, [SVM_EXIT_WRITE_DR0] = emulate_on_interception, [SVM_EXIT_WRITE_DR1] = emulate_on_interception, [SVM_EXIT_WRITE_DR2] = emulate_on_interception, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR4] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR6] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, - [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, - [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = intr_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, - /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, @@ -2331,7 +2623,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, - [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, @@ -2354,7 +2646,12 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, svm->vmcb->save.rip); + trace_kvm_exit(exit_code, vcpu); + + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; if (unlikely(svm->nested.exit_required)) { nested_svm_vmexit(svm); @@ -2383,21 +2680,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (npt_enabled) { - int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { - svm_set_cr0(vcpu, svm->vmcb->save.cr0); - mmu_reload = 1; - } - vcpu->arch.cr0 = svm->vmcb->save.cr0; - vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (mmu_reload) { - kvm_mmu_reset_context(vcpu); - kvm_mmu_load(vcpu); - } - } - - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2450,7 +2732,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -2482,6 +2764,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (irr == -1) return; @@ -2493,8 +2778,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); + int ret; + ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); + ret = ret && gif_set(svm) && nested_svm_nmi(svm); + + return ret; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -2510,10 +2799,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); } } @@ -2539,13 +2828,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); - - /* In case GIF=0 we can't rely on the CPU to tell us when - * GIF becomes 1, because that's a separate STGI/VMRUN intercept. - * The next time we get that intercept, this function will be - * called again though and we'll get the vintr intercept. */ - if (gif_set(svm)) { + /* + * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes + * 1, because that's a separate STGI/VMRUN intercept. The next time we + * get that intercept, this function will be called again though and + * we'll get the vintr intercept. + */ + if (gif_set(svm) && nested_svm_intr(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -2559,9 +2848,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ - /* Something prevents NMI from been injected. Single step over - possible problem (IRET or exception injection or interrupt - shadow) */ + /* + * Something prevents NMI from been injected. Single step over possible + * problem (IRET or exception injection or interrupt shadow) + */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_intercept(vcpu); @@ -2585,6 +2875,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); @@ -2596,6 +2889,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; @@ -2606,6 +2902,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; + unsigned int3_injected = svm->int3_injected; + + svm->int3_injected = 0; if (svm->vcpu.arch.hflags & HF_IRET_MASK) svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); @@ -2625,18 +2924,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: - /* In case of software exception do not reinject an exception - vector, but re-execute and instruction instead */ - if (is_nested(svm)) - break; - if (kvm_exception_is_soft(vector)) + /* + * In case of software exceptions, do not reinject the vector, + * but re-execute the instruction instead. Rewind RIP first + * if we emulated INT3 before. + */ + if (kvm_exception_is_soft(vector)) { + if (vector == BP_VECTOR && int3_injected && + kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) + kvm_rip_write(&svm->vcpu, + kvm_rip_read(&svm->vcpu) - + int3_injected); break; + } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; - kvm_queue_exception_e(&svm->vcpu, vector, err); + kvm_requeue_exception_e(&svm->vcpu, vector, err); } else - kvm_queue_exception(&svm->vcpu, vector); + kvm_requeue_exception(&svm->vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(&svm->vcpu, vector, false); @@ -2659,6 +2965,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + /* * A vmexit emulation is required before the vcpu can be executed * again. @@ -2666,10 +2976,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -2798,12 +3104,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = root; force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } } static int is_disabled(void) @@ -2852,25 +3152,43 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} + +static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ + switch (func) { + case 0x8000000A: + entry->eax = 1; /* SVM revision 1 */ + entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper + ASID emulation to nested SVM */ + entry->ecx = 0; /* Reserved */ + entry->edx = 0; /* Do not support any additional features */ + + break; + } +} + static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, @@ -2905,9 +3223,24 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; -static bool svm_gb_page_enable(void) +static int svm_get_lpage_level(void) { - return true; + return PT_PDPE_LEVEL; +} + +static bool svm_rdtscp_supported(void) +{ + return false; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + if (is_nested(svm)) + svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; + update_cr0_intercept(svm); } static struct kvm_x86_ops svm_x86_ops = { @@ -2936,6 +3269,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2945,11 +3279,12 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .get_dr = svm_get_dr, - .set_dr = svm_set_dr, + .set_dr7 = svm_set_dr7, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2975,13 +3310,19 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + + .rdtscp_supported = svm_rdtscp_supported, + + .set_supported_cpuid = svm_set_supported_cpuid, }; static int __init svm_init(void) { return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - THIS_MODULE); + __alignof__(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index eea40439066c..4ddadb1a5ffe 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) /* * There is a race window between reading and incrementing, but we do * not care about potentially loosing timer events in the !reinject - * case anyway. + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. */ if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 816e0449db0b..a6544b8e7c0f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -5,8 +5,6 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm -#define TRACE_INCLUDE_PATH arch/x86/kvm -#define TRACE_INCLUDE_FILE trace /* * Tracepoint for guest mode entry. @@ -56,6 +54,38 @@ TRACE_EVENT(kvm_hypercall, ); /* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hv_hypercall, + TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, + __u64 ingpa, __u64 outgpa), + TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), + + TP_STRUCT__entry( + __field( __u16, code ) + __field( bool, fast ) + __field( __u16, rep_cnt ) + __field( __u16, rep_idx ) + __field( __u64, ingpa ) + __field( __u64, outgpa ) + ), + + TP_fast_assign( + __entry->code = code; + __entry->fast = fast; + __entry->rep_cnt = rep_cnt; + __entry->rep_idx = rep_idx; + __entry->ingpa = ingpa; + __entry->outgpa = outgpa; + ), + + TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", + __entry->code, __entry->fast ? "fast" : "slow", + __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, + __entry->outgpa) +); + +/* * Tracepoint for PIO. */ TRACE_EVENT(kvm_pio, @@ -152,8 +182,8 @@ TRACE_EVENT(kvm_apic, * Tracepoint for kvm guest exit: */ TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), - TP_ARGS(exit_reason, guest_rip), + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), + TP_ARGS(exit_reason, vcpu), TP_STRUCT__entry( __field( unsigned int, exit_reason ) @@ -162,7 +192,7 @@ TRACE_EVENT(kvm_exit, TP_fast_assign( __entry->exit_reason = exit_reason; - __entry->guest_rip = guest_rip; + __entry->guest_rip = kvm_rip_read(vcpu); ), TP_printk("reason %s rip 0x%lx", @@ -189,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq, TP_printk("irq %u", __entry->irq) ); +#define EXS(x) { x##_VECTOR, "#" #x } + +#define kvm_trace_sym_exc \ + EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ + EXS(MF), EXS(MC) + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_exception, + TP_PROTO(unsigned exception, bool has_error, unsigned error_code), + TP_ARGS(exception, has_error, error_code), + + TP_STRUCT__entry( + __field( u8, exception ) + __field( u8, has_error ) + __field( u32, error_code ) + ), + + TP_fast_assign( + __entry->exception = exception; + __entry->has_error = has_error; + __entry->error_code = error_code; + ), + + TP_printk("%s (0x%x)", + __print_symbolic(__entry->exception, kvm_trace_sym_exc), + /* FIXME: don't print error_code if not present */ + __entry->has_error ? __entry->error_code : 0) +); + /* * Tracepoint for page fault. */ @@ -214,28 +276,33 @@ TRACE_EVENT(kvm_page_fault, * Tracepoint for guest MSR access. */ TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), - TP_ARGS(rw, ecx, data), + TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), + TP_ARGS(write, ecx, data, exception), TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, ecx ) - __field( unsigned long, data ) + __field( unsigned, write ) + __field( u32, ecx ) + __field( u64, data ) + __field( u8, exception ) ), TP_fast_assign( - __entry->rw = rw; + __entry->write = write; __entry->ecx = ecx; __entry->data = data; + __entry->exception = exception; ), - TP_printk("msr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->ecx, __entry->data) + TP_printk("msr_%s %x = 0x%llx%s", + __entry->write ? "write" : "read", + __entry->ecx, __entry->data, + __entry->exception ? " (#GP)" : "") ); -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) +#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) +#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) /* * Tracepoint for guest CR access. @@ -376,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun, ), TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s\n", + "event_inj: 0x%08x npt: %s", __entry->rip, __entry->vmcb, __entry->nested_rip, __entry->int_ctl, __entry->event_inj, __entry->npt ? "on" : "off") ); +TRACE_EVENT(kvm_nested_intercepts, + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), + TP_ARGS(cr_read, cr_write, exceptions, intercept), + + TP_STRUCT__entry( + __field( __u16, cr_read ) + __field( __u16, cr_write ) + __field( __u32, exceptions ) + __field( __u64, intercept ) + ), + + TP_fast_assign( + __entry->cr_read = cr_read; + __entry->cr_write = cr_write; + __entry->exceptions = exceptions; + __entry->intercept = intercept; + ), + + TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept) +); /* * Tracepoint for #VMEXIT while nested */ @@ -410,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), @@ -445,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, ), TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), __entry->exit_info1, __entry->exit_info2, @@ -467,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit, __entry->rip = rip ), - TP_printk("rip: 0x%016llx\n", __entry->rip) + TP_printk("rip: 0x%016llx", __entry->rip) ); /* @@ -489,7 +578,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", + TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); @@ -510,11 +599,102 @@ TRACE_EVENT(kvm_skinit, __entry->slb = slb; ), - TP_printk("rip: 0x%016llx slb: 0x%08x\n", + TP_printk("rip: 0x%016llx slb: 0x%08x", __entry->rip, __entry->slb) ); +#define __print_insn(insn, ilen) ({ \ + int i; \ + const char *ret = p->buffer + p->len; \ + \ + for (i = 0; i < ilen; ++i) \ + trace_seq_printf(p, " %02x", insn[i]); \ + trace_seq_printf(p, "%c", 0); \ + ret; \ + }) + +#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) +#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) +#define KVM_EMUL_INSN_F_CS_D (1 << 2) +#define KVM_EMUL_INSN_F_CS_L (1 << 3) + +#define kvm_trace_symbol_emul_flags \ + { 0, "real" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ + { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L, "prot64" } + +#define kei_decode_mode(mode) ({ \ + u8 flags = 0xff; \ + switch (mode) { \ + case X86EMUL_MODE_REAL: \ + flags = 0; \ + break; \ + case X86EMUL_MODE_VM86: \ + flags = KVM_EMUL_INSN_F_EFL_VM; \ + break; \ + case X86EMUL_MODE_PROT16: \ + flags = KVM_EMUL_INSN_F_CR0_PE; \ + break; \ + case X86EMUL_MODE_PROT32: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D; \ + break; \ + case X86EMUL_MODE_PROT64: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L; \ + break; \ + } \ + flags; \ + }) + +TRACE_EVENT(kvm_emulate_insn, + TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), + TP_ARGS(vcpu, failed), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, 15 ) + __field( __u8, flags ) + __field( __u8, failed ) + ), + + TP_fast_assign( + __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; + __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); + __entry->len = vcpu->arch.emulate_ctxt.decode.eip + - vcpu->arch.emulate_ctxt.decode.fetch.start; + memcpy(__entry->insn, + vcpu->arch.emulate_ctxt.decode.fetch.data, + 15); + __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); + __entry->failed = failed; + ), + + TP_printk("%x:%llx:%s (%s)%s", + __entry->csbase, __entry->rip, + __print_insn(__entry->insn, __entry->len), + __print_symbolic(__entry->flags, + kvm_trace_symbol_emul_flags), + __entry->failed ? " failed" : "" + ) + ); + +#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) +#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) + #endif /* _TRACE_KVM_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH arch/x86/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d4918d6fc924..859a01a07dbf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,8 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> +#include <linux/tboot.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -61,6 +63,23 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT) + +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -80,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +#define NR_AUTOLOAD_MSRS 1 + struct vmcs { u32 revision_id; u32 abort; @@ -107,6 +128,11 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; @@ -115,7 +141,7 @@ struct vcpu_vmx { } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -136,6 +162,8 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + bool rdtscp_enabled; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -210,60 +238,60 @@ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static inline int is_page_fault(u32 intr_info) +static inline bool is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_no_device(u32 intr_info) +static inline bool is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_invalid_opcode(u32 intr_info) +static inline bool is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_external_interrupt(u32 intr_info) +static inline bool is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static inline int is_machine_check(u32 intr_info) +static inline bool is_machine_check(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } -static inline int cpu_has_vmx_msr_bitmap(void) +static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } -static inline int cpu_has_vmx_tpr_shadow(void) +static inline bool cpu_has_vmx_tpr_shadow(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline int vm_need_tpr_shadow(struct kvm *kvm) +static inline bool vm_need_tpr_shadow(struct kvm *kvm) { return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } -static inline int cpu_has_secondary_exec_ctrls(void) +static inline bool cpu_has_secondary_exec_ctrls(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -283,71 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void) static inline bool cpu_has_vmx_ept_execute_only(void) { - return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } static inline bool cpu_has_vmx_eptp_uncacheable(void) { - return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); + return vmx_capability.ept & VMX_EPTP_UC_BIT; } static inline bool cpu_has_vmx_eptp_writeback(void) { - return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); + return vmx_capability.ept & VMX_EPTP_WB_BIT; } static inline bool cpu_has_vmx_ept_2m_page(void) { - return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } -static inline int cpu_has_vmx_invept_individual_addr(void) +static inline bool cpu_has_vmx_invept_individual_addr(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; } -static inline int cpu_has_vmx_invept_context(void) +static inline bool cpu_has_vmx_invept_context(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; } -static inline int cpu_has_vmx_invept_global(void) +static inline bool cpu_has_vmx_invept_global(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } -static inline int cpu_has_vmx_ept(void) +static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_EPT; } -static inline int cpu_has_vmx_unrestricted_guest(void) +static inline bool cpu_has_vmx_unrestricted_guest(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_UNRESTRICTED_GUEST; } -static inline int cpu_has_vmx_ple(void) +static inline bool cpu_has_vmx_ple(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) { - return flexpriority_enabled && - (cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm)); + return flexpriority_enabled && irqchip_in_kernel(kvm); } -static inline int cpu_has_vmx_vpid(void) +static inline bool cpu_has_vmx_vpid(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID; } -static inline int cpu_has_virtual_nmis(void) +static inline bool cpu_has_vmx_rdtscp(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDTSCP; +} + +static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } @@ -551,35 +588,71 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; - /* - * Unconditionally intercept #DB so we can maintain dr6 without - * reading it every exit. - */ - eb |= 1u << DB_VECTOR; - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - eb |= 1u << BP_VECTOR; - } + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | + (1u << NM_VECTOR) | (1u << DB_VECTOR); + if ((vcpu->guest_debug & + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) + eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) + return; + --m->nr; + m->guest[i] = m->guest[m->nr]; + m->host[i] = m->host[m->nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) { + ++m->nr; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + } + + m->guest[i].index = msr; + m->guest[i].value = guest_val; + m->host[i].index = msr; + m->host[i].value = host_val; +} + static void reload_tss(void) { /* * VT restores TR but not its size. Useless. */ - struct descriptor_table gdt; + struct desc_ptr gdt; struct desc_struct *descs; - kvm_get_gdt(&gdt); - descs = (void *)gdt.base; + native_store_gdt(&gdt); + descs = (void *)gdt.address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -589,7 +662,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer; u64 ignore_bits; - guest_efer = vmx->vcpu.arch.shadow_efer; + guest_efer = vmx->vcpu.arch.efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -606,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer |= host_efer & ignore_bits; vmx->guest_msrs[efer_offset].data = guest_efer; vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + clear_atomic_switch_msr(vmx, MSR_EFER); + /* On ept, can't emulate nx, and must switch nx atomically */ + if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + guest_efer = vmx->vcpu.arch.efer; + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + return false; + } + return true; } +static unsigned long segment_base(u16 selector) +{ + struct desc_ptr gdt; + struct desc_struct *d; + unsigned long table_base; + unsigned long v; + + if (!(selector & ~3)) + return 0; + + native_store_gdt(&gdt); + table_base = gdt.address; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector = kvm_read_ldt(); + + if (!(ldt_selector & ~3)) + return 0; + + table_base = segment_base(ldt_selector); + } + d = (struct desc_struct *)(table_base + (selector & ~7)); + v = get_desc_base(d); +#ifdef CONFIG_X86_64 + if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; +#endif + return v; +} + +static inline unsigned long kvm_read_tr_base(void) +{ + u16 tr; + asm("str %0" : "=g"(tr)); + return segment_base(tr); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -733,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct descriptor_table dt; + struct desc_ptr dt; unsigned long sysenter_esp; vcpu->cpu = cpu; @@ -742,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - kvm_get_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + native_store_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ @@ -767,38 +888,51 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_fpu_activate(struct kvm_vcpu *vcpu) { + ulong cr0; + if (vcpu->fpu_active) return; vcpu->fpu_active = 1; - vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + cr0 = vmcs_readl(GUEST_CR0); + cr0 &= ~(X86_CR0_TS | X86_CR0_MP); + cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); + vmcs_writel(GUEST_CR0, cr0); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; - vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + vmx_decache_cr0_guest_bits(vcpu); + vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -808,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) - ret |= X86_SHADOW_INT_STI; + ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= X86_SHADOW_INT_MOV_SS; + ret |= KVM_X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -822,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - if (mask & X86_SHADOW_INT_MOV_SS) + if (mask & KVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; - if (mask & X86_SHADOW_INT_STI) + else if (mask & KVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) @@ -844,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) + bool has_error_code, u32 error_code, + bool reinject) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; @@ -878,6 +1013,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); } +static bool vmx_rdtscp_supported(void) +{ + return cpu_has_vmx_rdtscp(); +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -913,12 +1053,15 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_TSC_AUX); + if (index >= 0 && vmx->rdtscp_enabled) + move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -1002,6 +1145,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_IA32_SYSENTER_ESP: data = vmcs_readl(GUEST_SYSENTER_ESP); break; + case MSR_TSC_AUX: + if (!to_vmx(vcpu)->rdtscp_enabled) + return 1; + /* Otherwise falls through */ default: vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); @@ -1065,7 +1212,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - /* Otherwise falls through to kvm_set_msr_common */ + ret = kvm_set_msr_common(vcpu, msr_index, data); + break; + case MSR_TSC_AUX: + if (!vmx->rdtscp_enabled) + return 1; + /* Check reserved bit, higher 32 bits should be zero */ + if ((data >> 32) != 0) + return 1; + /* Otherwise falls through */ default: msr = find_msr_entry(vmx, msr_index); if (msr) { @@ -1118,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - == FEATURE_CONTROL_LOCKED; + if (msr & FEATURE_CONTROL_LOCKED) { + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && tboot_enabled()) + return 1; + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; + } + + return 0; /* locked but not enabled */ } @@ -1128,21 +1290,23 @@ static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old; + u64 old, test_bits; if (read_cr4() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - != (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED); + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) @@ -1224,6 +1388,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | @@ -1243,7 +1409,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING; + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_RDTSCP; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1429,8 +1596,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1457,8 +1624,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu) static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = kvm_memslots(kvm); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1499,8 +1670,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1544,9 +1714,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) * of this msr depends on is_long_mode(). */ vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.shadow_efer = efer; - if (!msr) - return; + vcpu->arch.efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1576,17 +1744,18 @@ static void enter_lmode(struct kvm_vcpu *vcpu) (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; - vmx_set_efer(vcpu, vcpu->arch.shadow_efer); + vcpu->arch.efer |= EFER_LMA; + vmx_set_efer(vcpu, vcpu->arch.efer); } static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->arch.efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) & ~VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.efer); } #endif @@ -1598,10 +1767,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1646,7 +1825,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1654,23 +1833,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) *hw_cr0 &= ~X86_CR0_WP; } -static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, - struct kvm_vcpu *vcpu) -{ - if (!is_paging(vcpu)) { - *hw_cr4 &= ~X86_CR4_PAE; - *hw_cr4 |= X86_CR4_PSE; - } else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) - *hw_cr4 &= ~X86_CR4_PAE; -} - static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1682,8 +1851,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1691,7 +1858,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1702,12 +1869,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS | X86_CR0_MP; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1738,8 +1905,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1748,8 +1913,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; - if (enable_ept) - ept_update_paging_mode_cr4(&hw_cr4, vcpu); + if (enable_ept) { + if (!is_paging(vcpu)) { + hw_cr4 &= ~X86_CR4_PAE; + hw_cr4 |= X86_CR4_PSE; + } else if (!(cr4 & X86_CR4_PAE)) { + hw_cr4 &= ~X86_CR4_PAE; + } + } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -1787,7 +1958,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!is_protmode(vcpu)) return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -1870,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); - dt->base = vmcs_readl(GUEST_IDTR_BASE); + dt->size = vmcs_read32(GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_IDTR_BASE, dt->base); + vmcs_write32(GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(GUEST_IDTR_BASE, dt->address); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); - dt->base = vmcs_readl(GUEST_GDTR_BASE); + dt->size = vmcs_read32(GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_GDTR_BASE, dt->base); + vmcs_write32(GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) @@ -2042,7 +2213,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2175,7 +2346,7 @@ static int alloc_apic_access_page(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2188,7 +2359,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2197,7 +2368,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2212,7 +2383,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2232,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } +static void free_vpid(struct vcpu_vmx *vmx) +{ + if (!enable_vpid) + return; + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); @@ -2270,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) u32 junk; u64 host_pat, tsc_this, tsc_base; unsigned long a; - struct descriptor_table dt; + struct desc_ptr dt; int i; unsigned long kvm_vmx_return; u32 exec_control; @@ -2351,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - kvm_get_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + native_store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -2384,14 +2567,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - u64 data; int j = vmx->nmsrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; if (wrmsr_safe(index, data_low, data_high) < 0) continue; - data = data_low | ((u64)data_high << 32); vmx->guest_msrs[j].index = i; vmx->guest_msrs[j].data = 0; vmx->guest_msrs[j].mask = -1ull; @@ -2404,7 +2585,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -2429,10 +2613,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret; + int ret, idx; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2526,7 +2710,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2540,7 +2724,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2638,8 +2822,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) return 0; return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); } static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -2717,6 +2900,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2839,6 +3028,13 @@ static int handle_exception(struct kvm_vcpu *vcpu) kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2870,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; - ++vcpu->stat.io_exits; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; + in = (exit_qualification & 8) != 0; - if (string) { - if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } + ++vcpu->stat.io_exits; - size = (exit_qualification & 7) + 1; - in = (exit_qualification & 8) != 0; - port = exit_qualification >> 16; + if (string || in) + return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, in, size, port); + + return kvm_fast_pio_out(vcpu, size, port); } static void @@ -2940,11 +3134,10 @@ static int handle_cr(struct kvm_vcpu *vcpu) }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); + vmx_fpu_activate(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { @@ -2962,7 +3155,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; @@ -2978,9 +3173,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - unsigned long val; int dr, reg; + /* Do not handle if the CPL > 0, will trigger GP on re-entry */ if (!kvm_require_cpl(vcpu, 0)) return 1; dr = vmcs_readl(GUEST_DR7); @@ -3012,57 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - switch (dr) { - case 0 ... 3: - val = vcpu->arch.db[dr]; - break; - case 6: - val = vcpu->arch.dr6; - break; - case 7: - val = vcpu->arch.dr7; - break; - default: - val = 0; - } - kvm_register_write(vcpu, reg, val); - } else { - val = vcpu->arch.regs[reg]; - switch (dr) { - case 0 ... 3: - vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = val; - break; - case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) - kvm_queue_exception(vcpu, UD_VECTOR); - break; - case 6: - if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; - } - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; - break; - case 7: - if (val & 0xffffffff00000000ULL) { - kvm_queue_exception(vcpu, GP_VECTOR); - break; - } - vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = - (val & DR7_BP_EN_MASK); - } - break; - } - } + unsigned long val; + if (!kvm_get_dr(vcpu, dr, &val)) + kvm_register_write(vcpu, reg, val); + } else + kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; } +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + vmcs_writel(GUEST_DR7, val); +} + static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); @@ -3075,6 +3233,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } @@ -3094,13 +3253,13 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; } + trace_kvm_msr_write(ecx, data); skip_emulated_instruction(vcpu); return 1; } @@ -3193,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; u16 tss_selector; int reason, type, idt_v; @@ -3215,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; @@ -3229,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (!kvm_task_switch(vcpu, tss_selector, reason)) + if (kvm_task_switch(vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } /* clear all local breakpoint enable flags */ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); @@ -3385,7 +3558,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } if (err != EMULATE_DONE) { - kvm_report_emulation_failure(vcpu, "emulation failure"); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -3416,6 +3588,12 @@ static int handle_pause(struct kvm_vcpu *vcpu) return 1; } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -3453,6 +3631,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, }; static const int kvm_vmx_max_exit_handlers = @@ -3468,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); + trace_kvm_exit(exit_reason, vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) @@ -3553,8 +3733,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) + (exit_intr_info & INTR_INFO_VALID_MASK)) { + kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); + kvm_after_handle_nmi(&vmx->vcpu); + } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3686,9 +3869,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ vmcs_writel(HOST_CR0, read_cr0()); - if (vcpu->arch.switch_db_regs) - set_debugreg(vcpu->arch.dr6, 6); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3789,9 +3969,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - if (vcpu->arch.switch_db_regs) - get_debugreg(vcpu->arch.dr6, 6); - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3820,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); + free_vpid(vmx); vmx_free_vmcs(vcpu); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -3885,6 +4059,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: + free_vpid(vmx); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -3920,7 +4095,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * b. VT-d with snooping control feature: snooping control feature of * VT-d engine can guarantee the cache correctness. Just set it * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep + * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ if (is_mmio) @@ -3931,37 +4106,92 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) VMX_EPT_MT_EPTE_SHIFT; else ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IGMT_BIT; + | VMX_EPT_IPAT_BIT; return ret; } +#define _ER(x) { EXIT_REASON_##x, #x } + static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + _ER(EXCEPTION_NMI), + _ER(EXTERNAL_INTERRUPT), + _ER(TRIPLE_FAULT), + _ER(PENDING_INTERRUPT), + _ER(NMI_WINDOW), + _ER(TASK_SWITCH), + _ER(CPUID), + _ER(HLT), + _ER(INVLPG), + _ER(RDPMC), + _ER(RDTSC), + _ER(VMCALL), + _ER(VMCLEAR), + _ER(VMLAUNCH), + _ER(VMPTRLD), + _ER(VMPTRST), + _ER(VMREAD), + _ER(VMRESUME), + _ER(VMWRITE), + _ER(VMOFF), + _ER(VMON), + _ER(CR_ACCESS), + _ER(DR_ACCESS), + _ER(IO_INSTRUCTION), + _ER(MSR_READ), + _ER(MSR_WRITE), + _ER(MWAIT_INSTRUCTION), + _ER(MONITOR_INSTRUCTION), + _ER(PAUSE_INSTRUCTION), + _ER(MCE_DURING_VMENTRY), + _ER(TPR_BELOW_THRESHOLD), + _ER(APIC_ACCESS), + _ER(EPT_VIOLATION), + _ER(EPT_MISCONFIG), + _ER(WBINVD), { -1, NULL } }; -static bool vmx_gb_page_enable(void) +#undef _ER + +static int vmx_get_lpage_level(void) +{ + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control; + + vmx->rdtscp_enabled = false; + if (vmx_rdtscp_supported()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + if (exec_control & SECONDARY_EXEC_RDTSCP) { + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + } + } +} + +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { - return false; } static struct kvm_x86_ops vmx_x86_ops = { @@ -3990,6 +4220,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -3999,9 +4230,12 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr7 = vmx_set_dr7, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -4027,7 +4261,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + + .rdtscp_supported = vmx_rdtscp_supported, + + .set_supported_cpuid = vmx_set_supported_cpuid, }; static int __init vmx_init(void) @@ -4075,7 +4315,8 @@ static int __init vmx_init(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) goto out3; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ddcad452add..05d571f6f196 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -38,8 +38,11 @@ #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/perf_event.h> #include <trace/events/kvm.h> -#undef TRACE_INCLUDE_FILE + #define CREATE_TRACE_POINTS #include "trace.h" @@ -93,16 +96,16 @@ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); struct kvm_shared_msrs_global { int nr; - struct kvm_shared_msr { - u32 msr; - u64 value; - } msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_NR_SHARED_MSRS]; }; struct kvm_shared_msrs { struct user_return_notifier urn; bool registered; - u64 current_value[KVM_NR_SHARED_MSRS]; + struct kvm_shared_msr_values { + u64 host; + u64 curr; + } values[KVM_NR_SHARED_MSRS]; }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; @@ -147,53 +150,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msr *global; struct kvm_shared_msrs *locals = container_of(urn, struct kvm_shared_msrs, urn); + struct kvm_shared_msr_values *values; for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - global = &shared_msrs_global.msrs[slot]; - if (global->value != locals->current_value[slot]) { - wrmsrl(global->msr, global->value); - locals->current_value[slot] = global->value; + values = &locals->values[slot]; + if (values->host != values->curr) { + wrmsrl(shared_msrs_global.msrs[slot], values->host); + values->curr = values->host; } } locals->registered = false; user_return_notifier_unregister(urn); } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static void shared_msr_update(unsigned slot, u32 msr) { - int cpu; + struct kvm_shared_msrs *smsr; u64 value; + smsr = &__get_cpu_var(shared_msrs); + /* only read, and nobody should modify it at this time, + * so don't need lock */ + if (slot >= shared_msrs_global.nr) { + printk(KERN_ERR "kvm: invalid MSR slot!"); + return; + } + rdmsrl_safe(msr, &value); + smsr->values[slot].host = value; + smsr->values[slot].curr = value; +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot].msr = msr; - rdmsrl_safe(msr, &value); - shared_msrs_global.msrs[slot].value = value; - for_each_online_cpu(cpu) - per_cpu(shared_msrs, cpu).current_value[slot] = value; + shared_msrs_global.msrs[slot] = msr; + /* we need ensured the shared_msr_global have been updated */ + smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { unsigned i; - struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); for (i = 0; i < shared_msrs_global.nr; ++i) - locals->current_value[i] = shared_msrs_global.msrs[i].value; + shared_msr_update(i, shared_msrs_global.msrs[i]); } void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - if (((value ^ smsr->current_value[slot]) & mask) == 0) + if (((value ^ smsr->values[slot].curr) & mask) == 0) return; - smsr->current_value[slot] = value; - wrmsrl(shared_msrs_global.msrs[slot].msr, value); + smsr->values[slot].curr = value; + wrmsrl(shared_msrs_global.msrs[slot], value); if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -210,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore) kvm_on_user_return(&smsr->urn); } -unsigned long segment_base(u16 selector) -{ - struct descriptor_table gdt; - struct desc_struct *d; - unsigned long table_base; - unsigned long v; - - if (selector == 0) - return 0; - - kvm_get_gdt(&gdt); - table_base = gdt.base; - - if (selector & 4) { /* from ldt */ - u16 ldt_selector = kvm_read_ldt(); - - table_base = segment_base(ldt_selector); - } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif - return v; -} -EXPORT_SYMBOL_GPL(segment_base); - u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) @@ -257,38 +243,83 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code, + bool reinject) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.reinject = reinject; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, false, 0, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -301,14 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); +void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); + /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. @@ -383,41 +416,38 @@ out: void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); + cr0 |= X86_CR0_ET; + +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) { kvm_inject_gp(vcpu, 0); return; } +#endif + + cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -425,8 +455,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -434,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; kvm_mmu_reset_context(vcpu); return; @@ -443,44 +470,38 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -495,21 +516,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -541,7 +557,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -561,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); + } + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_dr); + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + switch (dr) { + case 0 ... 3: + *val = vcpu->arch.db[dr]; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + case 6: + *val = vcpu->arch.dr6; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + default: /* 7 */ + *val = vcpu->arch.dr7; + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dr); + static inline u32 bit(int bitno) { return 1 << (bitno & 31); @@ -575,9 +664,12 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 2 +#define KVM_SAVE_MSRS_BEGIN 7 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 @@ -592,53 +684,42 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } + if (efer & efer_reserved_bits) + return 1; if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + return 1; } if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + return 1; } - kvm_x86_ops->set_efer(vcpu, efer); - efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + efer |= vcpu->arch.efer & EFER_LMA; + + kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.shadow_efer = efer; + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + + return 0; } void kvm_enable_efer_bits(u64 mask) @@ -668,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { - static int version; + int version; + int r; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; - version++; + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); @@ -685,9 +774,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; @@ -762,6 +849,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); local_irq_restore(flags); /* With all the info we got, fill in the values */ @@ -769,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + vcpu->hv_clock.flags = 0; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -914,9 +1004,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; @@ -958,14 +1052,108 @@ out: return r; } +static bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (copy_to_user((void __user *)addr, instructions, 4)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + vcpu->arch.hv_vapic = data; + break; + } + addr = gfn_to_hva(vcpu->kvm, data >> + HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); + if (kvm_is_error_hva(addr)) + return 1; + if (clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + vcpu->arch.hv_vapic = data; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case MSR_EFER: - set_efer(vcpu, data); - break; + return set_efer(vcpu, data); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ + data &= ~(u64)0x100; /* ignore ignne emulation enable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -1008,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data); break; + case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { if (vcpu->arch.time_page) { kvm_release_page_dirty(vcpu->arch.time_page); @@ -1072,6 +1262,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return set_msr_hyperv(vcpu, msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1171,6 +1371,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1222,12 +1470,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.shadow_efer; + data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; case MSR_IA32_P5_MC_ADDR: @@ -1237,6 +1487,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1262,15 +1522,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1352,6 +1612,12 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XEN_HVM: case KVM_CAP_ADJUST_CLOCK: case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_HYPERV_VAPIC: + case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_DEBUGREGS: + case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1465,8 +1731,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1515,6 +1781,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; + vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1531,6 +1798,8 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1551,8 +1820,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; + vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + vcpu_put(vcpu); return 0; out: @@ -1565,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; + vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1576,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; + vcpu_put(vcpu); return r; } @@ -1595,12 +1869,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1620,7 +1897,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1703,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; @@ -1711,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; break; } + + kvm_x86_ops->set_supported_cpuid(function, entry); + put_cpu(); } @@ -1746,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, + cpuid->nent); + r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -1825,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; + vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -1839,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: + vcpu_put(vcpu); return r; } @@ -1867,7 +2184,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -1898,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending; + events->exception.injected = + vcpu->arch.exception.pending && + !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; events->exception.error_code = vcpu->arch.exception.error_code; - events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.injected = + vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = vcpu->arch.interrupt.soft; + events->interrupt.soft = 0; + events->interrupt.shadow = + kvm_x86_ops->get_interrupt_shadow(vcpu, + KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; @@ -1914,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR); + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW); vcpu_put(vcpu); } @@ -1923,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; vcpu_load(vcpu); @@ -1938,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.soft = events->interrupt.soft; if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) kvm_pic_clear_isr_ack(vcpu->kvm); + if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) + kvm_x86_ops->set_interrupt_shadow(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) @@ -1952,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + vcpu_load(vcpu); + + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); + dbgregs->dr6 = vcpu->arch.dr6; + dbgregs->dr7 = vcpu->arch.dr7; + dbgregs->flags = 0; + + vcpu_put(vcpu); +} + +static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + if (dbgregs->flags) + return -EINVAL; + + vcpu_load(vcpu); + + memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = dbgregs->dr6; + vcpu->arch.dr7 = dbgregs->dr7; + + vcpu_put(vcpu); + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2106,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; + vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2130,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); break; } + case KVM_GET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + + r = -EFAULT; + if (copy_to_user(argp, &dbgregs, + sizeof(struct kvm_debugregs))) + break; + r = 0; + break; + } + case KVM_SET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + r = -EFAULT; + if (copy_from_user(&dbgregs, argp, + sizeof(struct kvm_debugregs))) + break; + + r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); + break; + } default: r = -EINVAL; } @@ -2161,14 +2544,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } @@ -2177,13 +2560,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = kvm_aliases(kvm); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = kvm_aliases(kvm); - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -2201,6 +2606,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -2217,26 +2623,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); + + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; - p = &kvm->arch.aliases[alias->slot]; + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; - - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); + aliases->naliases = n; - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2274,18 +2702,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); + raw_spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + raw_spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -2365,29 +2793,63 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) goto out; + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; + /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2470,6 +2932,8 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev); kfree(vpic); goto create_irqchip_unlock; } @@ -2481,10 +2945,8 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->irq_lock); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm->arch.vpic = NULL; - kvm->arch.vioapic = NULL; + kvm_ioapic_destroy(kvm); + kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); } create_irqchip_unlock: @@ -2500,7 +2962,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2509,7 +2971,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2518,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; + r = -ENXIO; if (irqchip_in_kernel(kvm)) { __s32 status; status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); if (ioctl == KVM_IRQ_LINE_STATUS) { + r = -EFAULT; irq_event.status = status; if (copy_to_user(argp, &irq_event, sizeof irq_event)) @@ -2726,7 +3190,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2735,17 +3199,56 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2768,14 +3271,40 @@ out: return r; } -static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + +static int kvm_write_guest_virt_system(gva_t addr, void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2798,13 +3327,13 @@ out: return r; } - static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2814,17 +3343,20 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2863,11 +3395,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2896,9 +3429,9 @@ mmio: } int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { @@ -2916,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr, } EXPORT_SYMBOL_GPL(emulator_write_emulated); +#define CMPXCHG_TYPE(t, ptr, old, new) \ + (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) + +#ifdef CONFIG_X86_64 +# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) +#else +# define CMPXCHG64(ptr, old, new) \ + (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) +#endif + static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) { - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - struct page *page; - char *kaddr; - u64 val; + gpa_t gpa; + struct page *page; + char *kaddr; + bool exchanged; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes > 8 || (bytes & (bytes - 1))) + goto emul_write; - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; - val = *(u64 *)new; + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - kaddr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); + kaddr = kmap_atomic(page, KM_USER0); + kaddr += offset_in_page(gpa); + switch (bytes) { + case 1: + exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + break; + case 2: + exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + break; + case 4: + exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + break; + case 8: + exchanged = CMPXCHG64(kaddr, old, new); + break; + default: + BUG(); } + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + + if (!exchanged) + return X86EMUL_CMPXCHG_FAILED; + + kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); + + return X86EMUL_CONTINUE; + emul_write: -#endif + printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); return emulator_write_emulated(addr, new, bytes, vcpu); } +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + return r; +} + + +static int emulator_pio_in_emulated(int size, unsigned short port, void *val, + unsigned int count, struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.pio.count) + goto data_avail; + + trace_kvm_pio(1, port, size, 1); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 1; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + data_avail: + memcpy(val, vcpu->arch.pio_data, size * count); + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_IN; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + +static int emulator_pio_out_emulated(int size, unsigned short port, + const void *val, unsigned int count, + struct kvm_vcpu *vcpu) +{ + trace_kvm_pio(0, port, size, 1); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 0; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + memcpy(vcpu->arch.pio_data, val, size * count); + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); @@ -2968,35 +3606,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); return X86EMUL_CONTINUE; } int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - struct kvm_vcpu *vcpu = ctxt->vcpu; - - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); - return X86EMUL_UNHANDLEABLE; - } + return kvm_get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - int exception; - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; + return kvm_set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3010,18 +3634,174 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +{ + unsigned long value; + + switch (cr) { + case 0: + value = kvm_read_cr0(vcpu); + break; + case 2: + value = vcpu->arch.cr2; + break; + case 3: + value = vcpu->arch.cr3; + break; + case 4: + value = kvm_read_cr4(vcpu); + break; + case 8: + value = kvm_get_cr8(vcpu); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + return 0; + } + + return value; +} + +static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +{ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + kvm_set_cr3(vcpu, val); + break; + case 4: + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + break; + case 8: + kvm_set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + } +} + +static int emulator_get_cpl(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_cpl(vcpu); +} + +static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_gdt(vcpu, dt); +} + +static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + kvm_get_segment(vcpu, &var, seg); + + if (var.unusable) + return false; + + if (var.g) + var.limit >>= 12; + set_desc_limit(desc, var.limit); + set_desc_base(desc, (unsigned long)var.base); + desc->type = var.type; + desc->s = var.s; + desc->dpl = var.dpl; + desc->p = var.present; + desc->avl = var.avl; + desc->l = var.l; + desc->d = var.db; + desc->g = var.g; + + return true; +} + +static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + /* needed to preserve selector */ + kvm_get_segment(vcpu, &var, seg); + + var.base = get_desc_base(desc); + var.limit = get_desc_limit(desc); + if (desc->g) + var.limit = (var.limit << 12) | 0xfff; + var.type = desc->type; + var.present = desc->p; + var.dpl = desc->dpl; + var.db = desc->d; + var.s = desc->s; + var.l = desc->l; + var.g = desc->g; + var.avl = desc->avl; + var.present = desc->p; + var.unusable = !var.present; + var.padding = 0; + + kvm_set_segment(vcpu, &var, seg); + return; +} + +static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static void emulator_set_segment_selector(u16 sel, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + kvm_x86_ops->set_rflags(vcpu, rflags); +} + static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .write_std = kvm_write_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .pio_in_emulated = emulator_pio_in_emulated, + .pio_out_emulated = emulator_pio_out_emulated, + .get_cached_descriptor = emulator_get_cached_descriptor, + .set_cached_descriptor = emulator_set_cached_descriptor, + .get_segment_selector = emulator_get_segment_selector, + .set_segment_selector = emulator_set_segment_selector, + .get_gdt = emulator_get_gdt, + .get_cr = emulator_get_cr, + .set_cr = emulator_set_cr, + .cpl = emulator_get_cpl, + .set_rflags = emulator_set_rflags, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -3052,21 +3832,23 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); vcpu->mmio_is_write = 0; - vcpu->arch.pio.string = 0; if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ @@ -3099,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; return EMULATE_FAIL; @@ -3110,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } +restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; if (r == 0) kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - if (vcpu->arch.pio.string) + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; return EMULATE_DO_MMIO; + } - if ((r || vcpu->mmio_is_write) && run) { + if (r || vcpu->mmio_is_write) { run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = vcpu->mmio_phys_addr; memcpy(run->mmio.data, vcpu->mmio_data, 8); @@ -3129,218 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu, if (r) { if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; + goto done; if (!vcpu->mmio_needed) { + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); kvm_report_emulation_failure(vcpu, "mmio"); return EMULATE_FAIL; } return EMULATE_DO_MMIO; } - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - if (vcpu->mmio_is_write) { vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(emulate_instruction); - -static int pio_copy_data(struct kvm_vcpu *vcpu) -{ - void *p = vcpu->arch.pio_data; - gva_t q = vcpu->arch.pio.guest_gva; - unsigned bytes; - int ret; - - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); - else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); - return ret; -} - -int complete_pio(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - long delta; - int r; - unsigned long val; - - if (!io->string) { - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(&val, vcpu->arch.pio_data, io->size); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); - } - } else { - if (io->in) { - r = pio_copy_data(vcpu); - if (r) - return r; - } - - delta = 1; - if (io->rep) { - delta *= io->cur_count; - /* - * The size of the register should really depend on - * current address size. - */ - val = kvm_register_read(vcpu, VCPU_REGS_RCX); - val -= delta; - kvm_register_write(vcpu, VCPU_REGS_RCX, val); - } - if (io->down) - delta = -delta; - delta *= io->size; - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RDI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RDI, val); - } else { - val = kvm_register_read(vcpu, VCPU_REGS_RSI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RSI, val); - } - } - - io->count -= io->cur_count; - io->cur_count = 0; - - return 0; -} - -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - return r; -} - -static int pio_string_write(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - void *pd = vcpu->arch.pio_data; - int i, r = 0; - - for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, - io->port, io->size, pd)) { - r = -EOPNOTSUPP; - break; - } - pd += io->size; - } - return r; -} - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) -{ - unsigned long val; - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.rep = 0; - - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); +done: + if (vcpu->arch.exception.pending) + vcpu->arch.emulate_ctxt.restart = false; - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); + if (vcpu->arch.emulate_ctxt.restart) + goto restart; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - complete_pio(vcpu); - return 1; - } - return 0; + return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_emulate_pio); +EXPORT_SYMBOL_GPL(emulate_instruction); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port) +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned now, in_page; - int ret = 0; - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 1; - vcpu->arch.pio.down = down; - vcpu->arch.pio.rep = rep; - - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - - if (!count) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return 1; - } - - if (!down) - in_page = PAGE_SIZE - offset_in_page(address); - else - in_page = offset_in_page(address) + size; - now = min(count, (unsigned long)in_page / size); - if (!now) - now = 1; - if (down) { - /* - * String I/O in reverse. Yuck. Kill the guest, fix later. - */ - pr_unimpl(vcpu, "guest string pio down\n"); - kvm_inject_gp(vcpu, 0); - return 1; - } - vcpu->run->io.count = now; - vcpu->arch.pio.cur_count = now; - - if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) - kvm_x86_ops->skip_emulated_instruction(vcpu); - - vcpu->arch.pio.guest_gva = address; - - if (!vcpu->arch.pio.in) { - /* string PIO write */ - ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (ret == 0 && !pio_string_write(vcpu)) { - complete_pio(vcpu); - if (vcpu->arch.pio.count == 0) - ret = 1; - } - } - /* no string PIO read support yet */ - + unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + /* do not return to emulator after return from userspace */ + vcpu->arch.pio.count = 0; return ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +EXPORT_SYMBOL_GPL(kvm_fast_pio_out); static void bounce_off(void *info) { @@ -3415,6 +4025,51 @@ static void kvm_timer_init(void) } } +static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); + +static int kvm_is_in_guest(void) +{ + return percpu_read(current_vcpu) != NULL; +} + +static int kvm_is_user_mode(void) +{ + int user_mode = 3; + + if (percpu_read(current_vcpu)) + user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); + + return user_mode != 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + unsigned long ip = 0; + + if (percpu_read(current_vcpu)) + ip = kvm_rip_read(percpu_read(current_vcpu)); + + return ip; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, +}; + +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); + +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, NULL); +} +EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); + int kvm_arch_init(void *opaque) { int r; @@ -3451,6 +4106,8 @@ int kvm_arch_init(void *opaque) kvm_timer_init(); + perf_register_guest_info_callbacks(&kvm_guest_cbs); + return 0; out: @@ -3459,6 +4116,8 @@ out: void kvm_arch_exit(void) { + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); @@ -3488,11 +4147,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, return a0 | ((gpa_t)a1 << 32); } +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + int cs_db, cs_l; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + longmode = is_long_mode(vcpu) && cs_l == 1; + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); + } + + return 1; +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; int r = 1; + if (kvm_hv_hypercall_enabled(vcpu->kvm)) + return kvm_hv_hypercall(vcpu); + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -3535,10 +4259,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; - int ret = 0; unsigned long rip = kvm_rip_read(vcpu); - /* * Blow out the MMU to ensure that no other VCPU has an active mapping * to ensure that the updated hypercall appears atomically across all @@ -3547,93 +4269,24 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_mmu_zap_all(vcpu->kvm); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; - - return ret; -} -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; + return emulator_write_emulated(rip, instruction, 3, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct descriptor_table dt = { limit, base }; + struct desc_ptr dt = { limit, base }; kvm_x86_ops->set_gdt(vcpu, &dt); } void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct descriptor_table dt = { limit, base }; + struct desc_ptr dt = { limit, base }; kvm_x86_ops->set_idt(vcpu, &dt); } -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - kvm_lmsw(vcpu, msw); - *rflags = kvm_get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - unsigned long value; - - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - switch (cr) { - case 0: - value = vcpu->arch.cr0; - break; - case 2: - value = vcpu->arch.cr2; - break; - case 3: - value = vcpu->arch.cr3; - break; - case 4: - value = vcpu->arch.cr4; - break; - case 8: - value = kvm_get_cr8(vcpu); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - return 0; - } - - return value; -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); - *rflags = kvm_get_rflags(vcpu); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - kvm_set_cr3(vcpu, val); - break; - case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); - break; - case 8: - kvm_set_cr8(vcpu, val & 0xfUL); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - } -} - static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; @@ -3691,14 +4344,19 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); if (best) return best->eax & 0xff; +not_found: return 36; } @@ -3774,14 +4432,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu) static void vapic_exit(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + int idx; if (!apic || !apic->vapic_addr) return; - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -3811,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + vcpu->arch.exception.error_code, + vcpu->arch.exception.reinject); return; } @@ -3877,12 +4540,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); + if (vcpu->fpu_active) + kvm_load_guest_fpu(vcpu); local_irq_disable(); @@ -3910,7 +4578,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_to_vapic(vcpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3952,7 +4620,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3974,6 +4642,7 @@ out: static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3985,7 +4654,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vapic_enter(vcpu); r = 1; @@ -3993,9 +4662,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -4030,14 +4699,13 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); - post_kvm_run_save(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); vapic_exit(vcpu); @@ -4065,24 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.cur_count) { - r = complete_pio(vcpu); - if (r) - goto out; - } - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - - down_read(&vcpu->kvm->slots_lock); - r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + if (vcpu->arch.pio.count || vcpu->mmio_needed || + vcpu->arch.emulate_ctxt.restart) { + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ r = 0; goto out; } @@ -4094,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = __vcpu_run(vcpu); out: + post_kvm_run_save(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -4165,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); -} - void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; @@ -4184,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - struct descriptor_table dt; + struct desc_ptr dt; vcpu_load(vcpu); @@ -4199,19 +4855,18 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.limit; - sregs->idt.base = dt.base; + sregs->idt.limit = dt.size; + sregs->idt.base = dt.address; kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.limit; - sregs->gdt.base = dt.base; + sregs->gdt.limit = dt.size; + sregs->gdt.base = dt.address; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); @@ -4243,428 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - -static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, - struct kvm_segment *kvm_desct) -{ - kvm_desct->base = get_desc_base(seg_desc); - kvm_desct->limit = get_desc_limit(seg_desc); - if (seg_desc->g) { - kvm_desct->limit <<= 12; - kvm_desct->limit |= 0xfff; - } - kvm_desct->selector = selector; - kvm_desct->type = seg_desc->type; - kvm_desct->present = seg_desc->p; - kvm_desct->dpl = seg_desc->dpl; - kvm_desct->db = seg_desc->d; - kvm_desct->s = seg_desc->s; - kvm_desct->l = seg_desc->l; - kvm_desct->g = seg_desc->g; - kvm_desct->avl = seg_desc->avl; - if (!selector) - kvm_desct->unusable = 1; - else - kvm_desct->unusable = 0; - kvm_desct->padding = 0; -} - -static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) -{ - if (selector & 1 << 2) { - struct kvm_segment kvm_seg; - - kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); - - if (kvm_seg.unusable) - dtable->limit = 0; - else - dtable->limit = kvm_seg.limit; - dtable->base = kvm_seg.base; - } - else - kvm_x86_ops->get_gdt(vcpu, dtable); -} - -/* allowed just for 8 bytes segments */ -static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - struct desc_struct *seg_desc) -{ - struct descriptor_table dtable; - u16 index = selector >> 3; - - get_segment_descriptor_dtable(vcpu, selector, &dtable); - - if (dtable.limit < index * 8 + 7) { - kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return 1; - } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); -} - -/* allowed just for 8 bytes segments */ -static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - struct desc_struct *seg_desc) -{ - struct descriptor_table dtable; - u16 index = selector >> 3; - - get_segment_descriptor_dtable(vcpu, selector, &dtable); - - if (dtable.limit < index * 8 + 7) - return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); -} - -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc) -{ - u32 base_addr = get_desc_base(seg_desc); - - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); -} - -static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment kvm_seg; - - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; -} - -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - -static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) -{ - struct kvm_segment segvar = { - .base = selector << 4, - .limit = 0xffff, - .selector = selector, - .type = 3, - .present = 1, - .dpl = 3, - .db = 0, - .s = 1, - .l = 0, - .g = 0, - .avl = 0, - .unusable = 0, - }; - kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; -} - -static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, + bool has_error_code, u32 error_code) { - return (seg != VCPU_SREG_LDTR) && - (seg != VCPU_SREG_TR) && - (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); -} - -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) -{ - struct kvm_segment kvm_seg; - - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) - return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; - - kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; -} - -static void save_state_to_tss32(struct kvm_vcpu *vcpu, - struct tss_segment_32 *tss) -{ - tss->cr3 = vcpu->arch.cr3; - tss->eip = kvm_rip_read(vcpu); - tss->eflags = kvm_get_rflags(vcpu); - tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); - tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static int load_state_from_tss32(struct kvm_vcpu *vcpu, - struct tss_segment_32 *tss) -{ - kvm_set_cr3(vcpu, tss->cr3); - - kvm_rip_write(vcpu, tss->eip); - kvm_set_rflags(vcpu, tss->eflags | 2); - - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) - return 1; - return 0; -} - -static void save_state_to_tss16(struct kvm_vcpu *vcpu, - struct tss_segment_16 *tss) -{ - tss->ip = kvm_rip_read(vcpu); - tss->flag = kvm_get_rflags(vcpu); - tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); - - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static int load_state_from_tss16(struct kvm_vcpu *vcpu, - struct tss_segment_16 *tss) -{ - kvm_rip_write(vcpu, tss->ip); - kvm_set_rflags(vcpu, tss->flag | 2); - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) - return 1; - return 0; -} - -static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - u16 old_tss_sel, u32 old_tss_base, - struct desc_struct *nseg_desc) -{ - struct tss_segment_16 tss_segment_16; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) - goto out; - - save_state_to_tss16(vcpu, &tss_segment_16); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_16, sizeof tss_segment_16)) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_16.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_16.prev_task_link, - sizeof tss_segment_16.prev_task_link)) - goto out; - } - - if (load_state_from_tss16(vcpu, &tss_segment_16)) - goto out; - - ret = 1; -out: - return ret; -} - -static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - u16 old_tss_sel, u32 old_tss_base, - struct desc_struct *nseg_desc) -{ - struct tss_segment_32 tss_segment_32; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) - goto out; - - save_state_to_tss32(vcpu, &tss_segment_32); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_32, sizeof tss_segment_32)) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_32.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_32.prev_task_link, - sizeof tss_segment_32.prev_task_link)) - goto out; - } - - if (load_state_from_tss32(vcpu, &tss_segment_32)) - goto out; - - ret = 1; -out: - return ret; -} - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) -{ - struct kvm_segment tr_seg; - struct desc_struct cseg_desc; - struct desc_struct nseg_desc; - int ret = 0; - u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); - u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); - - /* FIXME: Handle errors. Failure to read either TSS or their - * descriptors should generate a pagefault. - */ - if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) - goto out; - - if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) - goto out; - - if (reason != TASK_SWITCH_IRET) { - int cpl; - - cpl = kvm_x86_ops->get_cpl(vcpu); - if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - } - - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { - kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); - return 1; - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); - } + int cs_db, cs_l, ret; + cache_all_regs(vcpu); - if (reason == TASK_SWITCH_IRET) { - u32 eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); - } + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - else - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + tss_selector, reason, has_error_code, + error_code); - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { - u32 eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); - } + if (ret) + return EMULATE_FAIL; - if (reason != TASK_SWITCH_IRET) { - nseg_desc.type |= (1 << 1); - save_guest_segment_descriptor(vcpu, tss_selector, - &nseg_desc); - } - - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); - seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); - tr_seg.type = 11; - kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); -out: - return ret; + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -4673,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int mmu_reset_needed = 0; int pending_vec, max_bits; - struct descriptor_table dt; + struct desc_ptr dt; vcpu_load(vcpu); - dt.limit = sregs->idt.limit; - dt.base = sregs->idt.base; + dt.size = sregs->idt.limit; + dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); - dt.limit = sregs->gdt.limit; - dt.base = sregs->gdt.base; + dt.size = sregs->gdt.limit; + dt.address = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; @@ -4690,17 +4950,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4735,7 +4993,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; vcpu_put(vcpu); @@ -4782,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - vcpu->arch.singlestep_cs = - get_segment_selector(vcpu, VCPU_SREG_CS); - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); /* * Trigger an rflags update that will inject or remove the trace @@ -4833,11 +5089,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4918,14 +5175,14 @@ EXPORT_SYMBOL_GPL(fx_init); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + if (vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 1; kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_restore(&vcpu->arch.guest_fx_image); + trace_kvm_fpu(1); } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { @@ -4936,8 +5193,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_fx_save(&vcpu->arch.guest_fx_image); kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + trace_kvm_fpu(0); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { @@ -5089,11 +5347,13 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } @@ -5104,6 +5364,12 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5160,16 +5426,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5189,26 +5457,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm, if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); @@ -5217,8 +5494,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) @@ -5258,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + unsigned long current_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + return current_rip == linear_rip; +} +EXPORT_SYMBOL_GPL(kvm_is_linear_rip); + unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; rflags = kvm_x86_ops->get_rflags(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + rflags &= ~X86_EFLAGS_TF; return rflags; } EXPORT_SYMBOL_GPL(kvm_get_rflags); @@ -5272,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - vcpu->arch.singlestep_cs == - get_segment_selector(vcpu, VCPU_SREG_CS) && - vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) + rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); } EXPORT_SYMBOL_GPL(kvm_set_rflags); @@ -5291,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 5eadea585d2a..f4b54458285b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include "kvm_cache_regs.h" static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { @@ -35,4 +36,43 @@ static inline bool kvm_exception_is_soft(unsigned int nr) struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +static inline bool is_protmode(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PE); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.efer & EFER_LMA; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); +} + +static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) +{ + return rcu_dereference_check(kvm->arch.aliases, + srcu_read_lock_held(&kvm->srcu) + || lockdep_is_held(&kvm->slots_lock)); +} + +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); + #endif |