diff options
34 files changed, 242 insertions, 163 deletions
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index f238370c907d..8d0097f10208 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -93,8 +93,8 @@ common_shutdown_1(void *generic_ptr) if (cpuid != boot_cpuid) { flags |= 0x00040000UL; /* "remain halted" */ *pflags = flags; - cpu_clear(cpuid, cpu_present_map); - cpu_clear(cpuid, cpu_possible_map); + set_cpu_present(cpuid, false); + set_cpu_possible(cpuid, false); halt(); } #endif @@ -120,8 +120,8 @@ common_shutdown_1(void *generic_ptr) #ifdef CONFIG_SMP /* Wait for the secondaries to halt. */ - cpu_clear(boot_cpuid, cpu_present_map); - cpu_clear(boot_cpuid, cpu_possible_map); + set_cpu_present(boot_cpuid, false); + set_cpu_possible(boot_cpuid, false); while (cpus_weight(cpu_present_map)) barrier(); #endif diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 00f1dc3dfd5f..b1fe5674c3a1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -120,12 +120,12 @@ void __cpuinit smp_callin(void) { int cpuid = hard_smp_processor_id(); - cpumask_t mask = cpu_online_map; - if (cpu_test_and_set(cpuid, mask)) { + if (cpu_online(cpuid)) { printk("??, cpu 0x%x already present??\n", cpuid); BUG(); } + set_cpu_online(cpuid, true); /* Turn on machine checks. */ wrmces(7); @@ -436,8 +436,8 @@ setup_smp(void) ((char *)cpubase + i*hwrpb->processor_size); if ((cpu->flags & 0x1cc) == 0x1cc) { smp_num_probed++; - cpu_set(i, cpu_possible_map); - cpu_set(i, cpu_present_map); + set_cpu_possible(i, true); + set_cpu_present(i, true); cpu->pal_revision = boot_cpu_palrev; } @@ -470,8 +470,8 @@ smp_prepare_cpus(unsigned int max_cpus) /* Nothing to do on a UP box, or when told not to. */ if (smp_num_probed == 1 || max_cpus == 0) { - cpu_possible_map = cpumask_of_cpu(boot_cpuid); - cpu_present_map = cpumask_of_cpu(boot_cpuid); + init_cpu_possible(cpumask_of(boot_cpuid)); + init_cpu_present(cpumask_of(boot_cpuid)); printk(KERN_INFO "SMP mode deactivated.\n"); return; } diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index 68aa6da807c1..bfa86b6af7cd 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -25,6 +25,10 @@ #include <linux/ioctl.h> +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 4e586f6110aa..28f982045f29 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1337,6 +1337,10 @@ static void kvm_release_vm_pages(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c index 552d07724207..230eae482f32 100644 --- a/arch/ia64/kvm/process.c +++ b/arch/ia64/kvm/process.c @@ -455,13 +455,18 @@ fpswa_ret_t vmm_fp_emulate(int fp_fault, void *bundle, unsigned long *ipsr, if (!vmm_fpswa_interface) return (fpswa_ret_t) {-1, 0, 0, 0}; - /* - * Just let fpswa driver to use hardware fp registers. - * No fp register is valid in memory. - */ memset(&fp_state, 0, sizeof(fp_state_t)); /* + * compute fp_state. only FP registers f6 - f11 are used by the + * vmm, so set those bits in the mask and set the low volatile + * pointer to point to these registers. + */ + fp_state.bitmask_low64 = 0xfc0; /* bit6..bit11 */ + + fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) ®s->f6; + + /* * unsigned long (*EFI_FPSWA) ( * unsigned long trap_type, * void *Bundle, @@ -545,10 +550,6 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim, status = vmm_handle_fpu_swa(0, regs, isr); if (!status) return ; - else if (-EAGAIN == status) { - vcpu_decrement_iip(vcpu); - return ; - } break; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2822c8ccfaaf..5f81256287f5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -125,6 +125,10 @@ static void kvmppc_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvmppc_free_vcpus(kvm); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index be8497186b96..0d33893e1e89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -212,6 +212,10 @@ static void kvm_free_vcpus(struct kvm *kvm) } } +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_free_vcpus(kvm); diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index d2e3bf3608af..886c9402ec45 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,13 @@ #include <linux/types.h> #include <linux/ioctl.h> +/* Select x86 specific features in <linux/kvm.h> */ +#define __KVM_HAVE_PIT +#define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_DEVICE_ASSIGNMENT +#define __KVM_HAVE_MSI +#define __KVM_HAVE_USER_NMI + /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index fb039cd345d8..6428aa17b40e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1157,8 +1157,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) data->cpu = pol->cpu; data->currpstate = HW_PSTATE_INVALID; - rc = powernow_k8_cpu_init_acpi(data); - if (rc) { + if (powernow_k8_cpu_init_acpi(data)) { /* * Use the PSB BIOS structure. This is only availabe on * an UP version, and is deprecated by AMD. @@ -1176,17 +1175,20 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI maintainers and complain to your BIOS " "vendor.\n"); #endif - goto err_out; + kfree(data); + return -ENODEV; } if (pol->cpu != 0) { printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " "CPU other than CPU0. Complain to your BIOS " "vendor.\n"); - goto err_out; + kfree(data); + return -ENODEV; } rc = find_psb_table(data); if (rc) { - goto err_out; + kfree(data); + return -ENODEV; } /* Take a crude guess here. * That guess was in microseconds, so multiply with 1000 */ diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e665d1c623ca..72bd275a9b5c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -207,7 +207,7 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) hrtimer_add_expires_ns(&pt->timer, pt->period); pt->scheduled = hrtimer_get_expires_ns(&pt->timer); if (pt->period) - ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer); + ps->channels[0].count_load_time = ktime_get(); return (pt->period == 0 ? 0 : 1); } diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c019b8edcdb7..cf17ed52f6fb 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -87,13 +87,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - kvm_apic_timer_intr_post(vcpu, vec); - /* TODO: PIT, RTC etc. */ -} -EXPORT_SYMBOL_GPL(kvm_timer_intr_post); - void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2bf32a03ceec..82579ee538d0 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -89,7 +89,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afac68c0815c..f0b67f2cdd69 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -35,6 +35,12 @@ #include "kvm_cache_regs.h" #include "irq.h" +#ifndef CONFIG_X86_64 +#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) +#else +#define mod_64(x, y) ((x) % (y)) +#endif + #define PRId64 "d" #define PRIx64 "llx" #define PRIu64 "u" @@ -511,52 +517,22 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - u64 counter_passed; - ktime_t passed, now; + ktime_t remaining; + s64 ns; u32 tmcct; ASSERT(apic != NULL); - now = apic->timer.dev.base->get_time(); - tmcct = apic_get_reg(apic, APIC_TMICT); - /* if initial count is 0, current count should also be 0 */ - if (tmcct == 0) + if (apic_get_reg(apic, APIC_TMICT) == 0) return 0; - if (unlikely(ktime_to_ns(now) <= - ktime_to_ns(apic->timer.last_update))) { - /* Wrap around */ - passed = ktime_add(( { - (ktime_t) { - .tv64 = KTIME_MAX - - (apic->timer.last_update).tv64}; } - ), now); - apic_debug("time elapsed\n"); - } else - passed = ktime_sub(now, apic->timer.last_update); - - counter_passed = div64_u64(ktime_to_ns(passed), - (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); - - if (counter_passed > tmcct) { - if (unlikely(!apic_lvtt_period(apic))) { - /* one-shot timers stick at 0 until reset */ - tmcct = 0; - } else { - /* - * periodic timers reset to APIC_TMICT when they - * hit 0. The while loop simulates this happening N - * times. (counter_passed %= tmcct) would also work, - * but might be slower or not work on 32-bit?? - */ - while (counter_passed > tmcct) - counter_passed -= tmcct; - tmcct -= counter_passed; - } - } else { - tmcct -= counter_passed; - } + remaining = hrtimer_expires_remaining(&apic->timer.dev); + if (ktime_to_ns(remaining) < 0) + remaining = ktime_set(0, 0); + + ns = mod_64(ktime_to_ns(remaining), apic->timer.period); + tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); return tmcct; } @@ -653,8 +629,6 @@ static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now = apic->timer.dev.base->get_time(); - apic->timer.last_update = now; - apic->timer.period = apic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->timer.divide_count; atomic_set(&apic->timer.pending, 0); @@ -1110,16 +1084,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) } } -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) - apic->timer.last_update = ktime_add_ns( - apic->timer.last_update, - apic->timer.period); -} - int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 81858881287e..45ab6ee71209 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -12,7 +12,6 @@ struct kvm_lapic { atomic_t pending; s64 period; /* unit: ns */ u32 divide_count; - ktime_t last_update; struct hrtimer dev; } timer; struct kvm_vcpu *vcpu; @@ -42,7 +41,6 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); int kvm_lapic_enabled(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 83f11c7474a1..2d4477c71473 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1698,8 +1698,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (largepage) spte |= PT_PAGE_SIZE_MASK; if (mt_mask) { - mt_mask = get_memory_type(vcpu, gfn) << - kvm_x86_ops->get_mt_mask_shift(); + if (!kvm_is_mmio_pfn(pfn)) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + mt_mask |= VMX_EPT_IGMT_BIT; + } else + mt_mask = MTRR_TYPE_UNCACHABLE << + kvm_x86_ops->get_mt_mask_shift(); spte |= mt_mask; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1452851ae258..a9e769e4e251 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1600,7 +1600,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) /* Okay, we can deliver the interrupt: grab it and update PIC state. */ intr_vector = kvm_cpu_get_interrupt(vcpu); svm_inject_irq(svm, intr_vector); - kvm_timer_intr_post(vcpu, intr_vector); out: update_cr8_intercept(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6259d7467648..7611af576829 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -903,6 +903,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { data = msr->data; @@ -3285,7 +3286,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) } if (vcpu->arch.interrupt.pending) { vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); - kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); if (kvm_cpu_has_interrupt(vcpu)) enable_irq_window(vcpu); } @@ -3687,8 +3687,7 @@ static int __init vmx_init(void) if (vm_need_ept()) { bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | - VMX_EPT_WRITABLE_MASK | - VMX_EPT_IGMT_BIT); + VMX_EPT_WRITABLE_MASK); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc17546a2406..758b7a155ae9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -967,7 +967,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: - case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: @@ -992,6 +991,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_CLOCKSOURCE: + r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC); + break; default: r = 0; break; @@ -4127,9 +4129,13 @@ static void kvm_free_vcpus(struct kvm *kvm) } -void kvm_arch_destroy_vm(struct kvm *kvm) +void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f4b7c79023ff..fa9e41626bfc 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -61,6 +61,8 @@ /* global iommu list, set NULL for ignored DMAR units */ static struct intel_iommu **g_iommus; +static int rwbf_quirk = 0; + /* * 0: Present * 1-11: Reserved @@ -785,7 +787,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) u32 val; unsigned long flag; - if (!cap_rwbf(iommu->cap)) + if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; val = iommu->gcmd | DMA_GCMD_WBF; @@ -3137,3 +3139,13 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, }; + +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) +{ + /* Mobile 4 Series Chipset neglects to set RWBF capability, + but needs it */ + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n"); + rwbf_quirk = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aafc9eba1c25..b0c87dce66a3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); - if (len == EXT4_MAX_REC_LEN) + if (len == EXT4_MAX_REC_LEN || len == 0) return 1 << 16; return len; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be1329..cbd2ca99d113 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); @@ -2437,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2488,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - if (wbc->range_cyclic) + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { index = mapping->writeback_index; - else + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; @@ -2504,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; +retry: while (!ret && wbc->nr_to_write > 0) { /* @@ -2546,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping, pages_written += mpd.pages_written; wbc->pages_skipped = pages_skipped; ret = 0; + io_done = 1; } else if (wbc->nr_to_write) /* * There is no more writeout needed @@ -2554,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping, */ break; } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } if (pages_skipped != wbc->pages_skipped) printk(KERN_EMERG "This should not happen leaving %s " "with nr_to_write = %ld ret = %d\n", @@ -2561,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping, /* Update index */ index += pages_written; + wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index deba54f6cbed..4415beeb0b62 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 0; @@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 1; @@ -4476,23 +4479,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. - */ - if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); - } } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->alloc_semp) up_read(ac->alloc_semp); + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() + */ + if (pa->pa_linear && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 734abca25e35..fe64d9f79852 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, @@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -618,8 +617,7 @@ err_out: ext4_journal_stop(handle); - if (tmp_inode) - iput(tmp_inode); + iput(tmp_inode); return retval; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5f06a5f045e..a5732c58f676 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; + tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, + target); + } } else { ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index eb343008eded..58144102bf25 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __jbd2_log_start_commit(journal, tid); - if (ret && ptid) + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7d..28ce21d8598e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/namespace.c b/fs/namespace.c index 228d8c4bfd18..06f8e63f6cb1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt) */ for_each_possible_cpu(cpu) { struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; spin_lock(&cpu_writer->lock); + if (cpu_writer->mnt != mnt) { + spin_unlock(&cpu_writer->lock); + continue; + } atomic_add(cpu_writer->count, &mnt->__mnt_writers); cpu_writer->count = 0; /* diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307c..172850a9a12a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 83c51aba534b..e16fdb1f4f4f 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -478,7 +478,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 8) #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte }) +#define __pte_to_swp_entry(_pte) ((swp_entry_t) { (_pte).pte }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) static inline int pte_file(pte_t pte) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b28b37eb11c6..4d248b3f1323 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1150,7 +1150,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 5715f1907601..0424326f1679 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -58,10 +58,10 @@ struct kvm_irqchip { __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 +#ifdef __KVM_HAVE_PIT struct kvm_pic_state pic; #endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) +#ifdef __KVM_HAVE_IOAPIC struct kvm_ioapic_state ioapic; #endif } chip; @@ -384,16 +384,16 @@ struct kvm_trace_rec { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#if defined(CONFIG_X86)||defined(CONFIG_IA64) +#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT #define KVM_CAP_DEVICE_ASSIGNMENT 17 #endif #define KVM_CAP_IOMMU 18 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_MSI #define KVM_CAP_DEVICE_MSI 20 #endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 -#if defined(CONFIG_X86) +#ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ec49d0be7f52..bf6f703642fc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ void kvm_free_physmem(struct kvm *kvm); struct kvm *kvm_arch_create_vm(void); void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); +void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e9693a29d00e..4c4037503600 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -73,14 +73,13 @@ static int kvm_iommu_map_memslots(struct kvm *kvm) { int i, r = 0; - down_read(&kvm->slots_lock); for (i = 0; i < kvm->nmemslots; i++) { r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, kvm->memslots[i].npages); if (r) break; } - up_read(&kvm->slots_lock); + return r; } @@ -190,12 +189,11 @@ static void kvm_iommu_put_pages(struct kvm *kvm, static int kvm_iommu_unmap_memslots(struct kvm *kvm) { int i; - down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, kvm->memslots[i].npages); } - up_read(&kvm->slots_lock); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3a5a08298aab..29a667ce35b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -173,7 +173,6 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) assigned_dev->host_irq_disabled = false; } mutex_unlock(&assigned_dev->kvm->lock); - kvm_put_kvm(assigned_dev->kvm); } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) @@ -181,8 +180,6 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; - kvm_get_kvm(assigned_dev->kvm); - schedule_work(&assigned_dev->interrupt_work); disable_irq_nosync(irq); @@ -213,6 +210,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) } } +/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ static void kvm_free_assigned_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { @@ -228,11 +226,24 @@ static void kvm_free_assigned_irq(struct kvm *kvm, if (!assigned_dev->irq_requested_type) return; - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); + /* + * In kvm_free_device_irq, cancel_work_sync return true if: + * 1. work is scheduled, and then cancelled. + * 2. work callback is executed. + * + * The first one ensured that the irq is disabled and no more events + * would happen. But for the second one, the irq may be enabled (e.g. + * for MSI). So we disable irq here to prevent further events. + * + * Notice this maybe result in nested disable if the interrupt type is + * INTx, but it's OK for we are going to free it. + * + * If this function is a part of VM destroy, please ensure that till + * now, the kvm state is still legal for probably we also have to wait + * interrupt_work done. + */ + disable_irq_nosync(assigned_dev->host_irq); + cancel_work_sync(&assigned_dev->interrupt_work); free_irq(assigned_dev->host_irq, (void *)assigned_dev); @@ -285,8 +296,8 @@ static int assigned_device_update_intx(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!msi2intx && - adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) { - free_irq(adev->host_irq, (void *)kvm); + (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { + free_irq(adev->host_irq, (void *)adev); pci_disable_msi(adev->dev); } @@ -455,6 +466,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + down_read(&kvm->slots_lock); mutex_lock(&kvm->lock); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, @@ -516,6 +528,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, out: mutex_unlock(&kvm->lock); + up_read(&kvm->slots_lock); return r; out_list_del: list_del(&match->list); @@ -527,6 +540,7 @@ out_put: out_free: kfree(match); mutex_unlock(&kvm->lock); + up_read(&kvm->slots_lock); return r; } #endif @@ -789,11 +803,19 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static void kvm_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + kvm_arch_flush_shadow(kvm); +} + static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_page = kvm_mmu_notifier_invalidate_page, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .release = kvm_mmu_notifier_release, }; #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ @@ -883,6 +905,7 @@ static void kvm_destroy_vm(struct kvm *kvm) { struct mm_struct *mm = kvm->mm; + kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); |