From 4ff542163397073f86eda484318d61980ff1031d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Sep 2023 00:15:26 -0700 Subject: iommufd: Support allocating nested parent domain Extend IOMMU_HWPT_ALLOC to allocate domains to be used as parent (stage-2) in nested translation. Add IOMMU_HWPT_ALLOC_NEST_PARENT to the uAPI. Link: https://lore.kernel.org/r/20230928071528.26258-5-yi.l.liu@intel.com Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b4ba0c0cbab6..4a7c5c8fdbb4 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -347,10 +347,20 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a domain which can serve + * as the parent domain in the nesting + * configuration. + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for * @pt_id: The IOAS to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT -- cgit v1.2.3 From b5f9e63278d6f32789478acf1ed41d21d92b36cf Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 17 Oct 2023 11:15:52 -0700 Subject: iommufd: Correct IOMMU_HWPT_ALLOC_NEST_PARENT description The IOMMU_HWPT_ALLOC_NEST_PARENT flag is used to allocate a HWPT. Though a HWPT holds a domain in the core structure, it is still quite confusing to describe it using "domain" in the uAPI kdoc. Correct it to "HWPT". Fixes: 4ff542163397 ("iommufd: Support allocating nested parent domain") Link: https://lore.kernel.org/r/20231017181552.12667-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4a7c5c8fdbb4..be7a95042677 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -349,9 +349,8 @@ struct iommu_vfio_ioas { /** * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation - * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a domain which can serve - * as the parent domain in the nesting - * configuration. + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, -- cgit v1.2.3 From 5f9bdbf4c65860cc8b9c544d92bfd76fbea8d9c5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:56 +0100 Subject: iommufd: Add a flag to enforce dirty tracking on attach Throughout IOMMU domain lifetime that wants to use dirty tracking, some guarantees are needed such that any device attached to the iommu_domain supports dirty tracking. The idea is to handle a case where IOMMU in the system are assymetric feature-wise and thus the capability may not be supported for all devices. The enforcement is done by adding a flag into HWPT_ALLOC namely: IOMMU_HWPT_ALLOC_DIRTY_TRACKING .. Passed in HWPT_ALLOC ioctl() flags. The enforcement is done by creating a iommu_domain via domain_alloc_user() and validating the requested flags with what the device IOMMU supports (and failing accordingly) advertised). Advertising the new IOMMU domain feature flag requires that the individual iommu driver capability is supported when a future device attachment happens. Link: https://lore.kernel.org/r/20231024135109.73787-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 4 +++- include/uapi/linux/iommufd.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8b3d2875d642..dd50ca9e2c09 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -157,7 +157,9 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) struct iommufd_ioas *ioas; int rc; - if ((cmd->flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT)) || cmd->__reserved) + if ((cmd->flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING)) || + cmd->__reserved) return -EOPNOTSUPP; idev = iommufd_get_device(ucmd, cmd->dev_id); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index be7a95042677..c76248410120 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -351,9 +351,12 @@ struct iommu_vfio_ioas { * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; /** -- cgit v1.2.3 From e2a4b294784957fc28ecb1fed8a7e69da18eb18d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:57 +0100 Subject: iommufd: Add IOMMU_HWPT_SET_DIRTY_TRACKING Every IOMMU driver should be able to implement the needed iommu domain ops to control dirty tracking. Connect a hw_pagetable to the IOMMU core dirty tracking ops, specifically the ability to enable/disable dirty tracking on an IOMMU domain (hw_pagetable id). To that end add an io_pagetable kernel API to toggle dirty tracking: * iopt_set_dirty_tracking(iopt, [domain], state) The intended caller of this is via the hw_pagetable object that is created. Internally it will ensure the leftover dirty state is cleared /right before/ dirty tracking starts. This is also useful for iommu drivers which may decide that dirty tracking is always-enabled at boot without wanting to toggle dynamically via corresponding iommu domain op. Link: https://lore.kernel.org/r/20231024135109.73787-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 24 +++++++++++++++ drivers/iommu/iommufd/io_pagetable.c | 54 +++++++++++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 12 ++++++++ drivers/iommu/iommufd/main.c | 3 ++ include/uapi/linux/iommufd.h | 28 +++++++++++++++++ 5 files changed, 121 insertions(+) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index dd50ca9e2c09..c3b7bd9bfcbb 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -196,3 +196,27 @@ out_put_idev: iommufd_put_object(&idev->obj); return rc; } + +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + bool enable; + + if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) + return rc; + + hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + ioas = hwpt->ioas; + enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; + + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt->domain, enable); + + iommufd_put_object(&hwpt->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 3a598182b761..41c2efb6ff15 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -412,6 +412,60 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +static int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iopt_area *area; + int ret = 0; + + lockdep_assert_held_read(&iopt->iova_rwsem); + + iommu_dirty_bitmap_init(&dirty, NULL, &gather); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (!area->pages) + continue; + + ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), + iopt_area_length(area), 0, + &dirty); + if (ret) + break; + } + + iommu_iotlb_sync(domain, &gather); + return ret; +} + +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + int ret = 0; + + if (!ops) + return -EOPNOTSUPP; + + down_read(&iopt->iova_rwsem); + + /* Clear dirty bits from PTEs to ensure a clean snapshot */ + if (enable) { + ret = iopt_clear_dirty_data(iopt, domain); + if (ret) + goto out_unlock; + } + + ret = ops->set_dirty_tracking(domain, enable); + +out_unlock: + up_read(&iopt->iova_rwsem); + return ret; +} + int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 3064997a0181..b09750848da6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,7 @@ #include #include #include +#include struct iommu_domain; struct iommu_group; @@ -70,6 +71,9 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable); + void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, @@ -240,6 +244,14 @@ struct iommufd_hw_pagetable { struct list_head hwpt_item; }; +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HW_PAGETABLE), + struct iommufd_hw_pagetable, obj); +} +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index e71523cbd0de..46fedd779714 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -342,6 +343,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, + struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c76248410120..5c82b68c88f3 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -47,6 +47,7 @@ enum { IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, }; /** @@ -453,4 +454,31 @@ struct iommu_hw_info { __u32 __reserved; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) #endif -- cgit v1.2.3 From b9a60d6f850e4470017b60f731220a58cda199aa Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:58 +0100 Subject: iommufd: Add IOMMU_HWPT_GET_DIRTY_BITMAP Connect a hw_pagetable to the IOMMU core dirty tracking read_and_clear_dirty iommu domain op. It exposes all of the functionality for the UAPI that read the dirtied IOVAs while clearing the Dirty bits from the PTEs. In doing so, add an IO pagetable API iopt_read_and_clear_dirty_data() that performs the reading of dirty IOPTEs for a given IOVA range and then copying back to userspace bitmap. Underneath it uses the IOMMU domain kernel API which will read the dirty bits, as well as atomically clearing the IOPTE dirty bit and flushing the IOTLB at the end. The IOVA bitmaps usage takes care of the iteration of the bitmaps user pages efficiently and without copies. Within the iterator function we iterate over io-pagetable contigous areas that have been mapped. Contrary to past incantation of a similar interface in VFIO the IOVA range to be scanned is tied in to the bitmap size, thus the application needs to pass a appropriately sized bitmap address taking into account the iova range being passed *and* page size ... as opposed to allowing bitmap-iova != iova. Link: https://lore.kernel.org/r/20231024135109.73787-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 22 +++++++ drivers/iommu/iommufd/io_pagetable.c | 113 ++++++++++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 10 +++ drivers/iommu/iommufd/main.c | 4 ++ include/uapi/linux/iommufd.h | 35 ++++++++++ 5 files changed, 184 insertions(+) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index c3b7bd9bfcbb..7316f69110ef 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -220,3 +220,25 @@ int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) iommufd_put_object(&hwpt->obj); return rc; } + +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + + if ((cmd->flags || cmd->__reserved)) + return -EOPNOTSUPP; + + hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + ioas = hwpt->ioas; + rc = iopt_read_and_clear_dirty_data(&ioas->iopt, hwpt->domain, + cmd->flags, cmd); + + iommufd_put_object(&hwpt->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 41c2efb6ff15..255264e796fb 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "io_pagetable.h" #include "double_span.h" @@ -412,6 +413,118 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +struct iova_bitmap_fn_arg { + struct io_pagetable *iopt; + struct iommu_domain *domain; + struct iommu_dirty_bitmap *dirty; +}; + +static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque) +{ + struct iopt_area *area; + struct iopt_area_contig_iter iter; + struct iova_bitmap_fn_arg *arg = opaque; + struct iommu_domain *domain = arg->domain; + struct iommu_dirty_bitmap *dirty = arg->dirty; + const struct iommu_dirty_ops *ops = domain->dirty_ops; + unsigned long last_iova = iova + length - 1; + int ret; + + iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + ret = ops->read_and_clear_dirty(domain, iter.cur_iova, + last - iter.cur_iova + 1, 0, + dirty); + if (ret) + return ret; + } + + if (!iopt_area_contig_done(&iter)) + return -EINVAL; + return 0; +} + +static int +iommu_read_and_clear_dirty(struct iommu_domain *domain, + struct io_pagetable *iopt, unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iova_bitmap_fn_arg arg; + struct iova_bitmap *iter; + int ret = 0; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, + bitmap->page_size, + u64_to_user_ptr(bitmap->data)); + if (IS_ERR(iter)) + return -ENOMEM; + + iommu_dirty_bitmap_init(&dirty, iter, &gather); + + arg.iopt = iopt; + arg.domain = domain; + arg.dirty = &dirty; + iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); + + iommu_iotlb_sync(domain, &gather); + iova_bitmap_free(iter); + + return ret; +} + +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + size_t iommu_pgsize = iopt->iova_alignment; + u64 last_iova; + + if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) + return -EOVERFLOW; + + if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) + return -EOVERFLOW; + + if ((bitmap->iova & (iommu_pgsize - 1)) || + ((last_iova + 1) & (iommu_pgsize - 1))) + return -EINVAL; + + if (!bitmap->page_size) + return -EINVAL; + + if ((bitmap->iova & (bitmap->page_size - 1)) || + ((last_iova + 1) & (bitmap->page_size - 1))) + return -EINVAL; + + return 0; +} + +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + int ret; + + ret = iommufd_check_iova_range(iopt, bitmap); + if (ret) + return ret; + + down_read(&iopt->iova_rwsem); + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); + up_read(&iopt->iova_rwsem); + + return ret; +} + static int iopt_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index b09750848da6..034129130db3 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include struct iommu_domain; @@ -71,6 +73,10 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap); int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable); @@ -226,6 +232,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object @@ -252,6 +260,8 @@ iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_hw_pagetable, obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); + struct iommufd_hw_pagetable * iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 46fedd779714..d50f42a730aa 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -343,6 +344,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, + struct iommu_hwpt_get_dirty_bitmap, data), IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, @@ -555,5 +558,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS(IOMMUFD_INTERNAL); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 5c82b68c88f3..dce38e32ca84 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -48,6 +48,7 @@ enum { IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, }; /** @@ -481,4 +482,38 @@ struct iommu_hwpt_set_dirty_tracking { }; #define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Must be zero + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. + * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + #endif -- cgit v1.2.3 From 7623683857e52b75184d37862c70f1230aef2edd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:50:59 +0100 Subject: iommufd: Add capabilities to IOMMU_GET_HW_INFO Extend IOMMUFD_CMD_GET_HW_INFO op to query generic iommu capabilities for a given device. Capabilities are IOMMU agnostic and use device_iommu_capable() API passing one of the IOMMU_CAP_*. Enumerate IOMMU_CAP_DIRTY_TRACKING for now in the out_capabilities field returned back to userspace. Link: https://lore.kernel.org/r/20231024135109.73787-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 4 ++++ include/uapi/linux/iommufd.h | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index e88fa73a45e6..2a41fd2b6ef8 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1185,6 +1185,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ cmd->data_len = data_len; + cmd->out_capabilities = 0; + if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dce38e32ca84..036ebc6c19cf 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -418,6 +418,20 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD, }; +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -429,6 +443,8 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -453,6 +469,7 @@ struct iommu_hw_info { __aligned_u64 data_uptr; __u32 out_data_type; __u32 __reserved; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) -- cgit v1.2.3 From 609848132c71316df3260d1ec066539c21bba585 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 24 Oct 2023 14:51:00 +0100 Subject: iommufd: Add a flag to skip clearing of IOPTE dirty VFIO has an operation where it unmaps an IOVA while returning a bitmap with the dirty data. In reality the operation doesn't quite query the IO pagetables that the PTE was dirty or not. Instead it marks as dirty on anything that was mapped, and doing so in one syscall. In IOMMUFD the equivalent is done in two operations by querying with GET_DIRTY_IOVA followed by UNMAP_IOVA. However, this would incur two TLB flushes given that after clearing dirty bits IOMMU implementations require invalidating their IOTLB, plus another invalidation needed for the UNMAP. To allow dirty bits to be queried faster, add a flag (IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR) that requests to not clear the dirty bits from the PTE (but just reading them), under the expectation that the next operation is the unmap. An alternative is to unmap and just perpectually mark as dirty as that's the same behaviour as today. So here equivalent functionally can be provided with unmap alone, and if real dirty info is required it will amortize the cost while querying. There's still a race against DMA where in theory the unmap of the IOVA (when the guest invalidates the IOTLB via emulated iommu) would race against the VF performing DMA on the same IOVA. As discussed in [0], we are accepting to resolve this race as throwing away the DMA and it doesn't matter if it hit physical DRAM or not, the VM can't tell if we threw it away because the DMA was blocked or because we failed to copy the DRAM. [0] https://lore.kernel.org/linux-iommu/20220502185239.GR8364@nvidia.com/ Link: https://lore.kernel.org/r/20231024135109.73787-10-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- drivers/iommu/iommufd/io_pagetable.c | 9 +++++++-- include/uapi/linux/iommufd.h | 15 ++++++++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 7316f69110ef..72a5269984b0 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -228,7 +228,8 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) struct iommufd_ioas *ioas; int rc = -EOPNOTSUPP; - if ((cmd->flags || cmd->__reserved)) + if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || + cmd->__reserved) return -EOPNOTSUPP; hwpt = iommufd_get_hwpt(ucmd, cmd->hwpt_id); diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 255264e796fb..9f060abe53b6 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -414,6 +414,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, } struct iova_bitmap_fn_arg { + unsigned long flags; struct io_pagetable *iopt; struct iommu_domain *domain; struct iommu_dirty_bitmap *dirty; @@ -430,13 +431,14 @@ static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, struct iommu_dirty_bitmap *dirty = arg->dirty; const struct iommu_dirty_ops *ops = domain->dirty_ops; unsigned long last_iova = iova + length - 1; + unsigned long flags = arg->flags; int ret; iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); ret = ops->read_and_clear_dirty(domain, iter.cur_iova, - last - iter.cur_iova + 1, 0, + last - iter.cur_iova + 1, flags, dirty); if (ret) return ret; @@ -470,12 +472,15 @@ iommu_read_and_clear_dirty(struct iommu_domain *domain, iommu_dirty_bitmap_init(&dirty, iter, &gather); + arg.flags = flags; arg.iopt = iopt; arg.domain = domain; arg.dirty = &dirty; iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); - iommu_iotlb_sync(domain, &gather); + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) + iommu_iotlb_sync(domain, &gather); + iova_bitmap_free(iter); return ret; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 036ebc6c19cf..c44eecf5d318 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -500,11 +500,24 @@ struct iommu_hwpt_set_dirty_tracking { #define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + /** * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) * @hwpt_id: HW pagetable ID that represents the IOMMU domain - * @flags: Must be zero + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags * @__reserved: Must be 0 * @iova: base IOVA of the bitmap first bit * @length: IOVA range size -- cgit v1.2.3 From bd529dbb661d62bd9f03e44c9fc837d98a190499 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 25 Oct 2023 21:39:35 -0700 Subject: iommufd: Add a nested HW pagetable object IOMMU_HWPT_ALLOC already supports iommu_domain allocation for usersapce. But it can only allocate a hw_pagetable that associates to a given IOAS, i.e. only a kernel-managed hw_pagetable of IOMMUFD_OBJ_HWPT_PAGING type. IOMMU drivers can now support user-managed hw_pagetables, for two-stage translation use cases that require user data input from the user space. Add a new IOMMUFD_OBJ_HWPT_NESTED type with its abort/destroy(). Pair it with a new iommufd_hwpt_nested structure and its to_hwpt_nested() helper. Update the to_hwpt_paging() helper, so a NESTED-type hw_pagetable can be handled in the callers, for example iommufd_hw_pagetable_enforce_rr(). Screen the inputs including the parent PAGING-type hw_pagetable that has a need of a new nest_parent flag in the iommufd_hwpt_paging structure. Extend the IOMMU_HWPT_ALLOC ioctl to accept an IOMMU driver specific data input which is tagged by the enum iommu_hwpt_data_type. Also, update the @pt_id to accept hwpt_id too besides an ioas_id. Then, use them to allocate a hw_pagetable of IOMMUFD_OBJ_HWPT_NESTED type using the iommufd_hw_pagetable_alloc_nested() allocator. Link: https://lore.kernel.org/r/20231026043938.63898-8-yi.l.liu@intel.com Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 3 +- drivers/iommu/iommufd/hw_pagetable.c | 109 ++++++++++++++++++++++++++++++-- drivers/iommu/iommufd/iommufd_private.h | 28 ++++++-- drivers/iommu/iommufd/main.c | 4 ++ include/uapi/linux/iommufd.h | 31 ++++++++- 5 files changed, 159 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index a99ce4547353..59d3a07300d9 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -588,7 +588,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, } hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, - immediate_attach); + immediate_attach, NULL); if (IS_ERR(hwpt_paging)) { destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; @@ -628,6 +628,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { + case IOMMUFD_OBJ_HWPT_NESTED: case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 198ecbd536f7..2abbeafdbd22 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -44,6 +44,22 @@ void iommufd_hwpt_paging_abort(struct iommufd_object *obj) iommufd_hwpt_paging_destroy(obj); } +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) +{ + struct iommufd_hwpt_nested *hwpt_nested = + container_of(obj, struct iommufd_hwpt_nested, common.obj); + + if (hwpt_nested->common.domain) + iommu_domain_free(hwpt_nested->common.domain); + + refcount_dec(&hwpt_nested->parent->common.obj.users); +} + +void iommufd_hwpt_nested_abort(struct iommufd_object *obj) +{ + iommufd_hwpt_nested_destroy(obj); +} + static int iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) { @@ -68,6 +84,8 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) * @idev: Device to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt + * @user_data: The user provided driver specific data describing the domain to + * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain @@ -80,7 +98,8 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, - bool immediate_attach) + bool immediate_attach, + const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING; @@ -91,7 +110,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, lockdep_assert_held(&ioas->mutex); - if (flags && !ops->domain_alloc_user) + if ((flags || user_data) && !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); @@ -106,10 +125,11 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); hwpt_paging->ioas = ioas; + hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (ops->domain_alloc_user) { - hwpt->domain = - ops->domain_alloc_user(idev->dev, flags, NULL, NULL); + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, + user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -169,9 +189,70 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device + * @ictx: iommufd context + * @parent: Parent PAGING-type hwpt to associate the domain with + * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as + * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of + * being a parent. + */ +static struct iommufd_hwpt_nested * +iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *parent, + struct iommufd_device *idev, u32 flags, + const struct iommu_user_data *user_data) +{ + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags || !user_data->len || !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (parent->auto_domain || !parent->nest_parent) + return ERR_PTR(-EINVAL); + + hwpt_nested = __iommufd_object_alloc( + ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + refcount_inc(&parent->common.obj.users); + hwpt_nested->parent = parent; + + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + parent->common.domain, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hw_pagetable *hwpt; struct iommufd_ioas *ioas = NULL; struct iommufd_object *pt_obj; @@ -180,6 +261,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) if (cmd->__reserved) return -EOPNOTSUPP; + if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) @@ -196,13 +279,27 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); - hwpt_paging = iommufd_hwpt_paging_alloc(ucmd->ictx, ioas, idev, - cmd->flags, false); + hwpt_paging = iommufd_hwpt_paging_alloc( + ucmd->ictx, ioas, idev, cmd->flags, false, + user_data.len ? &user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; } hwpt = &hwpt_paging->common; + } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_nested *hwpt_nested; + + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, + container_of(pt_obj, struct iommufd_hwpt_paging, + common.obj), + idev, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index cd8da289ed0b..a74cfefffbc6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -124,6 +124,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -255,10 +256,16 @@ struct iommufd_hwpt_paging { bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; + bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; +struct iommufd_hwpt_nested { + struct iommufd_hw_pagetable common; + struct iommufd_hwpt_paging *parent; +}; + static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) { return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; @@ -283,25 +290,32 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, - bool immediate_attach); + bool immediate_attach, + const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); +void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); - lockdep_assert_not_held(&hwpt_paging->ioas->mutex); - if (hwpt_paging->auto_domain) - iommufd_object_deref_user(ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + + if (hwpt_paging->auto_domain) { + iommufd_object_deref_user(ictx, &hwpt->obj); + return; + } + } + refcount_dec(&hwpt->obj.users); } struct iommufd_group { diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ab6675a7f6b4..45b9d40773b1 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -492,6 +492,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, }, + [IOMMUFD_OBJ_HWPT_NESTED] = { + .destroy = iommufd_hwpt_nested_destroy, + .abort = iommufd_hwpt_nested_abort, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c44eecf5d318..d816deac906f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -361,20 +361,44 @@ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -383,6 +407,9 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) -- cgit v1.2.3 From 82b6661c9c35e60946dee536545b4848f25eafab Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 25 Oct 2023 21:42:09 -0700 Subject: iommufd: Add data structure for Intel VT-d stage-1 domain allocation This adds IOMMU_HWPT_DATA_VTD_S1 for stage-1 hw_pagetable of Intel VT-d and the corressponding data structure for userspace specified parameter for the domain allocation. Link: https://lore.kernel.org/r/20231026044216.64964-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index d816deac906f..3ce5ee5f09b6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -361,12 +361,42 @@ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, }; +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, }; /** -- cgit v1.2.3 From 03476e687eb07b94f7cdb07cd3c7c4304b6c58b3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 25 Oct 2023 21:42:16 -0700 Subject: iommu/vt-d: Disallow read-only mappings to nest parent domain When remapping hardware is configured by system software in scalable mode as Nested (PGTT=011b) and with PWSNP field Set in the PASID-table-entry, it may Set Accessed bit and Dirty bit (and Extended Access bit if enabled) in first-stage page-table entries even when second-stage mappings indicate that corresponding first-stage page-table is Read-Only. As the result, contents of pages designated by VMM as Read-Only can be modified by IOMMU via PML5E (PML4E for 4-level tables) access as part of address translation process due to DMAs issued by Guest. This disallows read-only mappings in the domain that is supposed to be used as nested parent. Reference from Sapphire Rapids Specification Update [1], errata details, SPR17. Userspace should know this limitation by checking the IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 flag reported in the IOMMU_GET_HW_INFO ioctl. [1] https://www.intel.com/content/www/us/en/content-details/772415/content-details.html Link: https://lore.kernel.org/r/20231026044216.64964-9-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- drivers/iommu/intel/iommu.c | 6 ++++++ include/uapi/linux/iommufd.h | 12 +++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4ce372d5d4f3..a2c429855cc0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2194,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { + pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { @@ -4850,6 +4855,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) if (!vtd) return ERR_PTR(-ENOMEM); + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 3ce5ee5f09b6..0b2bc6252e2c 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -443,10 +443,20 @@ struct iommu_hwpt_alloc { }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. + * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec -- cgit v1.2.3