aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.clang-format1
-rw-r--r--.mailmap7
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/core-api/xarray.rst435
-rw-r--r--MAINTAINERS17
-rw-r--r--arch/parisc/kernel/syscall.S2
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h4
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h4
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c17
-rw-r--r--drivers/input/keyboard/hilkbd.c2
-rw-r--r--drivers/pci/hotplug/acpiphp.h2
-rw-r--r--drivers/pci/hotplug/acpiphp_core.c4
-rw-r--r--drivers/pci/hotplug/acpiphp_glue.c2
-rw-r--r--drivers/staging/erofs/utils.c18
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/extent_io.c12
-rw-r--r--fs/buffer.c14
-rw-r--r--fs/dax.c917
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/f2fs/data.c6
-rw-r--r--fs/f2fs/dir.c2
-rw-r--r--fs/f2fs/f2fs.h2
-rw-r--r--fs/f2fs/inline.c2
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/fs-writeback.c25
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/inode.c2
-rw-r--r--fs/isofs/dir.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nilfs2/btnode.c26
-rw-r--r--fs/nilfs2/page.c29
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--include/linux/fs.h63
-rw-r--r--include/linux/idr.h18
-rw-r--r--include/linux/pagemap.h10
-rw-r--r--include/linux/pagevec.h8
-rw-r--r--include/linux/radix-tree.h178
-rw-r--r--include/linux/swap.h22
-rw-r--r--include/linux/swapops.h19
-rw-r--r--include/linux/xarray.h1293
-rw-r--r--kernel/memremap.c75
-rw-r--r--lib/Kconfig5
-rw-r--r--lib/Kconfig.debug3
-rw-r--r--lib/Makefile3
-rw-r--r--lib/idr.c401
-rw-r--r--lib/radix-tree.c834
-rw-r--r--lib/test_xarray.c1238
-rw-r--r--lib/xarray.c2036
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/filemap.c724
-rw-r--r--mm/huge_memory.c17
-rw-r--r--mm/khugepaged.c178
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c2
-rw-r--r--mm/memfd.c105
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/page-writeback.c72
-rw-r--r--mm/readahead.c10
-rw-r--r--mm/shmem.c193
-rw-r--r--mm/swap.c6
-rw-r--r--mm/swap_state.c119
-rw-r--r--mm/truncate.c27
-rw-r--r--mm/vmscan.c10
-rw-r--r--mm/workingset.c68
-rw-r--r--tools/include/asm-generic/bitops.h1
-rw-r--r--tools/include/asm-generic/bitops/atomic.h9
-rw-r--r--tools/include/asm-generic/bitops/non-atomic.h109
-rw-r--r--tools/include/linux/bitmap.h1
-rw-r--r--tools/include/linux/kernel.h1
-rw-r--r--tools/include/linux/spinlock.h12
-rw-r--r--tools/testing/radix-tree/.gitignore1
-rw-r--r--tools/testing/radix-tree/Makefile11
-rw-r--r--tools/testing/radix-tree/benchmark.c141
-rw-r--r--tools/testing/radix-tree/bitmap.c23
-rw-r--r--tools/testing/radix-tree/generated/autoconf.h2
-rw-r--r--tools/testing/radix-tree/idr-test.c71
-rw-r--r--tools/testing/radix-tree/iteration_check.c109
-rw-r--r--tools/testing/radix-tree/linux/bug.h1
-rw-r--r--tools/testing/radix-tree/linux/kconfig.h1
-rw-r--r--tools/testing/radix-tree/linux/kernel.h5
-rw-r--r--tools/testing/radix-tree/linux/lockdep.h11
-rw-r--r--tools/testing/radix-tree/linux/radix-tree.h1
-rw-r--r--tools/testing/radix-tree/linux/rcupdate.h2
-rw-r--r--tools/testing/radix-tree/main.c66
-rw-r--r--tools/testing/radix-tree/multiorder.c609
-rw-r--r--tools/testing/radix-tree/regression1.c75
-rw-r--r--tools/testing/radix-tree/regression2.c8
-rw-r--r--tools/testing/radix-tree/regression3.c23
-rw-r--r--tools/testing/radix-tree/tag_check.c33
-rw-r--r--tools/testing/radix-tree/test.c131
-rw-r--r--tools/testing/radix-tree/test.h13
-rw-r--r--tools/testing/radix-tree/xarray.c35
93 files changed, 7052 insertions, 3821 deletions
diff --git a/.clang-format b/.clang-format
index 1d5da22e0ba5..e6080f5834a3 100644
--- a/.clang-format
+++ b/.clang-format
@@ -323,7 +323,6 @@ ForEachMacros:
- 'protocol_for_each_card'
- 'protocol_for_each_dev'
- 'queue_for_each_hw_ctx'
- - 'radix_tree_for_each_contig'
- 'radix_tree_for_each_slot'
- 'radix_tree_for_each_tagged'
- 'rbtree_postorder_for_each_entry_safe'
diff --git a/.mailmap b/.mailmap
index 285e09645b31..89f532caf639 100644
--- a/.mailmap
+++ b/.mailmap
@@ -119,6 +119,13 @@ Mark Brown <broonie@sirena.org.uk>
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
+Matthew Wilcox <willy@infradead.org> <matthew.r.wilcox@intel.com>
+Matthew Wilcox <willy@infradead.org> <matthew@wil.cx>
+Matthew Wilcox <willy@infradead.org> <mawilcox@linuxonhyperv.com>
+Matthew Wilcox <willy@infradead.org> <mawilcox@microsoft.com>
+Matthew Wilcox <willy@infradead.org> <willy@debian.org>
+Matthew Wilcox <willy@infradead.org> <willy@linux.intel.com>
+Matthew Wilcox <willy@infradead.org> <willy@parisc-linux.org>
Matthieu CASTET <castet.matthieu@free.fr>
Mauro Carvalho Chehab <mchehab@kernel.org> <mchehab@brturbo.com.br>
Mauro Carvalho Chehab <mchehab@kernel.org> <maurochehab@gmail.com>
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 29c790f571a5..3adee82be311 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -21,6 +21,7 @@ Core utilities
local_ops
workqueue
genericirq
+ xarray
flexible-arrays
librs
genalloc
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
new file mode 100644
index 000000000000..a4e705108f42
--- /dev/null
+++ b/Documentation/core-api/xarray.rst
@@ -0,0 +1,435 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+======
+XArray
+======
+
+:Author: Matthew Wilcox
+
+Overview
+========
+
+The XArray is an abstract data type which behaves like a very large array
+of pointers. It meets many of the same needs as a hash or a conventional
+resizable array. Unlike a hash, it allows you to sensibly go to the
+next or previous entry in a cache-efficient manner. In contrast to a
+resizable array, there is no need to copy data or change MMU mappings in
+order to grow the array. It is more memory-efficient, parallelisable
+and cache friendly than a doubly-linked list. It takes advantage of
+RCU to perform lookups without locking.
+
+The XArray implementation is efficient when the indices used are densely
+clustered; hashing the object and using the hash as the index will not
+perform well. The XArray is optimised for small indices, but still has
+good performance with large indices. If your index can be larger than
+``ULONG_MAX`` then the XArray is not the data type for you. The most
+important user of the XArray is the page cache.
+
+Each non-``NULL`` entry in the array has three bits associated with
+it called marks. Each mark may be set or cleared independently of
+the others. You can iterate over entries which are marked.
+
+Normal pointers may be stored in the XArray directly. They must be 4-byte
+aligned, which is true for any pointer returned from :c:func:`kmalloc` and
+:c:func:`alloc_page`. It isn't true for arbitrary user-space pointers,
+nor for function pointers. You can store pointers to statically allocated
+objects, as long as those objects have an alignment of at least 4.
+
+You can also store integers between 0 and ``LONG_MAX`` in the XArray.
+You must first convert it into an entry using :c:func:`xa_mk_value`.
+When you retrieve an entry from the XArray, you can check whether it is
+a value entry by calling :c:func:`xa_is_value`, and convert it back to
+an integer by calling :c:func:`xa_to_value`.
+
+Some users want to store tagged pointers instead of using the marks
+described above. They can call :c:func:`xa_tag_pointer` to create an
+entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry
+back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve
+the tag of an entry. Tagged pointers use the same bits that are used
+to distinguish value entries from normal pointers, so each user must
+decide whether they want to store value entries or tagged pointers in
+any particular XArray.
+
+The XArray does not support storing :c:func:`IS_ERR` pointers as some
+conflict with value entries or internal entries.
+
+An unusual feature of the XArray is the ability to create entries which
+occupy a range of indices. Once stored to, looking up any index in
+the range will return the same entry as looking up any other index in
+the range. Setting a mark on one index will set it on all of them.
+Storing to any index will store to all of them. Multi-index entries can
+be explicitly split into smaller entries, or storing ``NULL`` into any
+entry will cause the XArray to forget about the range.
+
+Normal API
+==========
+
+Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY`
+for statically allocated XArrays or :c:func:`xa_init` for dynamically
+allocated ones. A freshly-initialised XArray contains a ``NULL``
+pointer at every index.
+
+You can then set entries using :c:func:`xa_store` and get entries
+using :c:func:`xa_load`. xa_store will overwrite any entry with the
+new entry and return the previous entry stored at that index. You can
+use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a
+``NULL`` entry. There is no difference between an entry that has never
+been stored to and one that has most recently had ``NULL`` stored to it.
+
+You can conditionally replace an entry at an index by using
+:c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if
+the entry at that index has the 'old' value. It also returns the entry
+which was at that index; if it returns the same entry which was passed as
+'old', then :c:func:`xa_cmpxchg` succeeded.
+
+If you want to only store a new entry to an index if the current entry
+at that index is ``NULL``, you can use :c:func:`xa_insert` which
+returns ``-EEXIST`` if the entry is not empty.
+
+You can enquire whether a mark is set on an entry by using
+:c:func:`xa_get_mark`. If the entry is not ``NULL``, you can set a mark
+on it by using :c:func:`xa_set_mark` and remove the mark from an entry by
+calling :c:func:`xa_clear_mark`. You can ask whether any entry in the
+XArray has a particular mark set by calling :c:func:`xa_marked`.
+
+You can copy entries out of the XArray into a plain array by calling
+:c:func:`xa_extract`. Or you can iterate over the present entries in
+the XArray by calling :c:func:`xa_for_each`. You may prefer to use
+:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present
+entry in the XArray.
+
+Calling :c:func:`xa_store_range` stores the same entry in a range
+of indices. If you do this, some of the other operations will behave
+in a slightly odd way. For example, marking the entry at one index
+may result in the entry being marked at some, but not all of the other
+indices. Storing into one index may result in the entry retrieved by
+some, but not all of the other indices changing.
+
+Finally, you can remove all entries from an XArray by calling
+:c:func:`xa_destroy`. If the XArray entries are pointers, you may wish
+to free the entries first. You can do this by iterating over all present
+entries in the XArray using the :c:func:`xa_for_each` iterator.
+
+ID assignment
+-------------
+
+You can call :c:func:`xa_alloc` to store the entry at any unused index
+in the XArray. If you need to modify the array from interrupt context,
+you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable
+interrupts while allocating the ID. Unlike :c:func:`xa_store`, allocating
+a ``NULL`` pointer does not delete an entry. Instead it reserves an
+entry like :c:func:`xa_reserve` and you can release it using either
+:c:func:`xa_erase` or :c:func:`xa_release`. To use ID assignment, the
+XArray must be defined with :c:func:`DEFINE_XARRAY_ALLOC`, or initialised
+by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`,
+
+Memory allocation
+-----------------
+
+The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`,
+:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t
+parameter in case the XArray needs to allocate memory to store this entry.
+If the entry is being deleted, no memory allocation needs to be performed,
+and the GFP flags specified will be ignored.
+
+It is possible for no memory to be allocatable, particularly if you pass
+a restrictive set of GFP flags. In that case, the functions return a
+special value which can be turned into an errno using :c:func:`xa_err`.
+If you don't need to know exactly which error occurred, using
+:c:func:`xa_is_err` is slightly more efficient.
+
+Locking
+-------
+
+When using the Normal API, you do not have to worry about locking.
+The XArray uses RCU and an internal spinlock to synchronise access:
+
+No lock needed:
+ * :c:func:`xa_empty`
+ * :c:func:`xa_marked`
+
+Takes RCU read lock:
+ * :c:func:`xa_load`
+ * :c:func:`xa_for_each`
+ * :c:func:`xa_find`
+ * :c:func:`xa_find_after`
+ * :c:func:`xa_extract`
+ * :c:func:`xa_get_mark`
+
+Takes xa_lock internally:
+ * :c:func:`xa_store`
+ * :c:func:`xa_insert`
+ * :c:func:`xa_erase`
+ * :c:func:`xa_erase_bh`
+ * :c:func:`xa_erase_irq`
+ * :c:func:`xa_cmpxchg`
+ * :c:func:`xa_store_range`
+ * :c:func:`xa_alloc`
+ * :c:func:`xa_alloc_bh`
+ * :c:func:`xa_alloc_irq`
+ * :c:func:`xa_destroy`
+ * :c:func:`xa_set_mark`
+ * :c:func:`xa_clear_mark`
+
+Assumes xa_lock held on entry:
+ * :c:func:`__xa_store`
+ * :c:func:`__xa_insert`
+ * :c:func:`__xa_erase`
+ * :c:func:`__xa_cmpxchg`
+ * :c:func:`__xa_alloc`
+ * :c:func:`__xa_set_mark`
+ * :c:func:`__xa_clear_mark`
+
+If you want to take advantage of the lock to protect the data structures
+that you are storing in the XArray, you can call :c:func:`xa_lock`
+before calling :c:func:`xa_load`, then take a reference count on the
+object you have found before calling :c:func:`xa_unlock`. This will
+prevent stores from removing the object from the array between looking
+up the object and incrementing the refcount. You can also use RCU to
+avoid dereferencing freed memory, but an explanation of that is beyond
+the scope of this document.
+
+The XArray does not disable interrupts or softirqs while modifying
+the array. It is safe to read the XArray from interrupt or softirq
+context as the RCU lock provides enough protection.
+
+If, for example, you want to store entries in the XArray in process
+context and then erase them in softirq context, you can do that this way::
+
+ void foo_init(struct foo *foo)
+ {
+ xa_init_flags(&foo->array, XA_FLAGS_LOCK_BH);
+ }
+
+ int foo_store(struct foo *foo, unsigned long index, void *entry)
+ {
+ int err;
+
+ xa_lock_bh(&foo->array);
+ err = xa_err(__xa_store(&foo->array, index, entry, GFP_KERNEL));
+ if (!err)
+ foo->count++;
+ xa_unlock_bh(&foo->array);
+ return err;
+ }
+
+ /* foo_erase() is only called from softirq context */
+ void foo_erase(struct foo *foo, unsigned long index)
+ {
+ xa_lock(&foo->array);
+ __xa_erase(&foo->array, index);
+ foo->count--;
+ xa_unlock(&foo->array);
+ }
+
+If you are going to modify the XArray from interrupt or softirq context,
+you need to initialise the array using :c:func:`xa_init_flags`, passing
+``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``.
+
+The above example also shows a common pattern of wanting to extend the
+coverage of the xa_lock on the store side to protect some statistics
+associated with the array.
+
+Sharing the XArray with interrupt context is also possible, either
+using :c:func:`xa_lock_irqsave` in both the interrupt handler and process
+context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock`
+in the interrupt handler. Some of the more common patterns have helper
+functions such as :c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`.
+
+Sometimes you need to protect access to the XArray with a mutex because
+that lock sits above another mutex in the locking hierarchy. That does
+not entitle you to use functions like :c:func:`__xa_erase` without taking
+the xa_lock; the xa_lock is used for lockdep validation and will be used
+for other purposes in the future.
+
+The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also
+available for situations where you look up an entry and want to atomically
+set or clear a mark. It may be more efficient to use the advanced API
+in this case, as it will save you from walking the tree twice.
+
+Advanced API
+============
+
+The advanced API offers more flexibility and better performance at the
+cost of an interface which can be harder to use and has fewer safeguards.
+No locking is done for you by the advanced API, and you are required
+to use the xa_lock while modifying the array. You can choose whether
+to use the xa_lock or the RCU lock while doing read-only operations on
+the array. You can mix advanced and normal operations on the same array;
+indeed the normal API is implemented in terms of the advanced API. The
+advanced API is only available to modules with a GPL-compatible license.
+
+The advanced API is based around the xa_state. This is an opaque data
+structure which you declare on the stack using the :c:func:`XA_STATE`
+macro. This macro initialises the xa_state ready to start walking
+around the XArray. It is used as a cursor to maintain the position
+in the XArray and let you compose various operations together without
+having to restart from the top every time.
+
+The xa_state is also used to store errors. You can call
+:c:func:`xas_error` to retrieve the error. All operations check whether
+the xa_state is in an error state before proceeding, so there's no need
+for you to check for an error after each call; you can make multiple
+calls in succession and only check at a convenient point. The only
+errors currently generated by the XArray code itself are ``ENOMEM`` and
+``EINVAL``, but it supports arbitrary errors in case you want to call
+:c:func:`xas_set_err` yourself.
+
+If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem`
+will attempt to allocate more memory using the specified gfp flags and
+cache it in the xa_state for the next attempt. The idea is that you take
+the xa_lock, attempt the operation and drop the lock. The operation
+attempts to allocate memory while holding the lock, but it is more
+likely to fail. Once you have dropped the lock, :c:func:`xas_nomem`
+can try harder to allocate more memory. It will return ``true`` if it
+is worth retrying the operation (i.e. that there was a memory error *and*
+more memory was allocated). If it has previously allocated memory, and
+that memory wasn't used, and there is no error (or some error that isn't
+``ENOMEM``), then it will free the memory previously allocated.
+
+Internal Entries
+----------------
+
+The XArray reserves some entries for its own purposes. These are never
+exposed through the normal API, but when using the advanced API, it's
+possible to see them. Usually the best way to handle them is to pass them
+to :c:func:`xas_retry`, and retry the operation if it returns ``true``.
+
+.. flat-table::
+ :widths: 1 1 6
+
+ * - Name
+ - Test
+ - Usage
+
+ * - Node
+ - :c:func:`xa_is_node`
+ - An XArray node. May be visible when using a multi-index xa_state.
+
+ * - Sibling
+ - :c:func:`xa_is_sibling`
+ - A non-canonical entry for a multi-index entry. The value indicates
+ which slot in this node has the canonical entry.
+
+ * - Retry
+ - :c:func:`xa_is_retry`
+ - This entry is currently being modified by a thread which has the
+ xa_lock. The node containing this entry may be freed at the end
+ of this RCU period. You should restart the lookup from the head
+ of the array.
+
+ * - Zero
+ - :c:func:`xa_is_zero`
+ - Zero entries appear as ``NULL`` through the Normal API, but occupy
+ an entry in the XArray which can be used to reserve the index for
+ future use.
+
+Other internal entries may be added in the future. As far as possible, they
+will be handled by :c:func:`xas_retry`.
+
+Additional functionality
+------------------------
+
+The :c:func:`xas_create_range` function allocates all the necessary memory
+to store every entry in a range. It will set ENOMEM in the xa_state if
+it cannot allocate memory.
+
+You can use :c:func:`xas_init_marks` to reset the marks on an entry
+to their default state. This is usually all marks clear, unless the
+XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set
+and all other marks are clear. Replacing one entry with another using
+:c:func:`xas_store` will not reset the marks on that entry; if you want
+the marks reset, you should do that explicitly.
+
+The :c:func:`xas_load` will walk the xa_state as close to the entry
+as it can. If you know the xa_state has already been walked to the
+entry and need to check that the entry hasn't changed, you can use
+:c:func:`xas_reload` to save a function call.
+
+If you need to move to a different index in the XArray, call
+:c:func:`xas_set`. This resets the cursor to the top of the tree, which
+will generally make the next operation walk the cursor to the desired
+spot in the tree. If you want to move to the next or previous index,
+call :c:func:`xas_next` or :c:func:`xas_prev`. Setting the index does
+not walk the cursor around the array so does not require a lock to be
+held, while moving to the next or previous index does.
+
+You can search for the next present entry using :c:func:`xas_find`. This
+is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`;
+if the cursor has been walked to an entry, then it will find the next
+entry after the one currently referenced. If not, it will return the
+entry at the index of the xa_state. Using :c:func:`xas_next_entry` to
+move to the next present entry instead of :c:func:`xas_find` will save
+a function call in the majority of cases at the expense of emitting more
+inline code.
+
+The :c:func:`xas_find_marked` function is similar. If the xa_state has
+not been walked, it will return the entry at the index of the xa_state,
+if it is marked. Otherwise, it will return the first marked entry after
+the entry referenced by the xa_state. The :c:func:`xas_next_marked`
+function is the equivalent of :c:func:`xas_next_entry`.
+
+When iterating over a range of the XArray using :c:func:`xas_for_each`
+or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop
+the iteration. The :c:func:`xas_pause` function exists for this purpose.
+After you have done the necessary work and wish to resume, the xa_state
+is in an appropriate state to continue the iteration after the entry
+you last processed. If you have interrupts disabled while iterating,
+then it is good manners to pause the iteration and reenable interrupts
+every ``XA_CHECK_SCHED`` entries.
+
+The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and
+:c:func:`xas_clear_mark` functions require the xa_state cursor to have
+been moved to the appropriate location in the xarray; they will do
+nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set`
+immediately before.
+
+You can call :c:func:`xas_set_update` to have a callback function
+called each time the XArray updates a node. This is used by the page
+cache workingset code to maintain its list of nodes which contain only
+shadow entries.
+
+Multi-Index Entries
+-------------------
+
+The XArray has the ability to tie multiple indices together so that
+operations on one index affect all indices. For example, storing into
+any index will change the value of the entry retrieved from any index.
+Setting or clearing a mark on any index will set or clear the mark
+on every index that is tied together. The current implementation
+only allows tying ranges which are aligned powers of two together;
+eg indices 64-127 may be tied together, but 2-6 may not be. This may
+save substantial quantities of memory; for example tying 512 entries
+together will save over 4kB.
+
+You can create a multi-index entry by using :c:func:`XA_STATE_ORDER`
+or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`.
+Calling :c:func:`xas_load` with a multi-index xa_state will walk the
+xa_state to the right location in the tree, but the return value is not
+meaningful, potentially being an internal entry or ``NULL`` even when there
+is an entry stored within the range. Calling :c:func:`xas_find_conflict`
+will return the first entry within the range or ``NULL`` if there are no
+entries in the range. The :c:func:`xas_for_each_conflict` iterator will
+iterate over every entry which overlaps the specified range.
+
+If :c:func:`xas_load` encounters a multi-index entry, the xa_index
+in the xa_state will not be changed. When iterating over an XArray
+or calling :c:func:`xas_find`, if the initial index is in the middle
+of a multi-index entry, it will not be altered. Subsequent calls
+or iterations will move the index to the first index in the range.
+Each entry will only be returned once, no matter how many indices it
+occupies.
+
+Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state
+is not supported. Using either of these functions on a multi-index entry
+will reveal sibling entries; these should be skipped over by the caller.
+
+Storing ``NULL`` into any index of a multi-index entry will set the entry
+at every index to ``NULL`` and dissolve the tie. Splitting a multi-index
+entry into entries occupying smaller ranges is not yet supported.
+
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/xarray.h
+.. kernel-doc:: lib/xarray.c
diff --git a/MAINTAINERS b/MAINTAINERS
index bfc9722a4932..a78d45755881 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -535,7 +535,7 @@ F: Documentation/hwmon/adt7475
F: drivers/hwmon/adt7475.c
ADVANSYS SCSI DRIVER
-M: Matthew Wilcox <matthew@wil.cx>
+M: Matthew Wilcox <willy@infradead.org>
M: Hannes Reinecke <hare@suse.com>
L: linux-scsi@vger.kernel.org
S: Maintained
@@ -4393,7 +4393,7 @@ S: Maintained
F: drivers/i2c/busses/i2c-diolan-u2c.c
FILESYSTEM DIRECT ACCESS (DAX)
-M: Matthew Wilcox <mawilcox@microsoft.com>
+M: Matthew Wilcox <willy@infradead.org>
M: Ross Zwisler <zwisler@kernel.org>
M: Jan Kara <jack@suse.cz>
L: linux-fsdevel@vger.kernel.org
@@ -8697,7 +8697,7 @@ F: drivers/message/fusion/
F: drivers/scsi/mpt3sas/
LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers
-M: Matthew Wilcox <matthew@wil.cx>
+M: Matthew Wilcox <willy@infradead.org>
L: linux-scsi@vger.kernel.org
S: Maintained
F: drivers/scsi/sym53c8xx_2/
@@ -16137,6 +16137,17 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso
S: Maintained
F: arch/x86/entry/vdso/
+XARRAY
+M: Matthew Wilcox <willy@infradead.org>
+L: linux-fsdevel@vger.kernel.org
+S: Supported
+F: Documentation/core-api/xarray.rst
+F: lib/idr.c
+F: lib/xarray.c
+F: include/linux/idr.h
+F: include/linux/xarray.h
+F: tools/testing/radix-tree
+
XC2028/3028 TUNER DRIVER
M: Mauro Carvalho Chehab <mchehab@kernel.org>
L: linux-media@vger.kernel.org
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index f5f22ea9b97e..9505c317818d 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -2,7 +2,7 @@
* Linux/PA-RISC Project (http://www.parisc-linux.org/)
*
* System call entry code / Linux gateway page
- * Copyright (c) Matthew Wilcox 1999 <willy@bofh.ai>
+ * Copyright (c) Matthew Wilcox 1999 <willy@infradead.org>
* Licensed under the GNU GPL.
* thanks to Philipp Rumpf, Mike Shaver and various others
* sorry about the wall, puffin..
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c4a726c10af5..6c99e846a8c9 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -716,9 +716,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 67421f74efcf..e77ed9761632 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -350,9 +350,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
#define MAX_SWAPFILES_CHECK() do { \
BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
} while (0)
-/*
- * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
- */
+
#define SWP_TYPE_BITS 5
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
& ((1UL << SWP_TYPE_BITS) - 1))
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fcc73a6ab503..316730b45f84 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5996,7 +5996,8 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
count = __sg_page_count(sg);
while (idx + count <= n) {
- unsigned long exception, i;
+ void *entry;
+ unsigned long i;
int ret;
/* If we cannot allocate and insert this entry, or the
@@ -6011,12 +6012,9 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
if (ret && ret != -EEXIST)
goto scan;
- exception =
- RADIX_TREE_EXCEPTIONAL_ENTRY |
- idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
+ entry = xa_mk_value(idx);
for (i = 1; i < count; i++) {
- ret = radix_tree_insert(&iter->radix, idx + i,
- (void *)exception);
+ ret = radix_tree_insert(&iter->radix, idx + i, entry);
if (ret && ret != -EEXIST)
goto scan;
}
@@ -6054,15 +6052,14 @@ lookup:
GEM_BUG_ON(!sg);
/* If this index is in the middle of multi-page sg entry,
- * the radixtree will contain an exceptional entry that points
+ * the radix tree will contain a value entry that points
* to the start of that range. We will return the pointer to
* the base page and the offset of this page within the
* sg entry's range.
*/
*offset = 0;
- if (unlikely(radix_tree_exception(sg))) {
- unsigned long base =
- (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ if (unlikely(xa_is_value(sg))) {
+ unsigned long base = xa_to_value(sg);
sg = radix_tree_lookup(&iter->radix, base);
GEM_BUG_ON(!sg);
diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c
index 5c7afdec192c..f5c5ae8b6c06 100644
--- a/drivers/input/keyboard/hilkbd.c
+++ b/drivers/input/keyboard/hilkbd.c
@@ -2,7 +2,7 @@
* linux/drivers/hil/hilkbd.c
*
* Copyright (C) 1998 Philip Blundell <philb@gnu.org>
- * Copyright (C) 1999 Matthew Wilcox <willy@bofh.ai>
+ * Copyright (C) 1999 Matthew Wilcox <willy@infradead.org>
* Copyright (C) 1999-2007 Helge Deller <deller@gmx.de>
*
* Very basic HP Human Interface Loop (HIL) driver.
diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h
index cf3058404f41..a2094c07af6a 100644
--- a/drivers/pci/hotplug/acpiphp.h
+++ b/drivers/pci/hotplug/acpiphp.h
@@ -8,7 +8,7 @@
* Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
* Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
* Copyright (C) 2002,2003 NEC Corporation
- * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com)
+ * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
* Copyright (C) 2003-2005 Hewlett Packard
*
* All rights reserved.
diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c
index c9e2bd40c038..853e04ad272c 100644
--- a/drivers/pci/hotplug/acpiphp_core.c
+++ b/drivers/pci/hotplug/acpiphp_core.c
@@ -8,7 +8,7 @@
* Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
* Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
* Copyright (C) 2002,2003 NEC Corporation
- * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com)
+ * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
* Copyright (C) 2003-2005 Hewlett Packard
*
* All rights reserved.
@@ -40,7 +40,7 @@ bool acpiphp_disabled;
static struct acpiphp_attention_info *attention_info;
#define DRIVER_VERSION "0.5"
-#define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@us.ibm.com>, Takayoshi Kochi <t-kochi@bq.jp.nec.com>, Matthew Wilcox <willy@hp.com>"
+#define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@us.ibm.com>, Takayoshi Kochi <t-kochi@bq.jp.nec.com>, Matthew Wilcox <willy@infradead.org>"
#define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver"
MODULE_AUTHOR(DRIVER_AUTHOR);
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index 12afa7fdf77e..e4c46637f32f 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -5,7 +5,7 @@
* Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com)
* Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com)
* Copyright (C) 2002,2003 NEC Corporation
- * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com)
+ * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org)
* Copyright (C) 2003-2005 Hewlett Packard
* Copyright (C) 2005 Rajesh Shah (rajesh.shah@intel.com)
* Copyright (C) 2005 Intel Corporation
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 595cf90af9bb..bdee9bd09f11 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -35,7 +35,6 @@ static atomic_long_t erofs_global_shrink_cnt;
#ifdef CONFIG_EROFS_FS_ZIP
-/* radix_tree and the future XArray both don't use tagptr_t yet */
struct erofs_workgroup *erofs_find_workgroup(
struct super_block *sb, pgoff_t index, bool *tag)
{
@@ -47,9 +46,8 @@ repeat:
rcu_read_lock();
grp = radix_tree_lookup(&sbi->workstn_tree, index);
if (grp != NULL) {
- *tag = radix_tree_exceptional_entry(grp);
- grp = (void *)((unsigned long)grp &
- ~RADIX_TREE_EXCEPTIONAL_ENTRY);
+ *tag = xa_pointer_tag(grp);
+ grp = xa_untag_pointer(grp);
if (erofs_workgroup_get(grp, &oldcount)) {
/* prefer to relax rcu read side */
@@ -83,9 +81,7 @@ int erofs_register_workgroup(struct super_block *sb,
sbi = EROFS_SB(sb);
erofs_workstn_lock(sbi);
- if (tag)
- grp = (void *)((unsigned long)grp |
- 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+ grp = xa_tag_pointer(grp, tag);
err = radix_tree_insert(&sbi->workstn_tree,
grp->index, grp);
@@ -131,9 +127,7 @@ repeat:
for (i = 0; i < found; ++i) {
int cnt;
- struct erofs_workgroup *grp = (void *)
- ((unsigned long)batch[i] &
- ~RADIX_TREE_EXCEPTIONAL_ENTRY);
+ struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
first_index = grp->index + 1;
@@ -150,8 +144,8 @@ repeat:
#endif
continue;
- if (radix_tree_delete(&sbi->workstn_tree,
- grp->index) != grp) {
+ if (xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+ grp->index)) != grp) {
#ifdef EROFS_FS_HAS_MANAGED_CACHE
skip:
erofs_workgroup_unfreeze(grp, 1);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8703ce68fe9d..2955a4ea2fa8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, pg_index);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, pg_index);
+ if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..d228f706ff3e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
- int tag;
+ xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
@@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page)) {
- radix_tree_tag_clear(&page->mapping->i_pages,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
ClearPageError(page);
unlock_page(page);
diff --git a/fs/buffer.c b/fs/buffer.c
index 109f55196866..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
+ * Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
@@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping,
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&mapping->i_pages, page_index(page),
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
@@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page is tagged dirty in its radix tree.
+ * the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
@@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
- * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
- * backing page dirty, then tag the page as dirty in its address_space's radix
- * tree and then attach the address_space's inode to its superblock's dirty
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set
+ * its backing page dirty, then tag the page as dirty in the page cache
+ * and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
diff --git a/fs/dax.c b/fs/dax.c
index 0fb270f0a0ef..616e36ea6aaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -38,6 +38,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
+static inline unsigned int pe_order(enum page_entry_size pe_size)
+{
+ if (pe_size == PE_SIZE_PTE)
+ return PAGE_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ if (pe_size == PE_SIZE_PUD)
+ return PUD_SHIFT - PAGE_SHIFT;
+ return ~0;
+}
+
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -46,6 +57,9 @@
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
+/* The order of a PMD entry */
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void)
fs_initcall(init_dax_wait_table);
/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a zero page or
- * an empty entry that is just used for locking. In total four special bits.
+ * DAX pagecache entries use XArray value entries so they can't be mistaken
+ * for pages. We use one bit for locking, one bit for the entry size (PMD)
+ * and two more to tell us if the entry is a zero page or an empty entry that
+ * is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+#define DAX_SHIFT (4)
+#define DAX_LOCKED (1UL << 0)
+#define DAX_PMD (1UL << 1)
+#define DAX_ZERO_PAGE (1UL << 2)
+#define DAX_EMPTY (1UL << 3)
-static unsigned long dax_radix_pfn(void *entry)
+static unsigned long dax_to_pfn(void *entry)
{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
+ return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
+static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
+ return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}
-static unsigned int dax_radix_order(void *entry)
+static void *dax_make_page_entry(struct page *page)
{
- if ((unsigned long)entry & RADIX_DAX_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
+ pfn_t pfn = page_to_pfn_t(page);
+ return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0);
+}
+
+static bool dax_is_locked(void *entry)
+{
+ return xa_to_value(entry) & DAX_LOCKED;
+}
+
+static unsigned int dax_entry_order(void *entry)
+{
+ if (xa_to_value(entry) & DAX_PMD)
+ return PMD_ORDER;
return 0;
}
static int dax_is_pmd_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_PMD;
+ return xa_to_value(entry) & DAX_PMD;
}
static int dax_is_pte_entry(void *entry)
{
- return !((unsigned long)entry & RADIX_DAX_PMD);
+ return !(xa_to_value(entry) & DAX_PMD);
}
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
+ return xa_to_value(entry) & DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_EMPTY;
+ return xa_to_value(entry) & DAX_EMPTY;
}
/*
- * DAX radix tree locking
+ * DAX page cache entry locking
*/
struct exceptional_entry_key {
- struct address_space *mapping;
+ struct xarray *xa;
pgoff_t entry_start;
};
@@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+ void *entry, struct exceptional_entry_key *key)
{
unsigned long hash;
+ unsigned long index = xas->xa_index;
/*
* If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
*/
if (dax_is_pmd_entry(entry))
index &= ~PG_PMD_COLOUR;
-
- key->mapping = mapping;
+ key->xa = xas->xa;
key->entry_start = index;
- hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
return wait_table + hash;
}
-static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
- int sync, void *keyp)
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
+ unsigned int mode, int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
struct wait_exceptional_entry_queue *ewait =
container_of(wait, struct wait_exceptional_entry_queue, wait);
- if (key->mapping != ewait->key.mapping ||
+ if (key->xa != ewait->key.xa ||
key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
@@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
*/
-static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
+ wq = dax_entry_waitqueue(xas, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
}
/*
- * Check whether the given slot is locked. Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
- return entry & RADIX_DAX_ENTRY_LOCK;
-}
-
-/*
- * Mark the given slot as locked. Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Mark the given slot as unlocked. Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
- unsigned long entry = (unsigned long)
- radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock);
-
- entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry);
- return (void *)entry;
-}
-
-/*
- * Lookup entry in radix tree, wait for it to become unlocked if it is
- * exceptional entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
+ * Look up entry in page cache, wait for it to become unlocked if it
+ * is a DAX entry and return it. The caller must subsequently call
+ * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
+ * if it did.
*
* Must be called with the i_pages lock held.
*/
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_entry(struct xa_state *xas)
{
- void *entry, **slot;
+ void *entry;
struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
@@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
- bool revalidate;
-
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
- &slot);
- if (!entry ||
- WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
- !slot_locked(mapping, slot)) {
- if (slotp)
- *slotp = slot;
+ entry = xas_load(xas);
+ if (!entry || xa_is_internal(entry) ||
+ WARN_ON_ONCE(!xa_is_value(entry)) ||
+ !dax_is_locked(entry))
return entry;
- }
- wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
- xa_unlock_irq(&mapping->i_pages);
- revalidate = wait_fn();
+ xas_unlock_irq(xas);
+ xas_reset(xas);
+ schedule();
finish_wait(wq, &ewait.wait);
- xa_lock_irq(&mapping->i_pages);
- if (revalidate)
- return ERR_PTR(-EAGAIN);
+ xas_lock_irq(xas);
}
}
-static bool entry_wait(void)
-{
- schedule();
- /*
- * Never return an ERR_PTR() from
- * __get_unlocked_mapping_entry(), just keep looping.
- */
- return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp)
+static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
- return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- void *entry, **slot;
-
- xa_lock_irq(&mapping->i_pages);
- entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
- if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
- !slot_locked(mapping, slot))) {
- xa_unlock_irq(&mapping->i_pages);
- return;
- }
- unlock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ /* If we were the only waiter woken, wake the next one */
+ if (entry)
+ dax_wake_entry(xas, entry, false);
}
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index)
+/*
+ * We used the xa_state to get the entry, but then we locked the entry and
+ * dropped the xa_lock, so we know the xa_state is stale and must be reset
+ * before use.
+ */
+static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
- unlock_mapping_entry(mapping, index);
+ void *old;
+
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ old = xas_store(xas, entry);
+ xas_unlock_irq(xas);
+ BUG_ON(!dax_is_locked(old));
+ dax_wake_entry(xas, entry, false);
}
/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ * Return: The entry stored at this location before it was locked.
*/
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
- if (!entry)
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ unsigned long v = xa_to_value(entry);
+ return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
static unsigned long dax_entry_size(void *entry)
@@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_radix_end_pfn(void *entry)
+static unsigned long dax_end_pfn(void *entry)
{
- return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
/*
@@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry)
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_radix_pfn(entry); \
- pfn < dax_radix_end_pfn(entry); pfn++)
+ for (pfn = dax_to_pfn(entry); \
+ pfn < dax_end_pfn(entry); pfn++)
/*
* TODO: for reflink+dax we need a way to associate a single page with
@@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
-static bool entry_wait_revalidate(void)
-{
- rcu_read_unlock();
- schedule();
- rcu_read_lock();
-
- /*
- * Tell __get_unlocked_mapping_entry() to take a break, we need
- * to revalidate page->mapping after dropping locks
- */
- return true;
-}
-
bool dax_lock_mapping_entry(struct page *page)
{
- pgoff_t index;
- struct inode *inode;
- bool did_lock = false;
- void *entry = NULL, **slot;
- struct address_space *mapping;
+ XA_STATE(xas, NULL, 0);
+ void *entry;
- rcu_read_lock();
for (;;) {
- mapping = READ_ONCE(page->mapping);
+ struct address_space *mapping = READ_ONCE(page->mapping);
if (!dax_mapping(mapping))
- break;
+ return false;
/*
* In the device-dax case there's no need to lock, a
@@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page)
* otherwise we would not have a valid pfn_to_page()
* translation.
*/
- inode = mapping->host;
- if (S_ISCHR(inode->i_mode)) {
- did_lock = true;
- break;
- }
+ if (S_ISCHR(mapping->host->i_mode))
+ return true;
- xa_lock_irq(&mapping->i_pages);
+ xas.xa = &mapping->i_pages;
+ xas_lock_irq(&xas);
if (mapping != page->mapping) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
continue;
}
- index = page->index;
-
- entry = __get_unlocked_mapping_entry(mapping, index, &slot,
- entry_wait_revalidate);
- if (!entry) {
- xa_unlock_irq(&mapping->i_pages);
- break;
- } else if (IS_ERR(entry)) {
- xa_unlock_irq(&mapping->i_pages);
- WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
- continue;
+ xas_set(&xas, page->index);
+ entry = xas_load(&xas);
+ if (dax_is_locked(entry)) {
+ entry = get_unlocked_entry(&xas);
+ /* Did the page move while we slept? */
+ if (dax_to_pfn(entry) != page_to_pfn(page)) {
+ xas_unlock_irq(&xas);
+ continue;
+ }
}
- lock_slot(mapping, slot);
- did_lock = true;
- xa_unlock_irq(&mapping->i_pages);
- break;
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ return true;
}
- rcu_read_unlock();
-
- return did_lock;
}
void dax_unlock_mapping_entry(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
+ XA_STATE(xas, &mapping->i_pages, page->index);
- if (S_ISCHR(inode->i_mode))
+ if (S_ISCHR(mapping->host->i_mode))
return;
- unlock_mapping_entry(mapping, page->index);
+ dax_unlock_entry(&xas, dax_make_page_entry(page));
}
/*
- * Find radix tree entry at given index. If it points to an exceptional entry,
- * return it with the radix tree entry locked. If the radix tree doesn't
- * contain given index, create an empty exceptional entry for the index and
- * return with it locked.
+ * Find page cache entry at given index. If it is a DAX entry, return it
+ * with the entry locked. If the page cache doesn't contain an entry at
+ * that index, add a locked empty entry.
*
- * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
*
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries. This happens for both PMD zero pages as
+ * well as PMD empty entries.
*
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them. We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them. We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
+ *
+ * On error, this function does not return an ERR_PTR. Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
+ * overlap with xarray value entries.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
- unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+ struct address_space *mapping, unsigned long size_flag)
{
- bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
- void *entry, **slot;
-
-restart:
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ unsigned long index = xas->xa_index;
+ bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ void *entry;
- if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
- entry = ERR_PTR(-EIO);
- goto out_unlock;
- }
+retry:
+ xas_lock_irq(xas);
+ entry = get_unlocked_entry(xas);
+ if (xa_is_internal(entry))
+ goto fallback;
if (entry) {
- if (size_flag & RADIX_DAX_PMD) {
+ if (WARN_ON_ONCE(!xa_is_value(entry))) {
+ xas_set_err(xas, EIO);
+ goto out_unlock;
+ }
+
+ if (size_flag & DAX_PMD) {
if (dax_is_pte_entry(entry)) {
- put_unlocked_mapping_entry(mapping, index,
- entry);
- entry = ERR_PTR(-EEXIST);
- goto out_unlock;
+ put_unlocked_entry(xas, entry);
+ goto fallback;
}
} else { /* trying to grab a PTE entry */
if (dax_is_pmd_entry(entry) &&
@@ -530,87 +468,57 @@ restart:
}
}
- /* No entry for given index? Make sure radix tree is big enough. */
- if (!entry || pmd_downgrade) {
- int err;
-
- if (pmd_downgrade) {
- /*
- * Make sure 'entry' remains valid while we drop
- * the i_pages lock.
- */
- entry = lock_slot(mapping, slot);
- }
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * the i_pages lock.
+ */
+ dax_lock_entry(xas, entry);
- xa_unlock_irq(&mapping->i_pages);
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
* unmapped.
*/
- if (pmd_downgrade && dax_is_zero_entry(entry))
- unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
-
- err = radix_tree_preload(
- mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err) {
- if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index);
- return ERR_PTR(err);
- }
- xa_lock_irq(&mapping->i_pages);
-
- if (!entry) {
- /*
- * We needed to drop the i_pages lock while calling
- * radix_tree_preload() and we didn't have an entry to
- * lock. See if another thread inserted an entry at
- * our index during this time.
- */
- entry = __radix_tree_lookup(&mapping->i_pages, index,
- NULL, &slot);
- if (entry) {
- radix_tree_preload_end();
- xa_unlock_irq(&mapping->i_pages);
- goto restart;
- }
+ if (dax_is_zero_entry(entry)) {
+ xas_unlock_irq(xas);
+ unmap_mapping_pages(mapping,
+ xas->xa_index & ~PG_PMD_COLOUR,
+ PG_PMD_NR, false);
+ xas_reset(xas);
+ xas_lock_irq(xas);
}
- if (pmd_downgrade) {
- dax_disassociate_entry(entry, mapping, false);
- radix_tree_delete(&mapping->i_pages, index);
- mapping->nrexceptional--;
- dax_wake_mapping_entry_waiter(mapping, index, entry,
- true);
- }
+ dax_disassociate_entry(entry, mapping, false);
+ xas_store(xas, NULL); /* undo the PMD join */
+ dax_wake_entry(xas, entry, true);
+ mapping->nrexceptional--;
+ entry = NULL;
+ xas_set(xas, index);
+ }
- entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
-
- err = __radix_tree_insert(&mapping->i_pages, index,
- dax_radix_order(entry), entry);
- radix_tree_preload_end();
- if (err) {
- xa_unlock_irq(&mapping->i_pages);
- /*
- * Our insertion of a DAX entry failed, most likely
- * because we were inserting a PMD entry and it
- * collided with a PTE sized entry at a different
- * index in the PMD range. We haven't inserted
- * anything into the radix tree and have no waiters to
- * wake.
- */
- return ERR_PTR(err);
- }
- /* Good, we have inserted empty locked entry into the tree. */
+ if (entry) {
+ dax_lock_entry(xas, entry);
+ } else {
+ entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+ dax_lock_entry(xas, entry);
+ if (xas_error(xas))
+ goto out_unlock;
mapping->nrexceptional++;
- xa_unlock_irq(&mapping->i_pages);
- return entry;
}
- entry = lock_slot(mapping, slot);
- out_unlock:
- xa_unlock_irq(&mapping->i_pages);
+
+out_unlock:
+ xas_unlock_irq(xas);
+ if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+ goto retry;
+ if (xas->xa_node == XA_ERROR(-ENOMEM))
+ return xa_mk_internal(VM_FAULT_OOM);
+ if (xas_error(xas))
+ return xa_mk_internal(VM_FAULT_SIGBUS);
return entry;
+fallback:
+ xas_unlock_irq(xas);
+ return xa_mk_internal(VM_FAULT_FALLBACK);
}
/**
@@ -630,11 +538,10 @@ restart:
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
- pgoff_t indices[PAGEVEC_SIZE];
+ XA_STATE(xas, &mapping->i_pages, 0);
+ void *entry;
+ unsigned int scanned = 0;
struct page *page = NULL;
- struct pagevec pvec;
- pgoff_t index, end;
- unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
@@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
- pagevec_init(&pvec);
- index = 0;
- end = -1;
-
/*
* If we race get_user_pages_fast() here either we'll see the
- * elevated page count in the pagevec_lookup and wait, or
+ * elevated page count in the iteration and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
@@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
*/
unmap_mapping_range(mapping, 0, 0, 1);
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- pgoff_t nr_pages = 1;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *pvec_ent = pvec.pages[i];
- void *entry;
-
- index = indices[i];
- if (index >= end)
- break;
-
- if (WARN_ON_ONCE(
- !radix_tree_exceptional_entry(pvec_ent)))
- continue;
-
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (entry) {
- page = dax_busy_page(entry);
- /*
- * Account for multi-order entries at
- * the end of the pagevec.
- */
- if (i + 1 >= pagevec_count(&pvec))
- nr_pages = 1UL << dax_radix_order(entry);
- }
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
- if (page)
- break;
- }
-
- /*
- * We don't expect normal struct page entries to exist in our
- * tree, but we keep these pagevec calls so that this code is
- * consistent with the common pattern for handling pagevecs
- * throughout the kernel.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index += nr_pages;
-
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (WARN_ON_ONCE(!xa_is_value(entry)))
+ continue;
+ if (unlikely(dax_is_locked(entry)))
+ entry = get_unlocked_entry(&xas);
+ if (entry)
+ page = dax_busy_page(entry);
+ put_unlocked_entry(&xas, entry);
if (page)
break;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
+ xas_unlock_irq(&xas);
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
-static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
+ XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
- struct radix_tree_root *pages = &mapping->i_pages;
- xa_lock_irq(pages);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
- (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) ||
- radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)))
+ (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
+ xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
goto out;
dax_disassociate_entry(entry, mapping, trunc);
- radix_tree_delete(pages, index);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
ret = 1;
out:
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(pages);
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
return ret;
}
+
/*
- * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
- * entry to get unlocked before deleting it.
+ * Delete DAX entry at @index from @mapping. Wait for it
+ * to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
- int ret = __dax_invalidate_mapping_entry(mapping, index, true);
+ int ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
- * radix tree (usually fs-private i_mmap_sem for writing). Since the
- * caller has seen exceptional entry for this index, we better find it
+ * page cache (usually fs-private i_mmap_sem for writing). Since the
+ * caller has seen a DAX entry for this index, we better find it
* at that index as well...
*/
WARN_ON_ONCE(!ret);
@@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
}
/*
- * Invalidate exceptional DAX entry if it is clean.
+ * Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
- return __dax_invalidate_mapping_entry(mapping, index, false);
+ return __dax_invalidate_entry(mapping, index, false);
}
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
@@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
-static void *dax_insert_mapping_entry(struct address_space *mapping,
- struct vm_fault *vmf,
- void *entry, pfn_t pfn_t,
- unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+ struct address_space *mapping, struct vm_fault *vmf,
+ void *entry, pfn_t pfn, unsigned long flags, bool dirty)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- unsigned long pfn = pfn_t_to_pfn(pfn_t);
- pgoff_t index = vmf->pgoff;
- void *new_entry;
+ void *new_entry = dax_make_entry(pfn, flags);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
- PG_PMD_NR, false);
+ PG_PMD_NR, false);
else /* pte entry */
- unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+ unmap_mapping_pages(mapping, index, 1, false);
}
- xa_lock_irq(pages);
- new_entry = dax_radix_locked_entry(pfn, flags);
+ xas_reset(xas);
+ xas_lock_irq(xas);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
- * Only swap our new entry into the radix tree if the current
+ * Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
- * PMD entry is already in the tree, we leave it alone. This
+ * PMD entry is already in the cache, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
- struct radix_tree_node *node;
- void **slot;
- void *ret;
-
- ret = __radix_tree_lookup(pages, index, &node, &slot);
- WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(pages, node, slot,
- new_entry, NULL);
+ void *old = dax_lock_entry(xas, new_entry);
+ WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+ DAX_LOCKED));
entry = new_entry;
+ } else {
+ xas_load(xas); /* Walk the xa_state */
}
if (dirty)
- radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+ xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
+ xas_unlock_irq(xas);
return entry;
}
-static inline unsigned long
-pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+static inline
+unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
@@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
}
/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_mapping_entry_mkclean(struct address_space *mapping,
- pgoff_t index, unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
+ unsigned long pfn)
{
struct vm_area_struct *vma;
pte_t pte, *ptep = NULL;
@@ -937,11 +808,9 @@ unlock_pte:
i_mmap_unlock_read(mapping);
}
-static int dax_writeback_one(struct dax_device *dax_dev,
- struct address_space *mapping, pgoff_t index, void *entry)
+static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
+ struct address_space *mapping, void *entry)
{
- struct radix_tree_root *pages = &mapping->i_pages;
- void *entry2, **slot;
unsigned long pfn;
long ret = 0;
size_t size;
@@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* A page got tagged dirty in DAX mapping? Something is seriously
* wrong.
*/
- if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ if (WARN_ON(!xa_is_value(entry)))
return -EIO;
- xa_lock_irq(pages);
- entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
- /* Entry got punched out / reallocated? */
- if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
- goto put_unlocked;
- /*
- * Entry got reallocated elsewhere? No need to writeback. We have to
- * compare pfns as we must not bail out due to difference in lockbit
- * or entry type.
- */
- if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
- goto put_unlocked;
- if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
- dax_is_zero_entry(entry))) {
- ret = -EIO;
- goto put_unlocked;
+ if (unlikely(dax_is_locked(entry))) {
+ void *old_entry = entry;
+
+ entry = get_unlocked_entry(xas);
+
+ /* Entry got punched out / reallocated? */
+ if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? No need to writeback.
+ * We have to compare pfns as we must not bail out due to
+ * difference in lockbit or entry type.
+ */
+ if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
+ ret = -EIO;
+ goto put_unlocked;
+ }
+
+ /* Another fsync thread may have already done this entry */
+ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
}
- /* Another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))
- goto put_unlocked;
/* Lock the entry to serialize with page faults */
- entry = lock_slot(mapping, slot);
+ dax_lock_entry(xas, entry);
+
/*
* We can clear the tag now but we have to be careful so that concurrent
* dax_writeback_one() calls for the same index cannot finish before we
@@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* at the entry only under the i_pages lock and once they do that
* they will see the entry locked and wait for it to unlock.
*/
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE);
- xa_unlock_irq(pages);
+ xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irq(xas);
/*
* Even if dax_writeback_mapping_range() was given a wbc->range_start
@@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* This allows us to flush for PMD_SIZE and not have to worry about
* partial PMD writebacks.
*/
- pfn = dax_radix_pfn(entry);
- size = PAGE_SIZE << dax_radix_order(entry);
+ pfn = dax_to_pfn(entry);
+ size = PAGE_SIZE << dax_entry_order(entry);
- dax_mapping_entry_mkclean(mapping, index, pfn);
+ dax_entry_mkclean(mapping, xas->xa_index, pfn);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
@@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev,
* the pfn mappings are writeprotected and fault waits for mapping
* entry lock.
*/
- xa_lock_irq(pages);
- radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(pages);
- trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- put_locked_mapping_entry(mapping, index);
+ xas_reset(xas);
+ xas_lock_irq(xas);
+ xas_store(xas, entry);
+ xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
+ dax_wake_entry(xas, entry, false);
+
+ trace_dax_writeback_one(mapping->host, xas->xa_index,
+ size >> PAGE_SHIFT);
return ret;
put_unlocked:
- put_unlocked_mapping_entry(mapping, index, entry2);
- xa_unlock_irq(pages);
+ put_unlocked_entry(xas, entry);
return ret;
}
@@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev,
int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
+ XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index;
- pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
struct dax_device *dax_dev;
- struct pagevec pvec;
- bool done = false;
- int i, ret = 0;
+ void *entry;
+ int ret = 0;
+ unsigned int scanned = 0;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!dax_dev)
return -EIO;
- start_index = wbc->range_start >> PAGE_SHIFT;
- end_index = wbc->range_end >> PAGE_SHIFT;
-
- trace_dax_writeback_range(inode, start_index, end_index);
+ trace_dax_writeback_range(inode, xas.xa_index, end_index);
- tag_pages_for_writeback(mapping, start_index, end_index);
+ tag_pages_for_writeback(mapping, xas.xa_index, end_index);
- pagevec_init(&pvec);
- while (!done) {
- pvec.nr = find_get_entries_tag(mapping, start_index,
- PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
- pvec.pages, indices);
-
- if (pvec.nr == 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
+ ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
break;
-
- for (i = 0; i < pvec.nr; i++) {
- if (indices[i] > end_index) {
- done = true;
- break;
- }
-
- ret = dax_writeback_one(dax_dev, mapping, indices[i],
- pvec.pages[i]);
- if (ret < 0) {
- mapping_set_error(mapping, ret);
- goto out;
- }
}
- start_index = indices[pvec.nr - 1] + 1;
+ if (++scanned % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
-out:
+ xas_unlock_irq(&xas);
put_dax(dax_dev);
- trace_dax_writeback_range_done(inode, start_index, end_index);
- return (ret < 0 ? ret : 0);
+ trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
+ return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1125,16 +990,18 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
- struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+ struct address_space *mapping, void **entry,
+ struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
vm_fault_t ret;
- dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
- false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_ZERO_PAGE, false);
+
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (write && !vmf->cow_page)
flags |= IOMAP_WRITE;
- entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
- if (IS_ERR(entry)) {
- ret = dax_fault_return(PTR_ERR(entry));
+ entry = grab_mapping_entry(&xas, mapping, 0);
+ if (xa_is_internal(entry)) {
+ ret = xa_to_internal(entry);
goto out;
}
@@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto error_finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
0, write && !sync);
/*
@@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(&xas, mapping, &entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff);
+ dax_unlock_entry(&xas, entry);
out:
trace_dax_pte_fault_done(inode, vmf, ret);
return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+ struct iomap *iomap, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct inode *inode = mapping->host;
struct page *zero_page;
- void *ret = NULL;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
@@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback;
pfn = page_to_pfn_t(zero_page);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
+ *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
+ DAX_PMD | DAX_ZERO_PAGE, false);
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
return VM_FAULT_NOPAGE;
fallback:
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
return VM_FAULT_FALLBACK;
}
@@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
- pgoff_t max_pgoff, pgoff;
+ pgoff_t max_pgoff;
void *entry;
loff_t pos;
int error;
@@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* supposed to hold locks serializing us with truncate / punch hole so
* this is a reliable test.
*/
- pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* Make sure that the faulting address's PMD offset (color) matches
* the PMD offset from the start of the file. This is necessary so
* that a PMD range in the page table overlaps exactly with a PMD
- * range in the radix tree.
+ * range in the page cache.
*/
if ((vmf->pgoff & PG_PMD_COLOUR) !=
((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
@@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback;
- if (pgoff >= max_pgoff) {
+ if (xas.xa_index >= max_pgoff) {
result = VM_FAULT_SIGBUS;
goto out;
}
/* If the PMD would extend beyond the file size */
- if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+ if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
- * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
- * is already in the tree, for instance), it will return -EEXIST and
- * we just fall back to 4k entries.
+ * grab_mapping_entry() will make sure we get an empty PMD entry,
+ * a zero PMD entry or a DAX PMD. If it can't (because a PTE
+ * entry is already in the array, for instance), it will return
+ * VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
- if (IS_ERR(entry))
+ entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+ if (xa_is_internal(entry)) {
+ result = xa_to_internal(entry);
goto fallback;
+ }
/*
* It is possible, particularly with mixed reads & writes to private
@@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* setting up a mapping, so really we're using iomap_begin() as a way
* to look up our filesystem block.
*/
- pos = (loff_t)pgoff << PAGE_SHIFT;
+ pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto unlock_entry;
@@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (error < 0)
goto finish_iomap;
- entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_PMD, write && !sync);
+ entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
+ DAX_PMD, write && !sync);
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, entry);
+ result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff);
+ dax_unlock_entry(&xas, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
@@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
-/**
+/*
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
+ * @order: Order of entry to insert.
*
- * This function inserts writeable PTE or PMD entry into page tables for mmaped
- * DAX file. It takes care of marking corresponding radix tree entry as dirty
- * as well.
+ * This function inserts a writeable PTE or PMD entry into the page tables
+ * for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
- enum page_entry_size pe_size,
- pfn_t pfn)
+static vm_fault_t
+dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ void *entry;
vm_fault_t ret;
- xa_lock_irq(&mapping->i_pages);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ xas_lock_irq(&xas);
+ entry = get_unlocked_entry(&xas);
/* Did we race with someone splitting entry or so? */
if (!entry ||
- (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
- (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
- put_unlocked_mapping_entry(mapping, index, entry);
- xa_unlock_irq(&mapping->i_pages);
+ (order == 0 && !dax_is_pte_entry(entry)) ||
+ (order == PMD_ORDER && (xa_is_internal(entry) ||
+ !dax_is_pmd_entry(entry)))) {
+ put_unlocked_entry(&xas, entry);
+ xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
- radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- xa_unlock_irq(&mapping->i_pages);
- switch (pe_size) {
- case PE_SIZE_PTE:
+ xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
+ dax_lock_entry(&xas, entry);
+ xas_unlock_irq(&xas);
+ if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- break;
#ifdef CONFIG_FS_DAX_PMD
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
- break;
#endif
- default:
+ else
ret = VM_FAULT_FALLBACK;
- }
- put_locked_mapping_entry(mapping, index);
+ dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
}
@@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- size_t len = 0;
+ unsigned int order = pe_order(pe_size);
+ size_t len = PAGE_SIZE << order;
- if (pe_size == PE_SIZE_PTE)
- len = PAGE_SIZE;
- else if (pe_size == PE_SIZE_PMD)
- len = PMD_SIZE;
- else
- WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
- return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
+ return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c3d9a42c561e..05f01fbd9c7f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 106f116466bf..b293cb3e27a2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
int nwritten = 0;
pagevec_init(&pvec);
@@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = {
#endif
};
-void f2fs_clear_radix_tree_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2ef84b4590ea..bacc667950b6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 56204a8f8a12..1e031971a466 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_radix_tree_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct page *page);
/*
* gc.c
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index cb31a719b048..7b0cff7e6051 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 2b34206486d8..d338740d0fda 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
if (PageDirty(page)) {
- f2fs_clear_radix_tree_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page);
clear_page_dirty_for_io(page);
dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
@@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (f2fs_check_nid_range(sbi, nid))
return;
- rcu_read_lock();
- apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid);
- rcu_read_unlock();
+ apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
if (apage)
return;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..b40168fcc94a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
- struct radix_tree_iter iter;
+ XA_STATE(xas, &mapping->i_pages, 0);
+ struct page *page;
bool switched = false;
- void **slot;
/*
* By the time control reaches here, RCU grace period has passed
@@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under writeback.
*/
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_DIRTY) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page) && PageDirty(page)) {
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
+ if (PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0,
- PAGECACHE_TAG_WRITEBACK) {
- struct page *page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (likely(page)) {
- WARN_ON_ONCE(!PageWriteback(page));
- dec_wb_stat(old_wb, WB_WRITEBACK);
- inc_wb_stat(new_wb, WB_WRITEBACK);
- }
+ xas_set(&xas, 0);
+ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ dec_wb_stat(old_wb, WB_WRITEBACK);
+ inc_wb_stat(new_wb, WB_WRITEBACK);
}
wb_get(new_wb);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 31e8270d0b26..8afbb35559b9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
pgoff_t done_index;
int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
diff --git a/fs/inode.c b/fs/inode.c
index 42f6d25f32a5..9b808986d440 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
- INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
+ xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 947ce22f5b3c..f0fe641893a5 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod
return i;
}
-/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */
+/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */
int get_acorn_filename(struct iso_directory_record *de,
char *retname, struct inode *inode)
{
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 06cb0c1d9aee..d3781cd983f6 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
- end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
+ end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index ebb24a314f43..de99db518571 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
ctxt->newbh = NULL;
if (inode->i_blkbits == PAGE_SHIFT) {
- lock_page(obh->b_page);
- /*
- * We cannot call radix_tree_preload for the kernels older
- * than 2.6.23, because it is not exported for modules.
- */
+ struct page *opage = obh->b_page;
+ lock_page(opage);
retry:
- err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
- if (err)
- goto failed_unlock;
/* BUG_ON(oldkey != obh->b_page->index); */
- if (unlikely(oldkey != obh->b_page->index))
- NILFS_PAGE_BUG(obh->b_page,
+ if (unlikely(oldkey != opage->index))
+ NILFS_PAGE_BUG(opage,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
xa_lock_irq(&btnc->i_pages);
- err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page);
+ err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
xa_unlock_irq(&btnc->i_pages);
/*
* Note: page->index will not change to newkey until
@@ -193,7 +187,6 @@ retry:
* To protect the page in intermediate state, the page lock
* is held.
*/
- radix_tree_preload_end();
if (!err)
return 0;
else if (err != -EEXIST)
@@ -203,7 +196,7 @@ retry:
if (!err)
goto retry;
/* fallback to copy mode */
- unlock_page(obh->b_page);
+ unlock_page(opage);
}
nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
mark_buffer_dirty(obh);
xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, oldkey);
- radix_tree_tag_set(&btnc->i_pages, newkey,
- PAGECACHE_TAG_DIRTY);
+ __xa_erase(&btnc->i_pages, oldkey);
+ __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&btnc->i_pages);
opage->index = obh->b_blocknr = newkey;
@@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_lock_irq(&btnc->i_pages);
- radix_tree_delete(&btnc->i_pages, newkey);
+ __xa_erase(&btnc->i_pages, newkey);
xa_unlock_irq(&btnc->i_pages);
unlock_page(ctxt->bh->b_page);
} else
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 329a056b73b1..d7fc8d369d89 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -289,7 +289,7 @@ repeat:
* @dmap: destination page cache
* @smap: source page cache
*
- * No pages must no be added to the cache during this process.
+ * No pages must be added to the cache during this process.
* This must be ensured by the caller.
*/
void nilfs_copy_back_pages(struct address_space *dmap,
@@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap,
struct pagevec pvec;
unsigned int i, n;
pgoff_t index = 0;
- int err;
pagevec_init(&pvec);
repeat:
@@ -313,35 +312,34 @@ repeat:
lock_page(page);
dpage = find_lock_page(dmap, offset);
if (dpage) {
- /* override existing page on the destination cache */
+ /* overwrite existing page in the destination cache */
WARN_ON(PageDirty(dpage));
nilfs_copy_page(dpage, page, 0);
unlock_page(dpage);
put_page(dpage);
+ /* Do we not need to remove page from smap here? */
} else {
- struct page *page2;
+ struct page *p;
/* move the page to the destination cache */
xa_lock_irq(&smap->i_pages);
- page2 = radix_tree_delete(&smap->i_pages, offset);
- WARN_ON(page2 != page);
-
+ p = __xa_erase(&smap->i_pages, offset);
+ WARN_ON(page != p);
smap->nrpages--;
xa_unlock_irq(&smap->i_pages);
xa_lock_irq(&dmap->i_pages);
- err = radix_tree_insert(&dmap->i_pages, offset, page);
- if (unlikely(err < 0)) {
- WARN_ON(err == -EEXIST);
+ p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS);
+ if (unlikely(p)) {
+ /* Probably -ENOMEM */
page->mapping = NULL;
- put_page(page); /* for cache */
+ put_page(page);
} else {
page->mapping = dmap;
dmap->nrpages++;
if (PageDirty(page))
- radix_tree_tag_set(&dmap->i_pages,
- offset,
- PAGECACHE_TAG_DIRTY);
+ __xa_set_mark(&dmap->i_pages, offset,
+ PAGECACHE_TAG_DIRTY);
}
xa_unlock_irq(&dmap->i_pages);
}
@@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page)
if (mapping) {
xa_lock_irq(&mapping->i_pages);
if (test_bit(PG_dirty, &page->flags)) {
- radix_tree_tag_clear(&mapping->i_pages,
- page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&mapping->i_pages);
return clear_page_dirty_for_io(page);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a027473561c6..47c3764c469b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
mss->swap += PAGE_SIZE;
else
put_page(page);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 897eae8faee1..771341470bce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,24 +403,40 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
+/**
+ * struct address_space - Contents of a cacheable, mappable object.
+ * @host: Owner, either the inode or the block_device.
+ * @i_pages: Cached pages.
+ * @gfp_mask: Memory allocation flags to use for allocating pages.
+ * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap: Tree of private and shared mappings.
+ * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
+ * @nrpages: Number of page entries, protected by the i_pages lock.
+ * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
+ * @writeback_index: Writeback starts here.
+ * @a_ops: Methods.
+ * @flags: Error bits and flags (AS_*).
+ * @wb_err: The most recent error which has occurred.
+ * @private_lock: For use by the owner of the address_space.
+ * @private_list: For use by the owner of the address_space.
+ * @private_data: For use by the owner of the address_space.
+ */
struct address_space {
- struct inode *host; /* owner: inode, block_device */
- struct radix_tree_root i_pages; /* cached pages */
- atomic_t i_mmap_writable;/* count VM_SHARED mappings */
- struct rb_root_cached i_mmap; /* tree of private and shared mappings */
- struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */
- /* Protected by the i_pages lock */
- unsigned long nrpages; /* number of total pages */
- /* number of shadow or DAX exceptional entries */
+ struct inode *host;
+ struct xarray i_pages;
+ gfp_t gfp_mask;
+ atomic_t i_mmap_writable;
+ struct rb_root_cached i_mmap;
+ struct rw_semaphore i_mmap_rwsem;
+ unsigned long nrpages;
unsigned long nrexceptional;
- pgoff_t writeback_index;/* writeback starts here */
- const struct address_space_operations *a_ops; /* methods */
- unsigned long flags; /* error bits */
- spinlock_t private_lock; /* for use by the address_space */
- gfp_t gfp_mask; /* implicit gfp mask for allocations */
- struct list_head private_list; /* for use by the address_space */
- void *private_data; /* ditto */
+ pgoff_t writeback_index;
+ const struct address_space_operations *a_ops;
+ unsigned long flags;
errseq_t wb_err;
+ spinlock_t private_lock;
+ struct list_head private_list;
+ void *private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
* On most architectures that alignment is already the case; but
@@ -467,15 +483,18 @@ struct block_device {
struct mutex bd_fsfreeze_mutex;
} __randomize_layout;
+/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
+#define PAGECACHE_TAG_DIRTY XA_MARK_0
+#define PAGECACHE_TAG_WRITEBACK XA_MARK_1
+#define PAGECACHE_TAG_TOWRITE XA_MARK_2
+
/*
- * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
- * radix trees
+ * Returns true if any of the pages in the mapping are marked with the tag.
*/
-#define PAGECACHE_TAG_DIRTY 0
-#define PAGECACHE_TAG_WRITEBACK 1
-#define PAGECACHE_TAG_TOWRITE 2
-
-int mapping_tagged(struct address_space *mapping, int tag);
+static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
+{
+ return xa_marked(&mapping->i_pages, tag);
+}
static inline void i_mmap_lock_write(struct address_space *mapping)
{
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 3ec8628ce17f..60daf34b625d 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -214,8 +214,7 @@ static inline void idr_preload_end(void)
++id, (entry) = idr_get_next((idr), &(id)))
/*
- * IDA - IDR based id allocator, use when translation from id to
- * pointer isn't necessary.
+ * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
*/
#define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */
#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long))
@@ -225,14 +224,14 @@ struct ida_bitmap {
unsigned long bitmap[IDA_BITMAP_LONGS];
};
-DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap);
-
struct ida {
- struct radix_tree_root ida_rt;
+ struct xarray xa;
};
+#define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)
+
#define IDA_INIT(name) { \
- .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \
+ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \
}
#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
@@ -292,7 +291,7 @@ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
static inline void ida_init(struct ida *ida)
{
- INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT);
+ xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}
#define ida_simple_get(ida, start, end, gfp) \
@@ -301,9 +300,6 @@ static inline void ida_init(struct ida *ida)
static inline bool ida_is_empty(const struct ida *ida)
{
- return radix_tree_empty(&ida->ida_rt);
+ return xa_empty(&ida->xa);
}
-
-/* in lib/radix-tree.c */
-int ida_pre_get(struct ida *ida, gfp_t gfp_mask);
#endif /* __IDR_H__ */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b1bd2186e6d2..226f96f0dee0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -241,9 +241,9 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
typedef int filler_t(void *, struct page *);
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
#define FGP_ACCESSED 0x00000001
@@ -363,17 +363,17 @@ static inline unsigned find_get_pages(struct address_space *mapping,
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, int tag, unsigned int nr_pages,
+ pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages);
static inline unsigned find_get_pages_tag(struct address_space *mapping,
- pgoff_t *index, int tag, unsigned int nr_pages,
+ pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
nr_pages, pages);
}
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- int tag, unsigned int nr_entries,
+ xa_mark_t tag, unsigned int nr_entries,
struct page **entries, pgoff_t *indices);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 6dc456ac6136..081d934eda64 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -9,6 +9,8 @@
#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H
+#include <linux/xarray.h>
+
/* 15 pointers + header align the pagevec structure to a power of two */
#define PAGEVEC_SIZE 15
@@ -40,12 +42,12 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec,
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag);
+ xa_mark_t tag);
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag, unsigned max_pages);
+ xa_mark_t tag, unsigned max_pages);
static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, int tag)
+ struct address_space *mapping, pgoff_t *index, xa_mark_t tag)
{
return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag);
}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 34149e8b5f73..06c4c7a6c09c 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -28,34 +28,30 @@
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/xarray.h>
+
+/* Keep unconverted code working */
+#define radix_tree_root xarray
+#define radix_tree_node xa_node
/*
* The bottom two bits of the slot determine how the remaining bits in the
* slot are interpreted:
*
* 00 - data pointer
- * 01 - internal entry
- * 10 - exceptional entry
- * 11 - this bit combination is currently unused/reserved
+ * 10 - internal entry
+ * x1 - value entry
*
* The internal entry may be a pointer to the next level in the tree, a
* sibling entry, or an indicator that the entry in this slot has been moved
* to another location in the tree and the lookup should be restarted. While
* NULL fits the 'data pointer' pattern, it means that there is no entry in
* the tree for this index (no matter what level of the tree it is found at).
- * This means that you cannot store NULL in the tree as a value for the index.
+ * This means that storing a NULL entry in the tree is the same as deleting
+ * the entry from the tree.
*/
#define RADIX_TREE_ENTRY_MASK 3UL
-#define RADIX_TREE_INTERNAL_NODE 1UL
-
-/*
- * Most users of the radix tree store pointers but shmem/tmpfs stores swap
- * entries in the same tree. They are marked as exceptional entries to
- * distinguish them from pointers to struct page.
- * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it.
- */
-#define RADIX_TREE_EXCEPTIONAL_ENTRY 2
-#define RADIX_TREE_EXCEPTIONAL_SHIFT 2
+#define RADIX_TREE_INTERNAL_NODE 2UL
static inline bool radix_tree_is_internal_node(void *ptr)
{
@@ -65,75 +61,32 @@ static inline bool radix_tree_is_internal_node(void *ptr)
/*** radix-tree API starts here ***/
-#define RADIX_TREE_MAX_TAGS 3
-
-#ifndef RADIX_TREE_MAP_SHIFT
-#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
-#endif
-
+#define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
-#define RADIX_TREE_TAG_LONGS \
- ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define RADIX_TREE_MAX_TAGS XA_MAX_MARKS
+#define RADIX_TREE_TAG_LONGS XA_MARK_LONGS
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
-/*
- * @count is the count of every non-NULL element in the ->slots array
- * whether that is an exceptional entry, a retry entry, a user pointer,
- * a sibling entry or a pointer to the next level of the tree.
- * @exceptional is the count of every element in ->slots which is
- * either radix_tree_exceptional_entry() or is a sibling entry for an
- * exceptional entry.
- */
-struct radix_tree_node {
- unsigned char shift; /* Bits remaining in each slot */
- unsigned char offset; /* Slot offset in parent */
- unsigned char count; /* Total entry count */
- unsigned char exceptional; /* Exceptional entry count */
- struct radix_tree_node *parent; /* Used when ascending tree */
- struct radix_tree_root *root; /* The tree we belong to */
- union {
- struct list_head private_list; /* For tree user */
- struct rcu_head rcu_head; /* Used when freeing node */
- };
- void __rcu *slots[RADIX_TREE_MAP_SIZE];
- unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-};
-
-/* The IDR tag is stored in the low bits of the GFP flags */
+/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR ((__force gfp_t)4)
-/* The top bits of gfp_mask are used to store the root tags */
+/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT)
-struct radix_tree_root {
- spinlock_t xa_lock;
- gfp_t gfp_mask;
- struct radix_tree_node __rcu *rnode;
-};
-
-#define RADIX_TREE_INIT(name, mask) { \
- .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
- .gfp_mask = (mask), \
- .rnode = NULL, \
-}
+#define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask)
#define RADIX_TREE(name, mask) \
struct radix_tree_root name = RADIX_TREE_INIT(name, mask)
-#define INIT_RADIX_TREE(root, mask) \
-do { \
- spin_lock_init(&(root)->xa_lock); \
- (root)->gfp_mask = (mask); \
- (root)->rnode = NULL; \
-} while (0)
+#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
- return root->rnode == NULL;
+ return root->xa_head == NULL;
}
/**
@@ -143,7 +96,6 @@ static inline bool radix_tree_empty(const struct radix_tree_root *root)
* @next_index: one beyond the last index for this chunk
* @tags: bit-mask for tag-iterating
* @node: node that contains current slot
- * @shift: shift for the node that holds our slots
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
@@ -157,20 +109,8 @@ struct radix_tree_iter {
unsigned long next_index;
unsigned long tags;
struct radix_tree_node *node;
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- unsigned int shift;
-#endif
};
-static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
-{
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- return iter->shift;
-#else
- return 0;
-#endif
-}
-
/**
* Radix-tree synchronization
*
@@ -194,12 +134,11 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
* radix_tree_lookup_slot
* radix_tree_tag_get
* radix_tree_gang_lookup
- * radix_tree_gang_lookup_slot
* radix_tree_gang_lookup_tag
* radix_tree_gang_lookup_tag_slot
* radix_tree_tagged
*
- * The first 8 functions are able to be called locklessly, using RCU. The
+ * The first 7 functions are able to be called locklessly, using RCU. The
* caller must ensure calls to these functions are made within rcu_read_lock()
* regions. Other readers (lock-free or otherwise) and modifications may be
* running concurrently.
@@ -269,17 +208,6 @@ static inline int radix_tree_deref_retry(void *arg)
}
/**
- * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry?
- * @arg: value returned by radix_tree_deref_slot
- * Returns: 0 if well-aligned pointer, non-0 if exceptional entry.
- */
-static inline int radix_tree_exceptional_entry(void *arg)
-{
- /* Not unlikely because radix_tree_exception often tested first */
- return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
-}
-
-/**
* radix_tree_exception - radix_tree_deref_slot returned either exception?
* @arg: value returned by radix_tree_deref_slot
* Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
@@ -289,47 +217,28 @@ static inline int radix_tree_exception(void *arg)
return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}
-int __radix_tree_create(struct radix_tree_root *, unsigned long index,
- unsigned order, struct radix_tree_node **nodep,
- void __rcu ***slotp);
-int __radix_tree_insert(struct radix_tree_root *, unsigned long index,
- unsigned order, void *);
-static inline int radix_tree_insert(struct radix_tree_root *root,
- unsigned long index, void *entry)
-{
- return __radix_tree_insert(root, index, 0, entry);
-}
+int radix_tree_insert(struct radix_tree_root *, unsigned long index,
+ void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
unsigned long index);
-typedef void (*radix_tree_update_node_t)(struct radix_tree_node *);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
- void __rcu **slot, void *entry,
- radix_tree_update_node_t update_node);
+ void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
void __rcu **slot, void *entry);
-void __radix_tree_delete_node(struct radix_tree_root *,
- struct radix_tree_node *,
- radix_tree_update_node_t update_node);
void radix_tree_iter_delete(struct radix_tree_root *,
struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *,
- void __rcu **slot);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
void **results, unsigned long first_index,
unsigned int max_items);
-unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *,
- void __rcu ***results, unsigned long *indices,
- unsigned long first_index, unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
-int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
unsigned long index, unsigned int tag);
@@ -337,8 +246,6 @@ void *radix_tree_tag_clear(struct radix_tree_root *,
unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
unsigned long index, unsigned int tag);
-void radix_tree_iter_tag_set(struct radix_tree_root *,
- const struct radix_tree_iter *iter, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
@@ -354,12 +261,6 @@ static inline void radix_tree_preload_end(void)
preempt_enable();
}
-int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
-int radix_tree_split(struct radix_tree_root *, unsigned long index,
- unsigned new_order);
-int radix_tree_join(struct radix_tree_root *, unsigned long index,
- unsigned new_order, void *);
-
void __rcu **idr_get_free(struct radix_tree_root *root,
struct radix_tree_iter *iter, gfp_t gfp,
unsigned long max);
@@ -465,7 +366,7 @@ void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
- return iter->index + (slots << iter_shift(iter));
+ return iter->index + slots;
}
/**
@@ -490,21 +391,9 @@ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
- return (iter->next_index - iter->index) >> iter_shift(iter);
+ return iter->next_index - iter->index;
}
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-void __rcu **__radix_tree_next_slot(void __rcu **slot,
- struct radix_tree_iter *iter, unsigned flags);
-#else
-/* Can't happen without sibling entries, but the compiler can't tell that */
-static inline void __rcu **__radix_tree_next_slot(void __rcu **slot,
- struct radix_tree_iter *iter, unsigned flags)
-{
- return slot;
-}
-#endif
-
/**
* radix_tree_next_slot - find next slot in chunk
*
@@ -563,8 +452,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
return NULL;
found:
- if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot))))
- return __radix_tree_next_slot(slot, iter, flags);
return slot;
}
@@ -584,23 +471,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
slot = radix_tree_next_slot(slot, iter, 0))
/**
- * radix_tree_for_each_contig - iterate over contiguous slots
- *
- * @slot: the void** variable for pointer to slot
- * @root: the struct radix_tree_root pointer
- * @iter: the struct radix_tree_iter pointer
- * @start: iteration starting index
- *
- * @slot points to radix tree slot, @iter->index contains its index.
- */
-#define radix_tree_for_each_contig(slot, root, iter, start) \
- for (slot = radix_tree_iter_init(iter, start) ; \
- slot || (slot = radix_tree_next_chunk(root, iter, \
- RADIX_TREE_ITER_CONTIG)) ; \
- slot = radix_tree_next_slot(slot, iter, \
- RADIX_TREE_ITER_CONTIG))
-
-/**
* radix_tree_for_each_tagged - iterate over tagged slots
*
* @slot: the void** variable for pointer to slot
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38195f5c96b1..d8a07a4f171d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -300,17 +300,12 @@ void *workingset_eviction(struct address_space *mapping, struct page *page);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
-/* Do not use directly, use workingset_lookup_update */
-void workingset_update_node(struct radix_tree_node *node);
-
-/* Returns workingset_update_node() if the mapping has shadow entries. */
-#define workingset_lookup_update(mapping) \
-({ \
- radix_tree_update_node_t __helper = workingset_update_node; \
- if (dax_mapping(mapping) || shmem_mapping(mapping)) \
- __helper = NULL; \
- __helper; \
-})
+/* Only track the nodes of mappings with shadow entries */
+void workingset_update_node(struct xa_node *node);
+#define mapping_set_update(xas, mapping) do { \
+ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \
+ xas_set_update(xas, workingset_update_node); \
+} while (0)
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
@@ -409,7 +404,7 @@ extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *page);
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
-extern void __delete_from_swap_cache(struct page *);
+extern void __delete_from_swap_cache(struct page *, swp_entry_t entry);
extern void delete_from_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
@@ -563,7 +558,8 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
return -1;
}
-static inline void __delete_from_swap_cache(struct page *page)
+static inline void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry)
{
}
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 22af9d8a84ae..4d961668e5fc 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -18,9 +18,8 @@
*
* swp_entry_t's are *never* stored anywhere in their arch-dependent format.
*/
-#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \
- (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT))
-#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
+#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
/*
* Store a type+offset into a swp_entry_t in an arch-independent format
@@ -29,8 +28,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
swp_entry_t ret;
- ret.val = (type << SWP_TYPE_SHIFT(ret)) |
- (offset & SWP_OFFSET_MASK(ret));
+ ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK);
return ret;
}
@@ -40,7 +38,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
*/
static inline unsigned swp_type(swp_entry_t entry)
{
- return (entry.val >> SWP_TYPE_SHIFT(entry));
+ return (entry.val >> SWP_TYPE_SHIFT);
}
/*
@@ -49,7 +47,7 @@ static inline unsigned swp_type(swp_entry_t entry)
*/
static inline pgoff_t swp_offset(swp_entry_t entry)
{
- return entry.val & SWP_OFFSET_MASK(entry);
+ return entry.val & SWP_OFFSET_MASK;
}
#ifdef CONFIG_MMU
@@ -90,16 +88,13 @@ static inline swp_entry_t radix_to_swp_entry(void *arg)
{
swp_entry_t entry;
- entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+ entry.val = xa_to_value(arg);
return entry;
}
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
- unsigned long value;
-
- value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;
- return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ return xa_mk_value(entry.val);
}
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 2dfc8006fe64..d9514928ddac 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -4,10 +4,432 @@
/*
* eXtensible Arrays
* Copyright (c) 2017 Microsoft Corporation
- * Author: Matthew Wilcox <mawilcox@microsoft.com>
+ * Author: Matthew Wilcox <willy@infradead.org>
+ *
+ * See Documentation/core-api/xarray.rst for how to use the XArray.
*/
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
#include <linux/spinlock.h>
+#include <linux/types.h>
+
+/*
+ * The bottom two bits of the entry determine how the XArray interprets
+ * the contents:
+ *
+ * 00: Pointer entry
+ * 10: Internal entry
+ * x1: Value entry or tagged pointer
+ *
+ * Attempting to store internal entries in the XArray is a bug.
+ *
+ * Most internal entries are pointers to the next node in the tree.
+ * The following internal entries have a special meaning:
+ *
+ * 0-62: Sibling entries
+ * 256: Zero entry
+ * 257: Retry entry
+ *
+ * Errors are also represented as internal entries, but use the negative
+ * space (-4094 to -2). They're never stored in the slots array; only
+ * returned by the normal API.
+ */
+
+#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1)
+
+/**
+ * xa_mk_value() - Create an XArray entry from an integer.
+ * @v: Value to store in XArray.
+ *
+ * Context: Any context.
+ * Return: An entry suitable for storing in the XArray.
+ */
+static inline void *xa_mk_value(unsigned long v)
+{
+ WARN_ON((long)v < 0);
+ return (void *)((v << 1) | 1);
+}
+
+/**
+ * xa_to_value() - Get value stored in an XArray entry.
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: The value stored in the XArray entry.
+ */
+static inline unsigned long xa_to_value(const void *entry)
+{
+ return (unsigned long)entry >> 1;
+}
+
+/**
+ * xa_is_value() - Determine if an entry is a value.
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: True if the entry is a value, false if it is a pointer.
+ */
+static inline bool xa_is_value(const void *entry)
+{
+ return (unsigned long)entry & 1;
+}
+
+/**
+ * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
+ * @p: Plain pointer.
+ * @tag: Tag value (0, 1 or 3).
+ *
+ * If the user of the XArray prefers, they can tag their pointers instead
+ * of storing value entries. Three tags are available (0, 1 and 3).
+ * These are distinct from the xa_mark_t as they are not replicated up
+ * through the array and cannot be searched for.
+ *
+ * Context: Any context.
+ * Return: An XArray entry.
+ */
+static inline void *xa_tag_pointer(void *p, unsigned long tag)
+{
+ return (void *)((unsigned long)p | tag);
+}
+
+/**
+ * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
+ * @entry: XArray entry.
+ *
+ * If you have stored a tagged pointer in the XArray, call this function
+ * to get the untagged version of the pointer.
+ *
+ * Context: Any context.
+ * Return: A pointer.
+ */
+static inline void *xa_untag_pointer(void *entry)
+{
+ return (void *)((unsigned long)entry & ~3UL);
+}
+
+/**
+ * xa_pointer_tag() - Get the tag stored in an XArray entry.
+ * @entry: XArray entry.
+ *
+ * If you have stored a tagged pointer in the XArray, call this function
+ * to get the tag of that pointer.
+ *
+ * Context: Any context.
+ * Return: A tag.
+ */
+static inline unsigned int xa_pointer_tag(void *entry)
+{
+ return (unsigned long)entry & 3UL;
+}
+
+/*
+ * xa_mk_internal() - Create an internal entry.
+ * @v: Value to turn into an internal entry.
+ *
+ * Context: Any context.
+ * Return: An XArray internal entry corresponding to this value.
+ */
+static inline void *xa_mk_internal(unsigned long v)
+{
+ return (void *)((v << 2) | 2);
+}
+
+/*
+ * xa_to_internal() - Extract the value from an internal entry.
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: The value which was stored in the internal entry.
+ */
+static inline unsigned long xa_to_internal(const void *entry)
+{
+ return (unsigned long)entry >> 2;
+}
+
+/*
+ * xa_is_internal() - Is the entry an internal entry?
+ * @entry: XArray entry.
+ *
+ * Context: Any context.
+ * Return: %true if the entry is an internal entry.
+ */
+static inline bool xa_is_internal(const void *entry)
+{
+ return ((unsigned long)entry & 3) == 2;
+}
+
+/**
+ * xa_is_err() - Report whether an XArray operation returned an error
+ * @entry: Result from calling an XArray function
+ *
+ * If an XArray operation cannot complete an operation, it will return
+ * a special value indicating an error. This function tells you
+ * whether an error occurred; xa_err() tells you which error occurred.
+ *
+ * Context: Any context.
+ * Return: %true if the entry indicates an error.
+ */
+static inline bool xa_is_err(const void *entry)
+{
+ return unlikely(xa_is_internal(entry));
+}
+
+/**
+ * xa_err() - Turn an XArray result into an errno.
+ * @entry: Result from calling an XArray function.
+ *
+ * If an XArray operation cannot complete an operation, it will return
+ * a special pointer value which encodes an errno. This function extracts
+ * the errno from the pointer value, or returns 0 if the pointer does not
+ * represent an errno.
+ *
+ * Context: Any context.
+ * Return: A negative errno or 0.
+ */
+static inline int xa_err(void *entry)
+{
+ /* xa_to_internal() would not do sign extension. */
+ if (xa_is_err(entry))
+ return (long)entry >> 2;
+ return 0;
+}
+
+typedef unsigned __bitwise xa_mark_t;
+#define XA_MARK_0 ((__force xa_mark_t)0U)
+#define XA_MARK_1 ((__force xa_mark_t)1U)
+#define XA_MARK_2 ((__force xa_mark_t)2U)
+#define XA_PRESENT ((__force xa_mark_t)8U)
+#define XA_MARK_MAX XA_MARK_2
+#define XA_FREE_MARK XA_MARK_0
+
+enum xa_lock_type {
+ XA_LOCK_IRQ = 1,
+ XA_LOCK_BH = 2,
+};
+
+/*
+ * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags,
+ * and we remain compatible with that.
+ */
+#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ)
+#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH)
+#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U)
+#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
+ (__force unsigned)(mark)))
+
+#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
+
+/**
+ * struct xarray - The anchor of the XArray.
+ * @xa_lock: Lock that protects the contents of the XArray.
+ *
+ * To use the xarray, define it statically or embed it in your data structure.
+ * It is a very small data structure, so it does not usually make sense to
+ * allocate it separately and keep a pointer to it in your data structure.
+ *
+ * You may use the xa_lock to protect your own data structures as well.
+ */
+/*
+ * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
+ * If the only non-NULL entry in the array is at index 0, @xa_head is that
+ * entry. If any other entry in the array is non-NULL, @xa_head points
+ * to an @xa_node.
+ */
+struct xarray {
+ spinlock_t xa_lock;
+/* private: The rest of the data structure is not to be used directly. */
+ gfp_t xa_flags;
+ void __rcu * xa_head;
+};
+
+#define XARRAY_INIT(name, flags) { \
+ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
+ .xa_flags = flags, \
+ .xa_head = NULL, \
+}
+
+/**
+ * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
+ * @name: A string that names your XArray.
+ * @flags: XA_FLAG values.
+ *
+ * This is intended for file scope definitions of XArrays. It declares
+ * and initialises an empty XArray with the chosen name and flags. It is
+ * equivalent to calling xa_init_flags() on the array, but it does the
+ * initialisation at compiletime instead of runtime.
+ */
+#define DEFINE_XARRAY_FLAGS(name, flags) \
+ struct xarray name = XARRAY_INIT(name, flags)
+
+/**
+ * DEFINE_XARRAY() - Define an XArray.
+ * @name: A string that names your XArray.
+ *
+ * This is intended for file scope definitions of XArrays. It declares
+ * and initialises an empty XArray with the chosen name. It is equivalent
+ * to calling xa_init() on the array, but it does the initialisation at
+ * compiletime instead of runtime.
+ */
+#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)
+
+/**
+ * DEFINE_XARRAY_ALLOC() - Define an XArray which can allocate IDs.
+ * @name: A string that names your XArray.
+ *
+ * This is intended for file scope definitions of allocating XArrays.
+ * See also DEFINE_XARRAY().
+ */
+#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)
+
+void xa_init_flags(struct xarray *, gfp_t flags);
+void *xa_load(struct xarray *, unsigned long index);
+void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
+void *xa_cmpxchg(struct xarray *, unsigned long index,
+ void *old, void *entry, gfp_t);
+int xa_reserve(struct xarray *, unsigned long index, gfp_t);
+void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
+ void *entry, gfp_t);
+bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
+void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
+void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
+void *xa_find(struct xarray *xa, unsigned long *index,
+ unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
+void *xa_find_after(struct xarray *xa, unsigned long *index,
+ unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
+unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
+ unsigned long max, unsigned int n, xa_mark_t);
+void xa_destroy(struct xarray *);
+
+/**
+ * xa_init() - Initialise an empty XArray.
+ * @xa: XArray.
+ *
+ * An empty XArray is full of NULL entries.
+ *
+ * Context: Any context.
+ */
+static inline void xa_init(struct xarray *xa)
+{
+ xa_init_flags(xa, 0);
+}
+
+/**
+ * xa_empty() - Determine if an array has any present entries.
+ * @xa: XArray.
+ *
+ * Context: Any context.
+ * Return: %true if the array contains only NULL pointers.
+ */
+static inline bool xa_empty(const struct xarray *xa)
+{
+ return xa->xa_head == NULL;
+}
+
+/**
+ * xa_marked() - Inquire whether any entry in this array has a mark set
+ * @xa: Array
+ * @mark: Mark value
+ *
+ * Context: Any context.
+ * Return: %true if any entry has this mark set.
+ */
+static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
+{
+ return xa->xa_flags & XA_FLAGS_MARK(mark);
+}
+
+/**
+ * xa_erase() - Erase this entry from the XArray.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * This function is the equivalent of calling xa_store() with %NULL as
+ * the third argument. The XArray does not need to allocate memory, so
+ * the user does not need to provide GFP flags.
+ *
+ * Context: Process context. Takes and releases the xa_lock.
+ * Return: The entry which used to be at this index.
+ */
+static inline void *xa_erase(struct xarray *xa, unsigned long index)
+{
+ return xa_store(xa, index, NULL, 0);
+}
+
+/**
+ * xa_insert() - Store this entry in the XArray unless another entry is
+ * already present.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * If you would rather see the existing entry in the array, use xa_cmpxchg().
+ * This function is for users who don't care what the entry is, only that
+ * one is present.
+ *
+ * Context: Process context. Takes and releases the xa_lock.
+ * May sleep if the @gfp flags permit.
+ * Return: 0 if the store succeeded. -EEXIST if another entry was present.
+ * -ENOMEM if memory could not be allocated.
+ */
+static inline int xa_insert(struct xarray *xa, unsigned long index,
+ void *entry, gfp_t gfp)
+{
+ void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp);
+ if (!curr)
+ return 0;
+ if (xa_is_err(curr))
+ return xa_err(curr);
+ return -EEXIST;
+}
+
+/**
+ * xa_release() - Release a reserved entry.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * After calling xa_reserve(), you can call this function to release the
+ * reservation. If the entry at @index has been stored to, this function
+ * will do nothing.
+ */
+static inline void xa_release(struct xarray *xa, unsigned long index)
+{
+ xa_cmpxchg(xa, index, NULL, NULL, 0);
+}
+
+/**
+ * xa_for_each() - Iterate over a portion of an XArray.
+ * @xa: XArray.
+ * @entry: Entry retrieved from array.
+ * @index: Index of @entry.
+ * @max: Maximum index to retrieve from array.
+ * @filter: Selection criterion.
+ *
+ * Initialise @index to the lowest index you want to retrieve from the
+ * array. During the iteration, @entry will have the value of the entry
+ * stored in @xa at @index. The iteration will skip all entries in the
+ * array which do not match @filter. You may modify @index during the
+ * iteration if you want to skip or reprocess indices. It is safe to modify
+ * the array during the iteration. At the end of the iteration, @entry will
+ * be set to NULL and @index will have a value less than or equal to max.
+ *
+ * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have
+ * to handle your own locking with xas_for_each(), and if you have to unlock
+ * after each iteration, it will also end up being O(n.log(n)). xa_for_each()
+ * will spin if it hits a retry entry; if you intend to see retry entries,
+ * you should use the xas_for_each() iterator instead. The xas_for_each()
+ * iterator will expand into more inline code than xa_for_each().
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ */
+#define xa_for_each(xa, entry, index, max, filter) \
+ for (entry = xa_find(xa, &index, max, filter); entry; \
+ entry = xa_find_after(xa, &index, max, filter))
#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
@@ -21,4 +443,873 @@
#define xa_unlock_irqrestore(xa, flags) \
spin_unlock_irqrestore(&(xa)->xa_lock, flags)
+/*
+ * Versions of the normal API which require the caller to hold the
+ * xa_lock. If the GFP flags allow it, they will drop the lock to
+ * allocate memory, then reacquire it afterwards. These functions
+ * may also re-enable interrupts if the XArray flags indicate the
+ * locking should be interrupt safe.
+ */
+void *__xa_erase(struct xarray *, unsigned long index);
+void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
+void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
+ void *entry, gfp_t);
+int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t);
+void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
+void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
+
+/**
+ * __xa_insert() - Store this entry in the XArray unless another entry is
+ * already present.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * If you would rather see the existing entry in the array, use __xa_cmpxchg().
+ * This function is for users who don't care what the entry is, only that
+ * one is present.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry. May
+ * release and reacquire xa_lock if the @gfp flags permit.
+ * Return: 0 if the store succeeded. -EEXIST if another entry was present.
+ * -ENOMEM if memory could not be allocated.
+ */
+static inline int __xa_insert(struct xarray *xa, unsigned long index,
+ void *entry, gfp_t gfp)
+{
+ void *curr = __xa_cmpxchg(xa, index, NULL, entry, gfp);
+ if (!curr)
+ return 0;
+ if (xa_is_err(curr))
+ return xa_err(curr);
+ return -EEXIST;
+}
+
+/**
+ * xa_erase_bh() - Erase this entry from the XArray.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * This function is the equivalent of calling xa_store() with %NULL as
+ * the third argument. The XArray does not need to allocate memory, so
+ * the user does not need to provide GFP flags.
+ *
+ * Context: Process context. Takes and releases the xa_lock while
+ * disabling softirqs.
+ * Return: The entry which used to be at this index.
+ */
+static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
+{
+ void *entry;
+
+ xa_lock_bh(xa);
+ entry = __xa_erase(xa, index);
+ xa_unlock_bh(xa);
+
+ return entry;
+}
+
+/**
+ * xa_erase_irq() - Erase this entry from the XArray.
+ * @xa: XArray.
+ * @index: Index of entry.
+ *
+ * This function is the equivalent of calling xa_store() with %NULL as
+ * the third argument. The XArray does not need to allocate memory, so
+ * the user does not need to provide GFP flags.
+ *
+ * Context: Process context. Takes and releases the xa_lock while
+ * disabling interrupts.
+ * Return: The entry which used to be at this index.
+ */
+static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
+{
+ void *entry;
+
+ xa_lock_irq(xa);
+ entry = __xa_erase(xa, index);
+ xa_unlock_irq(xa);
+
+ return entry;
+}
+
+/**
+ * xa_alloc() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @max: Maximum ID to allocate (inclusive).
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range specified by @id and @max.
+ * Updates the @id pointer with the index, then stores the entry at that
+ * index. A concurrent lookup will not see an uninitialised @id.
+ *
+ * Context: Process context. Takes and releases the xa_lock. May sleep if
+ * the @gfp flags permit.
+ * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
+ * there is no more space in the XArray.
+ */
+static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry,
+ gfp_t gfp)
+{
+ int err;
+
+ xa_lock(xa);
+ err = __xa_alloc(xa, id, max, entry, gfp);
+ xa_unlock(xa);
+
+ return err;
+}
+
+/**
+ * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @max: Maximum ID to allocate (inclusive).
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range specified by @id and @max.
+ * Updates the @id pointer with the index, then stores the entry at that
+ * index. A concurrent lookup will not see an uninitialised @id.
+ *
+ * Context: Process context. Takes and releases the xa_lock while
+ * disabling softirqs. May sleep if the @gfp flags permit.
+ * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
+ * there is no more space in the XArray.
+ */
+static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry,
+ gfp_t gfp)
+{
+ int err;
+
+ xa_lock_bh(xa);
+ err = __xa_alloc(xa, id, max, entry, gfp);
+ xa_unlock_bh(xa);
+
+ return err;
+}
+
+/**
+ * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @max: Maximum ID to allocate (inclusive).
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range specified by @id and @max.
+ * Updates the @id pointer with the index, then stores the entry at that
+ * index. A concurrent lookup will not see an uninitialised @id.
+ *
+ * Context: Process context. Takes and releases the xa_lock while
+ * disabling interrupts. May sleep if the @gfp flags permit.
+ * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
+ * there is no more space in the XArray.
+ */
+static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry,
+ gfp_t gfp)
+{
+ int err;
+
+ xa_lock_irq(xa);
+ err = __xa_alloc(xa, id, max, entry, gfp);
+ xa_unlock_irq(xa);
+
+ return err;
+}
+
+/* Everything below here is the Advanced API. Proceed with caution. */
+
+/*
+ * The xarray is constructed out of a set of 'chunks' of pointers. Choosing
+ * the best chunk size requires some tradeoffs. A power of two recommends
+ * itself so that we can walk the tree based purely on shifts and masks.
+ * Generally, the larger the better; as the number of slots per level of the
+ * tree increases, the less tall the tree needs to be. But that needs to be
+ * balanced against the memory consumption of each node. On a 64-bit system,
+ * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we
+ * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
+ */
+#ifndef XA_CHUNK_SHIFT
+#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
+#endif
+#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
+#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)
+#define XA_MAX_MARKS 3
+#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
+
+/*
+ * @count is the count of every non-NULL element in the ->slots array
+ * whether that is a value entry, a retry entry, a user pointer,
+ * a sibling entry or a pointer to the next level of the tree.
+ * @nr_values is the count of every element in ->slots which is
+ * either a value entry or a sibling of a value entry.
+ */
+struct xa_node {
+ unsigned char shift; /* Bits remaining in each slot */
+ unsigned char offset; /* Slot offset in parent */
+ unsigned char count; /* Total entry count */
+ unsigned char nr_values; /* Value entry count */
+ struct xa_node __rcu *parent; /* NULL at top of tree */
+ struct xarray *array; /* The array we belong to */
+ union {
+ struct list_head private_list; /* For tree user */
+ struct rcu_head rcu_head; /* Used when freeing node */
+ };
+ void __rcu *slots[XA_CHUNK_SIZE];
+ union {
+ unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];
+ unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];
+ };
+};
+
+void xa_dump(const struct xarray *);
+void xa_dump_node(const struct xa_node *);
+
+#ifdef XA_DEBUG
+#define XA_BUG_ON(xa, x) do { \
+ if (x) { \
+ xa_dump(xa); \
+ BUG(); \
+ } \
+ } while (0)
+#define XA_NODE_BUG_ON(node, x) do { \
+ if (x) { \
+ if (node) xa_dump_node(node); \
+ BUG(); \
+ } \
+ } while (0)
+#else
+#define XA_BUG_ON(xa, x) do { } while (0)
+#define XA_NODE_BUG_ON(node, x) do { } while (0)
+#endif
+
+/* Private */
+static inline void *xa_head(const struct xarray *xa)
+{
+ return rcu_dereference_check(xa->xa_head,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_head_locked(const struct xarray *xa)
+{
+ return rcu_dereference_protected(xa->xa_head,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_entry(const struct xarray *xa,
+ const struct xa_node *node, unsigned int offset)
+{
+ XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
+ return rcu_dereference_check(node->slots[offset],
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_entry_locked(const struct xarray *xa,
+ const struct xa_node *node, unsigned int offset)
+{
+ XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
+ return rcu_dereference_protected(node->slots[offset],
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline struct xa_node *xa_parent(const struct xarray *xa,
+ const struct xa_node *node)
+{
+ return rcu_dereference_check(node->parent,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
+ const struct xa_node *node)
+{
+ return rcu_dereference_protected(node->parent,
+ lockdep_is_held(&xa->xa_lock));
+}
+
+/* Private */
+static inline void *xa_mk_node(const struct xa_node *node)
+{
+ return (void *)((unsigned long)node | 2);
+}
+
+/* Private */
+static inline struct xa_node *xa_to_node(const void *entry)
+{
+ return (struct xa_node *)((unsigned long)entry - 2);
+}
+
+/* Private */
+static inline bool xa_is_node(const void *entry)
+{
+ return xa_is_internal(entry) && (unsigned long)entry > 4096;
+}
+
+/* Private */
+static inline void *xa_mk_sibling(unsigned int offset)
+{
+ return xa_mk_internal(offset);
+}
+
+/* Private */
+static inline unsigned long xa_to_sibling(const void *entry)
+{
+ return xa_to_internal(entry);
+}
+
+/**
+ * xa_is_sibling() - Is the entry a sibling entry?
+ * @entry: Entry retrieved from the XArray
+ *
+ * Return: %true if the entry is a sibling entry.
+ */
+static inline bool xa_is_sibling(const void *entry)
+{
+ return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
+ (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
+}
+
+#define XA_ZERO_ENTRY xa_mk_internal(256)
+#define XA_RETRY_ENTRY xa_mk_internal(257)
+
+/**
+ * xa_is_zero() - Is the entry a zero entry?
+ * @entry: Entry retrieved from the XArray
+ *
+ * Return: %true if the entry is a zero entry.
+ */
+static inline bool xa_is_zero(const void *entry)
+{
+ return unlikely(entry == XA_ZERO_ENTRY);
+}
+
+/**
+ * xa_is_retry() - Is the entry a retry entry?
+ * @entry: Entry retrieved from the XArray
+ *
+ * Return: %true if the entry is a retry entry.
+ */
+static inline bool xa_is_retry(const void *entry)
+{
+ return unlikely(entry == XA_RETRY_ENTRY);
+}
+
+/**
+ * typedef xa_update_node_t - A callback function from the XArray.
+ * @node: The node which is being processed
+ *
+ * This function is called every time the XArray updates the count of
+ * present and value entries in a node. It allows advanced users to
+ * maintain the private_list in the node.
+ *
+ * Context: The xa_lock is held and interrupts may be disabled.
+ * Implementations should not drop the xa_lock, nor re-enable
+ * interrupts.
+ */
+typedef void (*xa_update_node_t)(struct xa_node *node);
+
+/*
+ * The xa_state is opaque to its users. It contains various different pieces
+ * of state involved in the current operation on the XArray. It should be
+ * declared on the stack and passed between the various internal routines.
+ * The various elements in it should not be accessed directly, but only
+ * through the provided accessor functions. The below documentation is for
+ * the benefit of those working on the code, not for users of the XArray.
+ *
+ * @xa_node usually points to the xa_node containing the slot we're operating
+ * on (and @xa_offset is the offset in the slots array). If there is a
+ * single entry in the array at index 0, there are no allocated xa_nodes to
+ * point to, and so we store %NULL in @xa_node. @xa_node is set to
+ * the value %XAS_RESTART if the xa_state is not walked to the correct
+ * position in the tree of nodes for this operation. If an error occurs
+ * during an operation, it is set to an %XAS_ERROR value. If we run off the
+ * end of the allocated nodes, it is set to %XAS_BOUNDS.
+ */
+struct xa_state {
+ struct xarray *xa;
+ unsigned long xa_index;
+ unsigned char xa_shift;
+ unsigned char xa_sibs;
+ unsigned char xa_offset;
+ unsigned char xa_pad; /* Helps gcc generate better code */
+ struct xa_node *xa_node;
+ struct xa_node *xa_alloc;
+ xa_update_node_t xa_update;
+};
+
+/*
+ * We encode errnos in the xas->xa_node. If an error has happened, we need to
+ * drop the lock to fix it, and once we've done so the xa_state is invalid.
+ */
+#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL))
+#define XAS_BOUNDS ((struct xa_node *)1UL)
+#define XAS_RESTART ((struct xa_node *)3UL)
+
+#define __XA_STATE(array, index, shift, sibs) { \
+ .xa = array, \
+ .xa_index = index, \
+ .xa_shift = shift, \
+ .xa_sibs = sibs, \
+ .xa_offset = 0, \
+ .xa_pad = 0, \
+ .xa_node = XAS_RESTART, \
+ .xa_alloc = NULL, \
+ .xa_update = NULL \
+}
+
+/**
+ * XA_STATE() - Declare an XArray operation state.
+ * @name: Name of this operation state (usually xas).
+ * @array: Array to operate on.
+ * @index: Initial index of interest.
+ *
+ * Declare and initialise an xa_state on the stack.
+ */
+#define XA_STATE(name, array, index) \
+ struct xa_state name = __XA_STATE(array, index, 0, 0)
+
+/**
+ * XA_STATE_ORDER() - Declare an XArray operation state.
+ * @name: Name of this operation state (usually xas).
+ * @array: Array to operate on.
+ * @index: Initial index of interest.
+ * @order: Order of entry.
+ *
+ * Declare and initialise an xa_state on the stack. This variant of
+ * XA_STATE() allows you to specify the 'order' of the element you
+ * want to operate on.`
+ */
+#define XA_STATE_ORDER(name, array, index, order) \
+ struct xa_state name = __XA_STATE(array, \
+ (index >> order) << order, \
+ order - (order % XA_CHUNK_SHIFT), \
+ (1U << (order % XA_CHUNK_SHIFT)) - 1)
+
+#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark))
+#define xas_trylock(xas) xa_trylock((xas)->xa)
+#define xas_lock(xas) xa_lock((xas)->xa)
+#define xas_unlock(xas) xa_unlock((xas)->xa)
+#define xas_lock_bh(xas) xa_lock_bh((xas)->xa)
+#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa)
+#define xas_lock_irq(xas) xa_lock_irq((xas)->xa)
+#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa)
+#define xas_lock_irqsave(xas, flags) \
+ xa_lock_irqsave((xas)->xa, flags)
+#define xas_unlock_irqrestore(xas, flags) \
+ xa_unlock_irqrestore((xas)->xa, flags)
+
+/**
+ * xas_error() - Return an errno stored in the xa_state.
+ * @xas: XArray operation state.
+ *
+ * Return: 0 if no error has been noted. A negative errno if one has.
+ */
+static inline int xas_error(const struct xa_state *xas)
+{
+ return xa_err(xas->xa_node);
+}
+
+/**
+ * xas_set_err() - Note an error in the xa_state.
+ * @xas: XArray operation state.
+ * @err: Negative error number.
+ *
+ * Only call this function with a negative @err; zero or positive errors
+ * will probably not behave the way you think they should. If you want
+ * to clear the error from an xa_state, use xas_reset().
+ */
+static inline void xas_set_err(struct xa_state *xas, long err)
+{
+ xas->xa_node = XA_ERROR(err);
+}
+
+/**
+ * xas_invalid() - Is the xas in a retry or error state?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas cannot be used for operations.
+ */
+static inline bool xas_invalid(const struct xa_state *xas)
+{
+ return (unsigned long)xas->xa_node & 3;
+}
+
+/**
+ * xas_valid() - Is the xas a valid cursor into the array?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas can be used for operations.
+ */
+static inline bool xas_valid(const struct xa_state *xas)
+{
+ return !xas_invalid(xas);
+}
+
+/**
+ * xas_is_node() - Does the xas point to a node?
+ * @xas: XArray operation state.
+ *
+ * Return: %true if the xas currently references a node.
+ */
+static inline bool xas_is_node(const struct xa_state *xas)
+{
+ return xas_valid(xas) && xas->xa_node;
+}
+
+/* True if the pointer is something other than a node */
+static inline bool xas_not_node(struct xa_node *node)
+{
+ return ((unsigned long)node & 3) || !node;
+}
+
+/* True if the node represents RESTART or an error */
+static inline bool xas_frozen(struct xa_node *node)
+{
+ return (unsigned long)node & 2;
+}
+
+/* True if the node represents head-of-tree, RESTART or BOUNDS */
+static inline bool xas_top(struct xa_node *node)
+{
+ return node <= XAS_RESTART;
+}
+
+/**
+ * xas_reset() - Reset an XArray operation state.
+ * @xas: XArray operation state.
+ *
+ * Resets the error or walk state of the @xas so future walks of the
+ * array will start from the root. Use this if you have dropped the
+ * xarray lock and want to reuse the xa_state.
+ *
+ * Context: Any context.
+ */
+static inline void xas_reset(struct xa_state *xas)
+{
+ xas->xa_node = XAS_RESTART;
+}
+
+/**
+ * xas_retry() - Retry the operation if appropriate.
+ * @xas: XArray operation state.
+ * @entry: Entry from xarray.
+ *
+ * The advanced functions may sometimes return an internal entry, such as
+ * a retry entry or a zero entry. This function sets up the @xas to restart
+ * the walk from the head of the array if needed.
+ *
+ * Context: Any context.
+ * Return: true if the operation needs to be retried.
+ */
+static inline bool xas_retry(struct xa_state *xas, const void *entry)
+{
+ if (xa_is_zero(entry))
+ return true;
+ if (!xa_is_retry(entry))
+ return false;
+ xas_reset(xas);
+ return true;
+}
+
+void *xas_load(struct xa_state *);
+void *xas_store(struct xa_state *, void *entry);
+void *xas_find(struct xa_state *, unsigned long max);
+void *xas_find_conflict(struct xa_state *);
+
+bool xas_get_mark(const struct xa_state *, xa_mark_t);
+void xas_set_mark(const struct xa_state *, xa_mark_t);
+void xas_clear_mark(const struct xa_state *, xa_mark_t);
+void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
+void xas_init_marks(const struct xa_state *);
+
+bool xas_nomem(struct xa_state *, gfp_t);
+void xas_pause(struct xa_state *);
+
+void xas_create_range(struct xa_state *);
+
+/**
+ * xas_reload() - Refetch an entry from the xarray.
+ * @xas: XArray operation state.
+ *
+ * Use this function to check that a previously loaded entry still has
+ * the same value. This is useful for the lockless pagecache lookup where
+ * we walk the array with only the RCU lock to protect us, lock the page,
+ * then check that the page hasn't moved since we looked it up.
+ *
+ * The caller guarantees that @xas is still valid. If it may be in an
+ * error or restart state, call xas_load() instead.
+ *
+ * Return: The entry at this location in the xarray.
+ */
+static inline void *xas_reload(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ if (node)
+ return xa_entry(xas->xa, node, xas->xa_offset);
+ return xa_head(xas->xa);
+}
+
+/**
+ * xas_set() - Set up XArray operation state for a different index.
+ * @xas: XArray operation state.
+ * @index: New index into the XArray.
+ *
+ * Move the operation state to refer to a different index. This will
+ * have the effect of starting a walk from the top; see xas_next()
+ * to move to an adjacent index.
+ */
+static inline void xas_set(struct xa_state *xas, unsigned long index)
+{
+ xas->xa_index = index;
+ xas->xa_node = XAS_RESTART;
+}
+
+/**
+ * xas_set_order() - Set up XArray operation state for a multislot entry.
+ * @xas: XArray operation state.
+ * @index: Target of the operation.
+ * @order: Entry occupies 2^@order indices.
+ */
+static inline void xas_set_order(struct xa_state *xas, unsigned long index,
+ unsigned int order)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0;
+ xas->xa_shift = order - (order % XA_CHUNK_SHIFT);
+ xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
+ xas->xa_node = XAS_RESTART;
+#else
+ BUG_ON(order > 0);
+ xas_set(xas, index);
+#endif
+}
+
+/**
+ * xas_set_update() - Set up XArray operation state for a callback.
+ * @xas: XArray operation state.
+ * @update: Function to call when updating a node.
+ *
+ * The XArray can notify a caller after it has updated an xa_node.
+ * This is advanced functionality and is only needed by the page cache.
+ */
+static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
+{
+ xas->xa_update = update;
+}
+
+/**
+ * xas_next_entry() - Advance iterator to next present entry.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ *
+ * xas_next_entry() is an inline function to optimise xarray traversal for
+ * speed. It is equivalent to calling xas_find(), and will call xas_find()
+ * for all the hard cases.
+ *
+ * Return: The next present entry after the one currently referred to by @xas.
+ */
+static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
+{
+ struct xa_node *node = xas->xa_node;
+ void *entry;
+
+ if (unlikely(xas_not_node(node) || node->shift ||
+ xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
+ return xas_find(xas, max);
+
+ do {
+ if (unlikely(xas->xa_index >= max))
+ return xas_find(xas, max);
+ if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
+ return xas_find(xas, max);
+ entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
+ if (unlikely(xa_is_internal(entry)))
+ return xas_find(xas, max);
+ xas->xa_offset++;
+ xas->xa_index++;
+ } while (!entry);
+
+ return entry;
+}
+
+/* Private */
+static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
+ xa_mark_t mark)
+{
+ unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark];
+ unsigned int offset = xas->xa_offset;
+
+ if (advance)
+ offset++;
+ if (XA_CHUNK_SIZE == BITS_PER_LONG) {
+ if (offset < XA_CHUNK_SIZE) {
+ unsigned long data = *addr & (~0UL << offset);
+ if (data)
+ return __ffs(data);
+ }
+ return XA_CHUNK_SIZE;
+ }
+
+ return find_next_bit(addr, XA_CHUNK_SIZE, offset);
+}
+
+/**
+ * xas_next_marked() - Advance iterator to next marked entry.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ * @mark: Mark to search for.
+ *
+ * xas_next_marked() is an inline function to optimise xarray traversal for
+ * speed. It is equivalent to calling xas_find_marked(), and will call
+ * xas_find_marked() for all the hard cases.
+ *
+ * Return: The next marked entry after the one currently referred to by @xas.
+ */
+static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
+ xa_mark_t mark)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset;
+
+ if (unlikely(xas_not_node(node) || node->shift))
+ return xas_find_marked(xas, max, mark);
+ offset = xas_find_chunk(xas, true, mark);
+ xas->xa_offset = offset;
+ xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
+ if (xas->xa_index > max)
+ return NULL;
+ if (offset == XA_CHUNK_SIZE)
+ return xas_find_marked(xas, max, mark);
+ return xa_entry(xas->xa, node, offset);
+}
+
+/*
+ * If iterating while holding a lock, drop the lock and reschedule
+ * every %XA_CHECK_SCHED loops.
+ */
+enum {
+ XA_CHECK_SCHED = 4096,
+};
+
+/**
+ * xas_for_each() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ * @max: Maximum index to retrieve from array.
+ *
+ * The loop body will be executed for each entry present in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body. You should hold either the RCU lock
+ * or the xa_lock while iterating. If you need to drop the lock, call
+ * xas_pause() first.
+ */
+#define xas_for_each(xas, entry, max) \
+ for (entry = xas_find(xas, max); entry; \
+ entry = xas_next_entry(xas, max))
+
+/**
+ * xas_for_each_marked() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ * @max: Maximum index to retrieve from array.
+ * @mark: Mark to search for.
+ *
+ * The loop body will be executed for each marked entry in the xarray
+ * between the current xas position and @max. @entry will be set to
+ * the entry retrieved from the xarray. It is safe to delete entries
+ * from the array in the loop body. You should hold either the RCU lock
+ * or the xa_lock while iterating. If you need to drop the lock, call
+ * xas_pause() first.
+ */
+#define xas_for_each_marked(xas, entry, max, mark) \
+ for (entry = xas_find_marked(xas, max, mark); entry; \
+ entry = xas_next_marked(xas, max, mark))
+
+/**
+ * xas_for_each_conflict() - Iterate over a range of an XArray.
+ * @xas: XArray operation state.
+ * @entry: Entry retrieved from the array.
+ *
+ * The loop body will be executed for each entry in the XArray that lies
+ * within the range specified by @xas. If the loop completes successfully,
+ * any entries that lie in this range will be replaced by @entry. The caller
+ * may break out of the loop; if they do so, the contents of the XArray will
+ * be unchanged. The operation may fail due to an out of memory condition.
+ * The caller may also call xa_set_err() to exit the loop while setting an
+ * error to record the reason.
+ */
+#define xas_for_each_conflict(xas, entry) \
+ while ((entry = xas_find_conflict(xas)))
+
+void *__xas_next(struct xa_state *);
+void *__xas_prev(struct xa_state *);
+
+/**
+ * xas_prev() - Move iterator to previous index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL. If the @xas has never been walked,
+ * it will have the effect of calling xas_load(). Otherwise one will be
+ * subtracted from the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index 0, this function wraps
+ * around to %ULONG_MAX.
+ *
+ * Return: The entry at the new index. This may be %NULL or an internal
+ * entry.
+ */
+static inline void *xas_prev(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ if (unlikely(xas_not_node(node) || node->shift ||
+ xas->xa_offset == 0))
+ return __xas_prev(xas);
+
+ xas->xa_index--;
+ xas->xa_offset--;
+ return xa_entry(xas->xa, node, xas->xa_offset);
+}
+
+/**
+ * xas_next() - Move state to next index.
+ * @xas: XArray operation state.
+ *
+ * If the @xas was in an error state, it will remain in an error state
+ * and this function will return %NULL. If the @xas has never been walked,
+ * it will have the effect of calling xas_load(). Otherwise one will be
+ * added to the index and the state will be walked to the correct
+ * location in the array for the next operation.
+ *
+ * If the iterator was referencing index %ULONG_MAX, this function wraps
+ * around to 0.
+ *
+ * Return: The entry at the new index. This may be %NULL or an internal
+ * entry.
+ */
+static inline void *xas_next(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ if (unlikely(xas_not_node(node) || node->shift ||
+ xas->xa_offset == XA_CHUNK_MASK))
+ return __xas_next(xas);
+
+ xas->xa_index++;
+ xas->xa_offset++;
+ return xa_entry(xas->xa, node, xas->xa_offset);
+}
+
#endif /* _LINUX_XARRAY_H */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 620fc4d2559a..9eced2cc9f94 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -1,47 +1,21 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
-#include <linux/radix-tree.h>
#include <linux/device.h>
-#include <linux/types.h>
-#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/kasan.h>
-#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/types.h>
#include <linux/wait_bit.h>
+#include <linux/xarray.h>
-static DEFINE_MUTEX(pgmap_lock);
-static RADIX_TREE(pgmap_radix, GFP_KERNEL);
+static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-static unsigned long order_at(struct resource *res, unsigned long pgoff)
-{
- unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
- unsigned long nr_pages, mask;
-
- nr_pages = PHYS_PFN(resource_size(res));
- if (nr_pages == pgoff)
- return ULONG_MAX;
-
- /*
- * What is the largest aligned power-of-2 range available from
- * this resource pgoff to the end of the resource range,
- * considering the alignment of the current pgoff?
- */
- mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
- if (!mask)
- return ULONG_MAX;
-
- return find_first_bit(&mask, BITS_PER_LONG);
-}
-
-#define foreach_order_pgoff(res, order, pgoff) \
- for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
- pgoff += 1UL << order, order = order_at((res), pgoff))
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
@@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
-static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
+static void pgmap_array_delete(struct resource *res)
{
- unsigned long pgoff, order;
-
- mutex_lock(&pgmap_lock);
- foreach_order_pgoff(res, order, pgoff) {
- if (pgoff >= end_pgoff)
- break;
- radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
- }
- mutex_unlock(&pgmap_lock);
-
+ xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ NULL, GFP_KERNEL);
synchronize_rcu();
}
@@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data)
mem_hotplug_done();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_radix_release(res, -1);
+ pgmap_array_delete(res);
dev_WARN_ONCE(dev, pgmap->altmap.alloc,
"%s: failed to free all reserved pages\n", __func__);
}
@@ -177,7 +143,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
pgprot_t pgprot = PAGE_KERNEL;
- unsigned long pgoff, order;
int error, nid, is_ram;
align_start = res->start & ~(SECTION_SIZE - 1);
@@ -216,20 +181,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgmap->dev = dev;
- mutex_lock(&pgmap_lock);
- error = 0;
-
- foreach_order_pgoff(res, order, pgoff) {
- error = __radix_tree_insert(&pgmap_radix,
- PHYS_PFN(res->start) + pgoff, order, pgmap);
- if (error) {
- dev_err(dev, "%s: failed: %d\n", __func__, error);
- break;
- }
- }
- mutex_unlock(&pgmap_lock);
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
+ PHYS_PFN(res->end), pgmap, GFP_KERNEL));
if (error)
- goto err_radix;
+ goto err_array;
nid = dev_to_node(dev);
if (nid < 0)
@@ -274,8 +229,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_kasan:
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
err_pfn_remap:
- err_radix:
- pgmap_radix_release(res, pgoff);
+ pgmap_array_delete(res);
+ err_array:
return ERR_PTR(error);
}
EXPORT_SYMBOL(devm_memremap_pages);
@@ -315,7 +270,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
- pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
diff --git a/lib/Kconfig b/lib/Kconfig
index d82f20609939..d1573a16aa92 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -399,8 +399,11 @@ config INTERVAL_TREE
for more information.
-config RADIX_TREE_MULTIORDER
+config XARRAY_MULTI
bool
+ help
+ Support entries which occupy multiple consecutive indices in the
+ XArray.
config ASSOCIATIVE_ARRAY
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 04adfc3b185e..e0ba05e6f6bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1813,6 +1813,9 @@ config TEST_BITFIELD
config TEST_UUID
tristate "Test functions located in the uuid module at runtime"
+config TEST_XARRAY
+ tristate "Test the XArray code at runtime"
+
config TEST_OVERFLOW
tristate "Test check_*_overflow() functions at runtime"
diff --git a/lib/Makefile b/lib/Makefile
index fa3eb1b4c0e3..3d341f59f756 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -18,7 +18,7 @@ KCOV_INSTRUMENT_debugobjects.o := n
KCOV_INSTRUMENT_dynamic_debug.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o timerqueue.o\
+ rbtree.o radix-tree.o timerqueue.o xarray.o \
idr.o int_sqrt.o extable.o \
sha1.o chacha20.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
@@ -68,6 +68,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o
obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o
obj-$(CONFIG_TEST_UUID) += test_uuid.o
+obj-$(CONFIG_TEST_XARRAY) += test_xarray.o
obj-$(CONFIG_TEST_PARMAN) += test_parman.o
obj-$(CONFIG_TEST_KMOD) += test_kmod.o
obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
diff --git a/lib/idr.c b/lib/idr.c
index fab2fd5bc326..cb1db9b8d3f6 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -6,8 +6,6 @@
#include <linux/spinlock.h>
#include <linux/xarray.h>
-DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap);
-
/**
* idr_alloc_u32() - Allocate an ID.
* @idr: IDR handle.
@@ -39,10 +37,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
unsigned int base = idr->idr_base;
unsigned int id = *nextid;
- if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
- return -EINVAL;
- if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR)))
- idr->idr_rt.gfp_mask |= IDR_RT_MARKER;
+ if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
+ idr->idr_rt.xa_flags |= IDR_RT_MARKER;
id = (id < base) ? 0 : id - base;
radix_tree_iter_init(&iter, id);
@@ -295,15 +291,13 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
void __rcu **slot = NULL;
void *entry;
- if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
- return ERR_PTR(-EINVAL);
id -= idr->idr_base;
entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
return ERR_PTR(-ENOENT);
- __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL);
+ __radix_tree_replace(&idr->idr_rt, node, slot, ptr);
return entry;
}
@@ -324,6 +318,9 @@ EXPORT_SYMBOL(idr_replace);
* free the individual IDs in it. You can use ida_is_empty() to find
* out whether the IDA has any IDs currently allocated.
*
+ * The IDA handles its own locking. It is safe to call any of the IDA
+ * functions without synchronisation in your code.
+ *
* IDs are currently limited to the range [0-INT_MAX]. If this is an awkward
* limitation, it should be quite straightforward to raise the maximum.
*/
@@ -331,161 +328,197 @@ EXPORT_SYMBOL(idr_replace);
/*
* Developer's notes:
*
- * The IDA uses the functionality provided by the IDR & radix tree to store
- * bitmaps in each entry. The IDR_FREE tag means there is at least one bit
- * free, unlike the IDR where it means at least one entry is free.
+ * The IDA uses the functionality provided by the XArray to store bitmaps in
+ * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap
+ * have been set.
*
- * I considered telling the radix tree that each slot is an order-10 node
- * and storing the bit numbers in the radix tree, but the radix tree can't
- * allow a single multiorder entry at index 0, which would significantly
- * increase memory consumption for the IDA. So instead we divide the index
- * by the number of bits in the leaf bitmap before doing a radix tree lookup.
+ * I considered telling the XArray that each slot is an order-10 node
+ * and indexing by bit number, but the XArray can't allow a single multi-index
+ * entry in the head, which would significantly increase memory consumption
+ * for the IDA. So instead we divide the index by the number of bits in the
+ * leaf bitmap before doing a radix tree lookup.
*
* As an optimisation, if there are only a few low bits set in any given
- * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional
- * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits
- * directly in the entry. By being really tricksy, we could store
- * BITS_PER_LONG - 1 bits, but there're diminishing returns after optimising
- * for 0-3 allocated IDs.
- *
- * We allow the radix tree 'exceptional' count to get out of date. Nothing
- * in the IDA nor the radix tree code checks it. If it becomes important
- * to maintain an accurate exceptional count, switch the rcu_assign_pointer()
- * calls to radix_tree_iter_replace() which will correct the exceptional
- * count.
- *
- * The IDA always requires a lock to alloc/free. If we add a 'test_bit'
+ * leaf, instead of allocating a 128-byte bitmap, we store the bits
+ * as a value entry. Value entries never have the XA_FREE_MARK cleared
+ * because we can always convert them into a bitmap entry.
+ *
+ * It would be possible to optimise further; once we've run out of a
+ * single 128-byte bitmap, we currently switch to a 576-byte node, put
+ * the 128-byte bitmap in the first entry and then start allocating extra
+ * 128-byte entries. We could instead use the 512 bytes of the node's
+ * data as a bitmap before moving to that scheme. I do not believe this
+ * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
+ * users of the IDA and almost none of them use more than 1024 entries.
+ * Those that do use more than the 8192 IDs that the 512 bytes would
+ * provide.
+ *
+ * The IDA always uses a lock to alloc/free. If we add a 'test_bit'
* equivalent, it will still need locking. Going to RCU lookup would require
* using RCU to free bitmaps, and that's not trivial without embedding an
* RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
* bitmap, which is excessive.
*/
-#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS - 1)
-
-static int ida_get_new_above(struct ida *ida, int start)
+/**
+ * ida_alloc_range() - Allocate an unused ID.
+ * @ida: IDA handle.
+ * @min: Lowest ID to allocate.
+ * @max: Highest ID to allocate.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocate an ID between @min and @max, inclusive. The allocated ID will
+ * not exceed %INT_MAX, even if @max is larger.
+ *
+ * Context: Any context.
+ * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
+ * or %-ENOSPC if there are no free IDs.
+ */
+int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
+ gfp_t gfp)
{
- struct radix_tree_root *root = &ida->ida_rt;
- void __rcu **slot;
- struct radix_tree_iter iter;
- struct ida_bitmap *bitmap;
- unsigned long index;
- unsigned bit, ebit;
- int new;
-
- index = start / IDA_BITMAP_BITS;
- bit = start % IDA_BITMAP_BITS;
- ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT;
-
- slot = radix_tree_iter_init(&iter, index);
- for (;;) {
- if (slot)
- slot = radix_tree_next_slot(slot, &iter,
- RADIX_TREE_ITER_TAGGED);
- if (!slot) {
- slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX);
- if (IS_ERR(slot)) {
- if (slot == ERR_PTR(-ENOMEM))
- return -EAGAIN;
- return PTR_ERR(slot);
+ XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
+ unsigned bit = min % IDA_BITMAP_BITS;
+ unsigned long flags;
+ struct ida_bitmap *bitmap, *alloc = NULL;
+
+ if ((int)min < 0)
+ return -ENOSPC;
+
+ if ((int)max < 0)
+ max = INT_MAX;
+
+retry:
+ xas_lock_irqsave(&xas, flags);
+next:
+ bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
+ if (xas.xa_index > min / IDA_BITMAP_BITS)
+ bit = 0;
+ if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
+ goto nospc;
+
+ if (xa_is_value(bitmap)) {
+ unsigned long tmp = xa_to_value(bitmap);
+
+ if (bit < BITS_PER_XA_VALUE) {
+ bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
+ if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
+ goto nospc;
+ if (bit < BITS_PER_XA_VALUE) {
+ tmp |= 1UL << bit;
+ xas_store(&xas, xa_mk_value(tmp));
+ goto out;
}
}
- if (iter.index > index) {
- bit = 0;
- ebit = RADIX_TREE_EXCEPTIONAL_SHIFT;
- }
- new = iter.index * IDA_BITMAP_BITS;
- bitmap = rcu_dereference_raw(*slot);
- if (radix_tree_exception(bitmap)) {
- unsigned long tmp = (unsigned long)bitmap;
- ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit);
- if (ebit < BITS_PER_LONG) {
- tmp |= 1UL << ebit;
- rcu_assign_pointer(*slot, (void *)tmp);
- return new + ebit -
- RADIX_TREE_EXCEPTIONAL_SHIFT;
- }
- bitmap = this_cpu_xchg(ida_bitmap, NULL);
- if (!bitmap)
- return -EAGAIN;
- bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT;
- rcu_assign_pointer(*slot, bitmap);
+ bitmap = alloc;
+ if (!bitmap)
+ bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
+ if (!bitmap)
+ goto alloc;
+ bitmap->bitmap[0] = tmp;
+ xas_store(&xas, bitmap);
+ if (xas_error(&xas)) {
+ bitmap->bitmap[0] = 0;
+ goto out;
}
+ }
- if (bitmap) {
- bit = find_next_zero_bit(bitmap->bitmap,
- IDA_BITMAP_BITS, bit);
- new += bit;
- if (new < 0)
- return -ENOSPC;
- if (bit == IDA_BITMAP_BITS)
- continue;
+ if (bitmap) {
+ bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
+ if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
+ goto nospc;
+ if (bit == IDA_BITMAP_BITS)
+ goto next;
- __set_bit(bit, bitmap->bitmap);
- if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
- radix_tree_iter_tag_clear(root, &iter,
- IDR_FREE);
+ __set_bit(bit, bitmap->bitmap);
+ if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ } else {
+ if (bit < BITS_PER_XA_VALUE) {
+ bitmap = xa_mk_value(1UL << bit);
} else {
- new += bit;
- if (new < 0)
- return -ENOSPC;
- if (ebit < BITS_PER_LONG) {
- bitmap = (void *)((1UL << ebit) |
- RADIX_TREE_EXCEPTIONAL_ENTRY);
- radix_tree_iter_replace(root, &iter, slot,
- bitmap);
- return new;
- }
- bitmap = this_cpu_xchg(ida_bitmap, NULL);
+ bitmap = alloc;
if (!bitmap)
- return -EAGAIN;
+ bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
+ if (!bitmap)
+ goto alloc;
__set_bit(bit, bitmap->bitmap);
- radix_tree_iter_replace(root, &iter, slot, bitmap);
}
-
- return new;
+ xas_store(&xas, bitmap);
+ }
+out:
+ xas_unlock_irqrestore(&xas, flags);
+ if (xas_nomem(&xas, gfp)) {
+ xas.xa_index = min / IDA_BITMAP_BITS;
+ bit = min % IDA_BITMAP_BITS;
+ goto retry;
}
+ if (bitmap != alloc)
+ kfree(alloc);
+ if (xas_error(&xas))
+ return xas_error(&xas);
+ return xas.xa_index * IDA_BITMAP_BITS + bit;
+alloc:
+ xas_unlock_irqrestore(&xas, flags);
+ alloc = kzalloc(sizeof(*bitmap), gfp);
+ if (!alloc)
+ return -ENOMEM;
+ xas_set(&xas, min / IDA_BITMAP_BITS);
+ bit = min % IDA_BITMAP_BITS;
+ goto retry;
+nospc:
+ xas_unlock_irqrestore(&xas, flags);
+ return -ENOSPC;
}
+EXPORT_SYMBOL(ida_alloc_range);
-static void ida_remove(struct ida *ida, int id)
+/**
+ * ida_free() - Release an allocated ID.
+ * @ida: IDA handle.
+ * @id: Previously allocated ID.
+ *
+ * Context: Any context.
+ */
+void ida_free(struct ida *ida, unsigned int id)
{
- unsigned long index = id / IDA_BITMAP_BITS;
- unsigned offset = id % IDA_BITMAP_BITS;
+ XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
+ unsigned bit = id % IDA_BITMAP_BITS;
struct ida_bitmap *bitmap;
- unsigned long *btmp;
- struct radix_tree_iter iter;
- void __rcu **slot;
+ unsigned long flags;
- slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index);
- if (!slot)
- goto err;
+ BUG_ON((int)id < 0);
+
+ xas_lock_irqsave(&xas, flags);
+ bitmap = xas_load(&xas);
- bitmap = rcu_dereference_raw(*slot);
- if (radix_tree_exception(bitmap)) {
- btmp = (unsigned long *)slot;
- offset += RADIX_TREE_EXCEPTIONAL_SHIFT;
- if (offset >= BITS_PER_LONG)
+ if (xa_is_value(bitmap)) {
+ unsigned long v = xa_to_value(bitmap);
+ if (bit >= BITS_PER_XA_VALUE)
goto err;
+ if (!(v & (1UL << bit)))
+ goto err;
+ v &= ~(1UL << bit);
+ if (!v)
+ goto delete;
+ xas_store(&xas, xa_mk_value(v));
} else {
- btmp = bitmap->bitmap;
- }
- if (!test_bit(offset, btmp))
- goto err;
-
- __clear_bit(offset, btmp);
- radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE);
- if (radix_tree_exception(bitmap)) {
- if (rcu_dereference_raw(*slot) ==
- (void *)RADIX_TREE_EXCEPTIONAL_ENTRY)
- radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
- } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) {
- kfree(bitmap);
- radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
+ if (!test_bit(bit, bitmap->bitmap))
+ goto err;
+ __clear_bit(bit, bitmap->bitmap);
+ xas_set_mark(&xas, XA_FREE_MARK);
+ if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
+ kfree(bitmap);
+delete:
+ xas_store(&xas, NULL);
+ }
}
+ xas_unlock_irqrestore(&xas, flags);
return;
err:
+ xas_unlock_irqrestore(&xas, flags);
WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
+EXPORT_SYMBOL(ida_free);
/**
* ida_destroy() - Free all IDs.
@@ -500,80 +533,60 @@ static void ida_remove(struct ida *ida, int id)
*/
void ida_destroy(struct ida *ida)
{
+ XA_STATE(xas, &ida->xa, 0);
+ struct ida_bitmap *bitmap;
unsigned long flags;
- struct radix_tree_iter iter;
- void __rcu **slot;
- xa_lock_irqsave(&ida->ida_rt, flags);
- radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) {
- struct ida_bitmap *bitmap = rcu_dereference_raw(*slot);
- if (!radix_tree_exception(bitmap))
+ xas_lock_irqsave(&xas, flags);
+ xas_for_each(&xas, bitmap, ULONG_MAX) {
+ if (!xa_is_value(bitmap))
kfree(bitmap);
- radix_tree_iter_delete(&ida->ida_rt, &iter, slot);
+ xas_store(&xas, NULL);
}
- xa_unlock_irqrestore(&ida->ida_rt, flags);
+ xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(ida_destroy);
-/**
- * ida_alloc_range() - Allocate an unused ID.
- * @ida: IDA handle.
- * @min: Lowest ID to allocate.
- * @max: Highest ID to allocate.
- * @gfp: Memory allocation flags.
- *
- * Allocate an ID between @min and @max, inclusive. The allocated ID will
- * not exceed %INT_MAX, even if @max is larger.
- *
- * Context: Any context.
- * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
- * or %-ENOSPC if there are no free IDs.
- */
-int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
- gfp_t gfp)
-{
- int id = 0;
- unsigned long flags;
+#ifndef __KERNEL__
+extern void xa_dump_index(unsigned long index, unsigned int shift);
+#define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS)
- if ((int)min < 0)
- return -ENOSPC;
-
- if ((int)max < 0)
- max = INT_MAX;
-
-again:
- xa_lock_irqsave(&ida->ida_rt, flags);
- id = ida_get_new_above(ida, min);
- if (id > (int)max) {
- ida_remove(ida, id);
- id = -ENOSPC;
- }
- xa_unlock_irqrestore(&ida->ida_rt, flags);
+static void ida_dump_entry(void *entry, unsigned long index)
+{
+ unsigned long i;
+
+ if (!entry)
+ return;
+
+ if (xa_is_node(entry)) {
+ struct xa_node *node = xa_to_node(entry);
+ unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
+ XA_CHUNK_SHIFT;
+
+ xa_dump_index(index * IDA_BITMAP_BITS, shift);
+ xa_dump_node(node);
+ for (i = 0; i < XA_CHUNK_SIZE; i++)
+ ida_dump_entry(node->slots[i],
+ index | (i << node->shift));
+ } else if (xa_is_value(entry)) {
+ xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
+ pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
+ } else {
+ struct ida_bitmap *bitmap = entry;
- if (unlikely(id == -EAGAIN)) {
- if (!ida_pre_get(ida, gfp))
- return -ENOMEM;
- goto again;
+ xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
+ pr_cont("bitmap: %p data", bitmap);
+ for (i = 0; i < IDA_BITMAP_LONGS; i++)
+ pr_cont(" %lx", bitmap->bitmap[i]);
+ pr_cont("\n");
}
-
- return id;
}
-EXPORT_SYMBOL(ida_alloc_range);
-/**
- * ida_free() - Release an allocated ID.
- * @ida: IDA handle.
- * @id: Previously allocated ID.
- *
- * Context: Any context.
- */
-void ida_free(struct ida *ida, unsigned int id)
+static void ida_dump(struct ida *ida)
{
- unsigned long flags;
-
- BUG_ON((int)id < 0);
- xa_lock_irqsave(&ida->ida_rt, flags);
- ida_remove(ida, id);
- xa_unlock_irqrestore(&ida->ida_rt, flags);
+ struct xarray *xa = &ida->xa;
+ pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head,
+ xa->xa_flags >> ROOT_TAG_SHIFT);
+ ida_dump_entry(xa->xa_head, 0);
}
-EXPORT_SYMBOL(ida_free);
+#endif
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index bc03ecc4dfd2..1106bb6aa01e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,15 +38,13 @@
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/xarray.h>
-/* Number of nodes in fully populated tree of given height */
-static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
/*
* Radix tree node cache.
*/
-static struct kmem_cache *radix_tree_node_cachep;
+struct kmem_cache *radix_tree_node_cachep;
/*
* The radix tree is variable-height, so an insert operation not only has
@@ -98,24 +96,7 @@ static inline void *node_to_entry(void *ptr)
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
}
-#define RADIX_TREE_RETRY node_to_entry(NULL)
-
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-/* Sibling slots point directly to another slot in the same node */
-static inline
-bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
-{
- void __rcu **ptr = node;
- return (parent->slots <= ptr) &&
- (ptr < parent->slots + RADIX_TREE_MAP_SIZE);
-}
-#else
-static inline
-bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
-{
- return false;
-}
-#endif
+#define RADIX_TREE_RETRY XA_RETRY_ENTRY
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
@@ -129,24 +110,13 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- if (radix_tree_is_internal_node(entry)) {
- if (is_sibling_entry(parent, entry)) {
- void __rcu **sibentry;
- sibentry = (void __rcu **) entry_to_node(entry);
- offset = get_slot_offset(parent, sibentry);
- entry = rcu_dereference_raw(*sibentry);
- }
- }
-#endif
-
*nodep = (void *)entry;
return offset;
}
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
- return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
+ return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
}
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
@@ -169,32 +139,32 @@ static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
- root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
+ root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
}
static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
- root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
+ root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}
static inline void root_tag_clear_all(struct radix_tree_root *root)
{
- root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1;
+ root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}
static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
- return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT));
+ return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}
static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
- return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT;
+ return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}
static inline bool is_idr(const struct radix_tree_root *root)
{
- return !!(root->gfp_mask & ROOT_IS_IDR);
+ return !!(root->xa_flags & ROOT_IS_IDR);
}
/*
@@ -254,7 +224,7 @@ radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
- return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
+ return iter->index & RADIX_TREE_MAP_MASK;
}
/*
@@ -277,99 +247,6 @@ static unsigned long next_index(unsigned long index,
return (index & ~node_maxindex(node)) + (offset << node->shift);
}
-#ifndef __KERNEL__
-static void dump_node(struct radix_tree_node *node, unsigned long index)
-{
- unsigned long i;
-
- pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
- node, node->offset, index, index | node_maxindex(node),
- node->parent,
- node->tags[0][0], node->tags[1][0], node->tags[2][0],
- node->shift, node->count, node->exceptional);
-
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- unsigned long first = index | (i << node->shift);
- unsigned long last = first | ((1UL << node->shift) - 1);
- void *entry = node->slots[i];
- if (!entry)
- continue;
- if (entry == RADIX_TREE_RETRY) {
- pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
- i, first, last, node);
- } else if (!radix_tree_is_internal_node(entry)) {
- pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
- entry, i, first, last, node);
- } else if (is_sibling_entry(node, entry)) {
- pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
- entry, i, first, last, node,
- *(void **)entry_to_node(entry));
- } else {
- dump_node(entry_to_node(entry), first);
- }
- }
-}
-
-/* For debug */
-static void radix_tree_dump(struct radix_tree_root *root)
-{
- pr_debug("radix root: %p rnode %p tags %x\n",
- root, root->rnode,
- root->gfp_mask >> ROOT_TAG_SHIFT);
- if (!radix_tree_is_internal_node(root->rnode))
- return;
- dump_node(entry_to_node(root->rnode), 0);
-}
-
-static void dump_ida_node(void *entry, unsigned long index)
-{
- unsigned long i;
-
- if (!entry)
- return;
-
- if (radix_tree_is_internal_node(entry)) {
- struct radix_tree_node *node = entry_to_node(entry);
-
- pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n",
- node, node->offset, index * IDA_BITMAP_BITS,
- ((index | node_maxindex(node)) + 1) *
- IDA_BITMAP_BITS - 1,
- node->parent, node->tags[0][0], node->shift,
- node->count);
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
- dump_ida_node(node->slots[i],
- index | (i << node->shift));
- } else if (radix_tree_exceptional_entry(entry)) {
- pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n",
- entry, (int)(index & RADIX_TREE_MAP_MASK),
- index * IDA_BITMAP_BITS,
- index * IDA_BITMAP_BITS + BITS_PER_LONG -
- RADIX_TREE_EXCEPTIONAL_SHIFT,
- (unsigned long)entry >>
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- } else {
- struct ida_bitmap *bitmap = entry;
-
- pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap,
- (int)(index & RADIX_TREE_MAP_MASK),
- index * IDA_BITMAP_BITS,
- (index + 1) * IDA_BITMAP_BITS - 1);
- for (i = 0; i < IDA_BITMAP_LONGS; i++)
- pr_cont(" %lx", bitmap->bitmap[i]);
- pr_cont("\n");
- }
-}
-
-static void ida_dump(struct ida *ida)
-{
- struct radix_tree_root *root = &ida->ida_rt;
- pr_debug("ida: %p node %p free %d\n", ida, root->rnode,
- root->gfp_mask >> ROOT_TAG_SHIFT);
- dump_ida_node(root->rnode, 0);
-}
-#endif
-
/*
* This assumes that the caller has performed appropriate preallocation, and
* that the caller has pinned this thread of control to the current CPU.
@@ -378,7 +255,7 @@ static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
struct radix_tree_root *root,
unsigned int shift, unsigned int offset,
- unsigned int count, unsigned int exceptional)
+ unsigned int count, unsigned int nr_values)
{
struct radix_tree_node *ret = NULL;
@@ -425,14 +302,14 @@ out:
ret->shift = shift;
ret->offset = offset;
ret->count = count;
- ret->exceptional = exceptional;
+ ret->nr_values = nr_values;
ret->parent = parent;
- ret->root = root;
+ ret->array = root;
}
return ret;
}
-static void radix_tree_node_rcu_free(struct rcu_head *head)
+void radix_tree_node_rcu_free(struct rcu_head *head)
{
struct radix_tree_node *node =
container_of(head, struct radix_tree_node, rcu_head);
@@ -530,77 +407,10 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
}
EXPORT_SYMBOL(radix_tree_maybe_preload);
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-/*
- * Preload with enough objects to ensure that we can split a single entry
- * of order @old_order into many entries of size @new_order
- */
-int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
- gfp_t gfp_mask)
-{
- unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
- unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
- (new_order / RADIX_TREE_MAP_SHIFT);
- unsigned nr = 0;
-
- WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
- BUG_ON(new_order >= old_order);
-
- while (layers--)
- nr = nr * RADIX_TREE_MAP_SIZE + 1;
- return __radix_tree_preload(gfp_mask, top * nr);
-}
-#endif
-
-/*
- * The same as function above, but preload number of nodes required to insert
- * (1 << order) continuous naturally-aligned elements.
- */
-int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
-{
- unsigned long nr_subtrees;
- int nr_nodes, subtree_height;
-
- /* Preloading doesn't help anything with this gfp mask, skip it */
- if (!gfpflags_allow_blocking(gfp_mask)) {
- preempt_disable();
- return 0;
- }
-
- /*
- * Calculate number and height of fully populated subtrees it takes to
- * store (1 << order) elements.
- */
- nr_subtrees = 1 << order;
- for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
- subtree_height++)
- nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
-
- /*
- * The worst case is zero height tree with a single item at index 0 and
- * then inserting items starting at ULONG_MAX - (1 << order).
- *
- * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
- * 0-index item.
- */
- nr_nodes = RADIX_TREE_MAX_PATH;
-
- /* Plus branch to fully populated subtrees. */
- nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;
-
- /* Root node is shared. */
- nr_nodes--;
-
- /* Plus nodes required to build subtrees. */
- nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
-
- return __radix_tree_preload(gfp_mask, nr_nodes);
-}
-
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
struct radix_tree_node **nodep, unsigned long *maxindex)
{
- struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
+ struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
*nodep = node;
@@ -629,7 +439,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
while (index > shift_maxindex(maxshift))
maxshift += RADIX_TREE_MAP_SHIFT;
- entry = rcu_dereference_raw(root->rnode);
+ entry = rcu_dereference_raw(root->xa_head);
if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
goto out;
@@ -656,9 +466,9 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
BUG_ON(shift > BITS_PER_LONG);
if (radix_tree_is_internal_node(entry)) {
entry_to_node(entry)->parent = node;
- } else if (radix_tree_exceptional_entry(entry)) {
- /* Moving an exceptional root->rnode to a node */
- node->exceptional = 1;
+ } else if (xa_is_value(entry)) {
+ /* Moving a value entry root->xa_head to a node */
+ node->nr_values = 1;
}
/*
* entry was already in the radix tree, so we do not need
@@ -666,7 +476,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
*/
node->slots[0] = (void __rcu *)entry;
entry = node_to_entry(node);
- rcu_assign_pointer(root->rnode, entry);
+ rcu_assign_pointer(root->xa_head, entry);
shift += RADIX_TREE_MAP_SHIFT;
} while (shift <= maxshift);
out:
@@ -677,13 +487,12 @@ out:
* radix_tree_shrink - shrink radix tree to minimum height
* @root radix tree root
*/
-static inline bool radix_tree_shrink(struct radix_tree_root *root,
- radix_tree_update_node_t update_node)
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
bool shrunk = false;
for (;;) {
- struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
+ struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
struct radix_tree_node *child;
if (!radix_tree_is_internal_node(node))
@@ -692,15 +501,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
/*
* The candidate node has more than one child, or its child
- * is not at the leftmost slot, or the child is a multiorder
- * entry, we cannot shrink.
+ * is not at the leftmost slot, we cannot shrink.
*/
if (node->count != 1)
break;
child = rcu_dereference_raw(node->slots[0]);
if (!child)
break;
- if (!radix_tree_is_internal_node(child) && node->shift)
+
+ /*
+ * For an IDR, we must not shrink entry 0 into the root in
+ * case somebody calls idr_replace() with a pointer that
+ * appears to be an internal entry
+ */
+ if (!node->shift && is_idr(root))
break;
if (radix_tree_is_internal_node(child))
@@ -711,9 +525,9 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
* (node->slots[0]), it will be safe to dereference the new
- * one (root->rnode) as far as dependent read barriers go.
+ * one (root->xa_head) as far as dependent read barriers go.
*/
- root->rnode = (void __rcu *)child;
+ root->xa_head = (void __rcu *)child;
if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
root_tag_clear(root, IDR_FREE);
@@ -738,8 +552,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
node->count = 0;
if (!radix_tree_is_internal_node(child)) {
node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
- if (update_node)
- update_node(node);
}
WARN_ON_ONCE(!list_empty(&node->private_list));
@@ -751,8 +563,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
}
static bool delete_node(struct radix_tree_root *root,
- struct radix_tree_node *node,
- radix_tree_update_node_t update_node)
+ struct radix_tree_node *node)
{
bool deleted = false;
@@ -761,9 +572,8 @@ static bool delete_node(struct radix_tree_root *root,
if (node->count) {
if (node_to_entry(node) ==
- rcu_dereference_raw(root->rnode))
- deleted |= radix_tree_shrink(root,
- update_node);
+ rcu_dereference_raw(root->xa_head))
+ deleted |= radix_tree_shrink(root);
return deleted;
}
@@ -778,7 +588,7 @@ static bool delete_node(struct radix_tree_root *root,
*/
if (!is_idr(root))
root_tag_clear_all(root);
- root->rnode = NULL;
+ root->xa_head = NULL;
}
WARN_ON_ONCE(!list_empty(&node->private_list));
@@ -795,7 +605,6 @@ static bool delete_node(struct radix_tree_root *root,
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
- * @order: index occupies 2^order aligned slots
* @nodep: returns node
* @slotp: returns slot
*
@@ -803,36 +612,34 @@ static bool delete_node(struct radix_tree_root *root,
* at position @index in the radix tree @root.
*
* Until there is more than one item in the tree, no nodes are
- * allocated and @root->rnode is used as a direct slot instead of
+ * allocated and @root->xa_head is used as a direct slot instead of
* pointing to a node, in which case *@nodep will be NULL.
*
* Returns -ENOMEM, or 0 for success.
*/
-int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
- unsigned order, struct radix_tree_node **nodep,
- void __rcu ***slotp)
+static int __radix_tree_create(struct radix_tree_root *root,
+ unsigned long index, struct radix_tree_node **nodep,
+ void __rcu ***slotp)
{
struct radix_tree_node *node = NULL, *child;
- void __rcu **slot = (void __rcu **)&root->rnode;
+ void __rcu **slot = (void __rcu **)&root->xa_head;
unsigned long maxindex;
unsigned int shift, offset = 0;
- unsigned long max = index | ((1UL << order) - 1);
+ unsigned long max = index;
gfp_t gfp = root_gfp_mask(root);
shift = radix_tree_load_root(root, &child, &maxindex);
/* Make sure the tree is high enough. */
- if (order > 0 && max == ((1UL << order) - 1))
- max++;
if (max > maxindex) {
int error = radix_tree_extend(root, gfp, max, shift);
if (error < 0)
return error;
shift = error;
- child = rcu_dereference_raw(root->rnode);
+ child = rcu_dereference_raw(root->xa_head);
}
- while (shift > order) {
+ while (shift > 0) {
shift -= RADIX_TREE_MAP_SHIFT;
if (child == NULL) {
/* Have to add a child node. */
@@ -875,8 +682,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
for (;;) {
void *entry = rcu_dereference_raw(child->slots[offset]);
- if (radix_tree_is_internal_node(entry) &&
- !is_sibling_entry(child, entry)) {
+ if (xa_is_node(entry) && child->shift) {
child = entry_to_node(entry);
offset = 0;
continue;
@@ -894,96 +700,30 @@ static void radix_tree_free_nodes(struct radix_tree_node *node)
}
}
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
static inline int insert_entries(struct radix_tree_node *node,
- void __rcu **slot, void *item, unsigned order, bool replace)
-{
- struct radix_tree_node *child;
- unsigned i, n, tag, offset, tags = 0;
-
- if (node) {
- if (order > node->shift)
- n = 1 << (order - node->shift);
- else
- n = 1;
- offset = get_slot_offset(node, slot);
- } else {
- n = 1;
- offset = 0;
- }
-
- if (n > 1) {
- offset = offset & ~(n - 1);
- slot = &node->slots[offset];
- }
- child = node_to_entry(slot);
-
- for (i = 0; i < n; i++) {
- if (slot[i]) {
- if (replace) {
- node->count--;
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tag_get(node, tag, offset + i))
- tags |= 1 << tag;
- } else
- return -EEXIST;
- }
- }
-
- for (i = 0; i < n; i++) {
- struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
- if (i) {
- rcu_assign_pointer(slot[i], child);
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tags & (1 << tag))
- tag_clear(node, tag, offset + i);
- } else {
- rcu_assign_pointer(slot[i], item);
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tags & (1 << tag))
- tag_set(node, tag, offset);
- }
- if (radix_tree_is_internal_node(old) &&
- !is_sibling_entry(node, old) &&
- (old != RADIX_TREE_RETRY))
- radix_tree_free_nodes(old);
- if (radix_tree_exceptional_entry(old))
- node->exceptional--;
- }
- if (node) {
- node->count += n;
- if (radix_tree_exceptional_entry(item))
- node->exceptional += n;
- }
- return n;
-}
-#else
-static inline int insert_entries(struct radix_tree_node *node,
- void __rcu **slot, void *item, unsigned order, bool replace)
+ void __rcu **slot, void *item, bool replace)
{
if (*slot)
return -EEXIST;
rcu_assign_pointer(*slot, item);
if (node) {
node->count++;
- if (radix_tree_exceptional_entry(item))
- node->exceptional++;
+ if (xa_is_value(item))
+ node->nr_values++;
}
return 1;
}
-#endif
/**
* __radix_tree_insert - insert into a radix tree
* @root: radix tree root
* @index: index key
- * @order: key covers the 2^order indices around index
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
*/
-int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
- unsigned order, void *item)
+int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
+ void *item)
{
struct radix_tree_node *node;
void __rcu **slot;
@@ -991,11 +731,11 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
BUG_ON(radix_tree_is_internal_node(item));
- error = __radix_tree_create(root, index, order, &node, &slot);
+ error = __radix_tree_create(root, index, &node, &slot);
if (error)
return error;
- error = insert_entries(node, slot, item, order, false);
+ error = insert_entries(node, slot, item, false);
if (error < 0)
return error;
@@ -1010,7 +750,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
return 0;
}
-EXPORT_SYMBOL(__radix_tree_insert);
+EXPORT_SYMBOL(radix_tree_insert);
/**
* __radix_tree_lookup - lookup an item in a radix tree
@@ -1023,7 +763,7 @@ EXPORT_SYMBOL(__radix_tree_insert);
* tree @root.
*
* Until there is more than one item in the tree, no nodes are
- * allocated and @root->rnode is used as a direct slot instead of
+ * allocated and @root->xa_head is used as a direct slot instead of
* pointing to a node, in which case *@nodep will be NULL.
*/
void *__radix_tree_lookup(const struct radix_tree_root *root,
@@ -1036,7 +776,7 @@ void *__radix_tree_lookup(const struct radix_tree_root *root,
restart:
parent = NULL;
- slot = (void __rcu **)&root->rnode;
+ slot = (void __rcu **)&root->xa_head;
radix_tree_load_root(root, &node, &maxindex);
if (index > maxindex)
return NULL;
@@ -1049,6 +789,8 @@ void *__radix_tree_lookup(const struct radix_tree_root *root,
parent = entry_to_node(node);
offset = radix_tree_descend(parent, &node, index);
slot = parent->slots + offset;
+ if (parent->shift == 0)
+ break;
}
if (nodep)
@@ -1100,36 +842,12 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);
-static inline void replace_sibling_entries(struct radix_tree_node *node,
- void __rcu **slot, int count, int exceptional)
-{
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- void *ptr = node_to_entry(slot);
- unsigned offset = get_slot_offset(node, slot) + 1;
-
- while (offset < RADIX_TREE_MAP_SIZE) {
- if (rcu_dereference_raw(node->slots[offset]) != ptr)
- break;
- if (count < 0) {
- node->slots[offset] = NULL;
- node->count--;
- }
- node->exceptional += exceptional;
- offset++;
- }
-#endif
-}
-
static void replace_slot(void __rcu **slot, void *item,
- struct radix_tree_node *node, int count, int exceptional)
+ struct radix_tree_node *node, int count, int values)
{
- if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
- return;
-
- if (node && (count || exceptional)) {
+ if (node && (count || values)) {
node->count += count;
- node->exceptional += exceptional;
- replace_sibling_entries(node, slot, count, exceptional);
+ node->nr_values += values;
}
rcu_assign_pointer(*slot, item);
@@ -1172,37 +890,31 @@ static int calculate_count(struct radix_tree_root *root,
* @node: pointer to tree node
* @slot: pointer to slot in @node
* @item: new item to store in the slot.
- * @update_node: callback for changing leaf nodes
*
* For use with __radix_tree_lookup(). Caller must hold tree write locked
* across slot lookup and replacement.
*/
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
- void __rcu **slot, void *item,
- radix_tree_update_node_t update_node)
+ void __rcu **slot, void *item)
{
void *old = rcu_dereference_raw(*slot);
- int exceptional = !!radix_tree_exceptional_entry(item) -
- !!radix_tree_exceptional_entry(old);
+ int values = !!xa_is_value(item) - !!xa_is_value(old);
int count = calculate_count(root, node, slot, item, old);
/*
- * This function supports replacing exceptional entries and
+ * This function supports replacing value entries and
* deleting entries, but that needs accounting against the
- * node unless the slot is root->rnode.
+ * node unless the slot is root->xa_head.
*/
- WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) &&
- (count || exceptional));
- replace_slot(slot, item, node, count, exceptional);
+ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
+ (count || values));
+ replace_slot(slot, item, node, count, values);
if (!node)
return;
- if (update_node)
- update_node(node);
-
- delete_node(root, node, update_node);
+ delete_node(root, node);
}
/**
@@ -1211,12 +923,12 @@ void __radix_tree_replace(struct radix_tree_root *root,
* @slot: pointer to slot
* @item: new item to store in the slot.
*
- * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * For use with radix_tree_lookup_slot() and
* radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
* across slot lookup and replacement.
*
* NOTE: This cannot be used to switch between non-entries (empty slots),
- * regular entries, and exceptional entries, as that requires accounting
+ * regular entries, and value entries, as that requires accounting
* inside the radix tree node. When switching from one type of entry or
* deleting, use __radix_tree_lookup() and __radix_tree_replace() or
* radix_tree_iter_replace().
@@ -1224,7 +936,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
void radix_tree_replace_slot(struct radix_tree_root *root,
void __rcu **slot, void *item)
{
- __radix_tree_replace(root, NULL, slot, item, NULL);
+ __radix_tree_replace(root, NULL, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);
@@ -1234,162 +946,16 @@ EXPORT_SYMBOL(radix_tree_replace_slot);
* @slot: pointer to slot
* @item: new item to store in the slot.
*
- * For use with radix_tree_split() and radix_tree_for_each_slot().
- * Caller must hold tree write locked across split and replacement.
+ * For use with radix_tree_for_each_slot().
+ * Caller must hold tree write locked.
*/
void radix_tree_iter_replace(struct radix_tree_root *root,
const struct radix_tree_iter *iter,
void __rcu **slot, void *item)
{
- __radix_tree_replace(root, iter->node, slot, item, NULL);
+ __radix_tree_replace(root, iter->node, slot, item);
}
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-/**
- * radix_tree_join - replace multiple entries with one multiorder entry
- * @root: radix tree root
- * @index: an index inside the new entry
- * @order: order of the new entry
- * @item: new entry
- *
- * Call this function to replace several entries with one larger entry.
- * The existing entries are presumed to not need freeing as a result of
- * this call.
- *
- * The replacement entry will have all the tags set on it that were set
- * on any of the entries it is replacing.
- */
-int radix_tree_join(struct radix_tree_root *root, unsigned long index,
- unsigned order, void *item)
-{
- struct radix_tree_node *node;
- void __rcu **slot;
- int error;
-
- BUG_ON(radix_tree_is_internal_node(item));
-
- error = __radix_tree_create(root, index, order, &node, &slot);
- if (!error)
- error = insert_entries(node, slot, item, order, true);
- if (error > 0)
- error = 0;
-
- return error;
-}
-
-/**
- * radix_tree_split - Split an entry into smaller entries
- * @root: radix tree root
- * @index: An index within the large entry
- * @order: Order of new entries
- *
- * Call this function as the first step in replacing a multiorder entry
- * with several entries of lower order. After this function returns,
- * loop over the relevant portion of the tree using radix_tree_for_each_slot()
- * and call radix_tree_iter_replace() to set up each new entry.
- *
- * The tags from this entry are replicated to all the new entries.
- *
- * The radix tree should be locked against modification during the entire
- * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which
- * should prompt RCU walkers to restart the lookup from the root.
- */
-int radix_tree_split(struct radix_tree_root *root, unsigned long index,
- unsigned order)
-{
- struct radix_tree_node *parent, *node, *child;
- void __rcu **slot;
- unsigned int offset, end;
- unsigned n, tag, tags = 0;
- gfp_t gfp = root_gfp_mask(root);
-
- if (!__radix_tree_lookup(root, index, &parent, &slot))
- return -ENOENT;
- if (!parent)
- return -ENOENT;
-
- offset = get_slot_offset(parent, slot);
-
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tag_get(parent, tag, offset))
- tags |= 1 << tag;
-
- for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
- if (!is_sibling_entry(parent,
- rcu_dereference_raw(parent->slots[end])))
- break;
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tags & (1 << tag))
- tag_set(parent, tag, end);
- /* rcu_assign_pointer ensures tags are set before RETRY */
- rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
- }
- rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
- parent->exceptional -= (end - offset);
-
- if (order == parent->shift)
- return 0;
- if (order > parent->shift) {
- while (offset < end)
- offset += insert_entries(parent, &parent->slots[offset],
- RADIX_TREE_RETRY, order, true);
- return 0;
- }
-
- node = parent;
-
- for (;;) {
- if (node->shift > order) {
- child = radix_tree_node_alloc(gfp, node, root,
- node->shift - RADIX_TREE_MAP_SHIFT,
- offset, 0, 0);
- if (!child)
- goto nomem;
- if (node != parent) {
- node->count++;
- rcu_assign_pointer(node->slots[offset],
- node_to_entry(child));
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tags & (1 << tag))
- tag_set(node, tag, offset);
- }
-
- node = child;
- offset = 0;
- continue;
- }
-
- n = insert_entries(node, &node->slots[offset],
- RADIX_TREE_RETRY, order, false);
- BUG_ON(n > RADIX_TREE_MAP_SIZE);
-
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- if (tags & (1 << tag))
- tag_set(node, tag, offset);
- offset += n;
-
- while (offset == RADIX_TREE_MAP_SIZE) {
- if (node == parent)
- break;
- offset = node->offset;
- child = node;
- node = node->parent;
- rcu_assign_pointer(node->slots[offset],
- node_to_entry(child));
- offset++;
- }
- if ((node == parent) && (offset == end))
- return 0;
- }
-
- nomem:
- /* Shouldn't happen; did user forget to preload? */
- /* TODO: free all the allocated nodes */
- WARN_ON(1);
- return -ENOMEM;
-}
-#endif
-
static void node_tag_set(struct radix_tree_root *root,
struct radix_tree_node *node,
unsigned int tag, unsigned int offset)
@@ -1447,18 +1013,6 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
}
EXPORT_SYMBOL(radix_tree_tag_set);
-/**
- * radix_tree_iter_tag_set - set a tag on the current iterator entry
- * @root: radix tree root
- * @iter: iterator state
- * @tag: tag to set
- */
-void radix_tree_iter_tag_set(struct radix_tree_root *root,
- const struct radix_tree_iter *iter, unsigned int tag)
-{
- node_tag_set(root, iter->node, tag, iter_offset(iter));
-}
-
static void node_tag_clear(struct radix_tree_root *root,
struct radix_tree_node *node,
unsigned int tag, unsigned int offset)
@@ -1574,14 +1128,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root,
}
EXPORT_SYMBOL(radix_tree_tag_get);
-static inline void __set_iter_shift(struct radix_tree_iter *iter,
- unsigned int shift)
-{
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
- iter->shift = shift;
-#endif
-}
-
/* Construct iter->tags bit-mask from node->tags[tag] array */
static void set_iter_tags(struct radix_tree_iter *iter,
struct radix_tree_node *node, unsigned offset,
@@ -1608,92 +1154,11 @@ static void set_iter_tags(struct radix_tree_iter *iter,
}
}
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-static void __rcu **skip_siblings(struct radix_tree_node **nodep,
- void __rcu **slot, struct radix_tree_iter *iter)
-{
- while (iter->index < iter->next_index) {
- *nodep = rcu_dereference_raw(*slot);
- if (*nodep && !is_sibling_entry(iter->node, *nodep))
- return slot;
- slot++;
- iter->index = __radix_tree_iter_add(iter, 1);
- iter->tags >>= 1;
- }
-
- *nodep = NULL;
- return NULL;
-}
-
-void __rcu **__radix_tree_next_slot(void __rcu **slot,
- struct radix_tree_iter *iter, unsigned flags)
-{
- unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
- struct radix_tree_node *node;
-
- slot = skip_siblings(&node, slot, iter);
-
- while (radix_tree_is_internal_node(node)) {
- unsigned offset;
- unsigned long next_index;
-
- if (node == RADIX_TREE_RETRY)
- return slot;
- node = entry_to_node(node);
- iter->node = node;
- iter->shift = node->shift;
-
- if (flags & RADIX_TREE_ITER_TAGGED) {
- offset = radix_tree_find_next_bit(node, tag, 0);
- if (offset == RADIX_TREE_MAP_SIZE)
- return NULL;
- slot = &node->slots[offset];
- iter->index = __radix_tree_iter_add(iter, offset);
- set_iter_tags(iter, node, offset, tag);
- node = rcu_dereference_raw(*slot);
- } else {
- offset = 0;
- slot = &node->slots[0];
- for (;;) {
- node = rcu_dereference_raw(*slot);
- if (node)
- break;
- slot++;
- offset++;
- if (offset == RADIX_TREE_MAP_SIZE)
- return NULL;
- }
- iter->index = __radix_tree_iter_add(iter, offset);
- }
- if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
- goto none;
- next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
- if (next_index < iter->next_index)
- iter->next_index = next_index;
- }
-
- return slot;
- none:
- iter->next_index = 0;
- return NULL;
-}
-EXPORT_SYMBOL(__radix_tree_next_slot);
-#else
-static void __rcu **skip_siblings(struct radix_tree_node **nodep,
- void __rcu **slot, struct radix_tree_iter *iter)
-{
- return slot;
-}
-#endif
-
void __rcu **radix_tree_iter_resume(void __rcu **slot,
struct radix_tree_iter *iter)
{
- struct radix_tree_node *node;
-
slot++;
iter->index = __radix_tree_iter_add(iter, 1);
- skip_siblings(&node, slot, iter);
iter->next_index = iter->index;
iter->tags = 0;
return NULL;
@@ -1744,8 +1209,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
iter->next_index = maxindex + 1;
iter->tags = 1;
iter->node = NULL;
- __set_iter_shift(iter, 0);
- return (void __rcu **)&root->rnode;
+ return (void __rcu **)&root->xa_head;
}
do {
@@ -1765,8 +1229,6 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
while (++offset < RADIX_TREE_MAP_SIZE) {
void *slot = rcu_dereference_raw(
node->slots[offset]);
- if (is_sibling_entry(node, slot))
- continue;
if (slot)
break;
}
@@ -1784,13 +1246,12 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
goto restart;
if (child == RADIX_TREE_RETRY)
break;
- } while (radix_tree_is_internal_node(child));
+ } while (node->shift && radix_tree_is_internal_node(child));
/* Update the iterator state */
- iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
+ iter->index = (index &~ node_maxindex(node)) | offset;
iter->next_index = (index | node_maxindex(node)) + 1;
iter->node = node;
- __set_iter_shift(iter, node->shift);
if (flags & RADIX_TREE_ITER_TAGGED)
set_iter_tags(iter, node, offset, tag);
@@ -1847,48 +1308,6 @@ radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
EXPORT_SYMBOL(radix_tree_gang_lookup);
/**
- * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
- * @root: radix tree root
- * @results: where the results of the lookup are placed
- * @indices: where their indices should be placed (but usually NULL)
- * @first_index: start the lookup from this key
- * @max_items: place up to this many items at *results
- *
- * Performs an index-ascending scan of the tree for present items. Places
- * their slots at *@results and returns the number of items which were
- * placed at *@results.
- *
- * The implementation is naive.
- *
- * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
- * be dereferenced with radix_tree_deref_slot, and if using only RCU
- * protection, radix_tree_deref_slot may fail requiring a retry.
- */
-unsigned int
-radix_tree_gang_lookup_slot(const struct radix_tree_root *root,
- void __rcu ***results, unsigned long *indices,
- unsigned long first_index, unsigned int max_items)
-{
- struct radix_tree_iter iter;
- void __rcu **slot;
- unsigned int ret = 0;
-
- if (unlikely(!max_items))
- return 0;
-
- radix_tree_for_each_slot(slot, root, &iter, first_index) {
- results[ret] = slot;
- if (indices)
- indices[ret] = iter.index;
- if (++ret == max_items)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
-
-/**
* radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
* based on a tag
* @root: radix tree root
@@ -1964,28 +1383,11 @@ radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
-/**
- * __radix_tree_delete_node - try to free node after clearing a slot
- * @root: radix tree root
- * @node: node containing @index
- * @update_node: callback for changing leaf nodes
- *
- * After clearing the slot at @index in @node from radix tree
- * rooted at @root, call this function to attempt freeing the
- * node and shrinking the tree.
- */
-void __radix_tree_delete_node(struct radix_tree_root *root,
- struct radix_tree_node *node,
- radix_tree_update_node_t update_node)
-{
- delete_node(root, node, update_node);
-}
-
static bool __radix_tree_delete(struct radix_tree_root *root,
struct radix_tree_node *node, void __rcu **slot)
{
void *old = rcu_dereference_raw(*slot);
- int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0;
+ int values = xa_is_value(old) ? -1 : 0;
unsigned offset = get_slot_offset(node, slot);
int tag;
@@ -1995,8 +1397,8 @@ static bool __radix_tree_delete(struct radix_tree_root *root,
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
node_tag_clear(root, node, tag, offset);
- replace_slot(slot, NULL, node, -1, exceptional);
- return node && delete_node(root, node, NULL);
+ replace_slot(slot, NULL, node, -1, values);
+ return node && delete_node(root, node);
}
/**
@@ -2068,19 +1470,6 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_delete);
-void radix_tree_clear_tags(struct radix_tree_root *root,
- struct radix_tree_node *node,
- void __rcu **slot)
-{
- if (node) {
- unsigned int tag, offset = get_slot_offset(node, slot);
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
- node_tag_clear(root, node, tag, offset);
- } else {
- root_tag_clear_all(root);
- }
-}
-
/**
* radix_tree_tagged - test whether any items in the tree are tagged
* @root: radix tree root
@@ -2106,33 +1495,12 @@ void idr_preload(gfp_t gfp_mask)
}
EXPORT_SYMBOL(idr_preload);
-int ida_pre_get(struct ida *ida, gfp_t gfp)
-{
- /*
- * The IDA API has no preload_end() equivalent. Instead,
- * ida_get_new() can return -EAGAIN, prompting the caller
- * to return to the ida_pre_get() step.
- */
- if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
- preempt_enable();
-
- if (!this_cpu_read(ida_bitmap)) {
- struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp);
- if (!bitmap)
- return 0;
- if (this_cpu_cmpxchg(ida_bitmap, NULL, bitmap))
- kfree(bitmap);
- }
-
- return 1;
-}
-
void __rcu **idr_get_free(struct radix_tree_root *root,
struct radix_tree_iter *iter, gfp_t gfp,
unsigned long max)
{
struct radix_tree_node *node = NULL, *child;
- void __rcu **slot = (void __rcu **)&root->rnode;
+ void __rcu **slot = (void __rcu **)&root->xa_head;
unsigned long maxindex, start = iter->next_index;
unsigned int shift, offset = 0;
@@ -2148,8 +1516,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
if (error < 0)
return ERR_PTR(error);
shift = error;
- child = rcu_dereference_raw(root->rnode);
+ child = rcu_dereference_raw(root->xa_head);
}
+ if (start == 0 && shift == 0)
+ shift = RADIX_TREE_MAP_SHIFT;
while (shift) {
shift -= RADIX_TREE_MAP_SHIFT;
@@ -2192,7 +1562,6 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
else
iter->next_index = 1;
iter->node = node;
- __set_iter_shift(iter, shift);
set_iter_tags(iter, node, offset, IDR_FREE);
return slot;
@@ -2211,10 +1580,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root,
*/
void idr_destroy(struct idr *idr)
{
- struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode);
+ struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
if (radix_tree_is_internal_node(node))
radix_tree_free_nodes(node);
- idr->idr_rt.rnode = NULL;
+ idr->idr_rt.xa_head = NULL;
root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);
@@ -2228,31 +1597,6 @@ radix_tree_node_ctor(void *arg)
INIT_LIST_HEAD(&node->private_list);
}
-static __init unsigned long __maxindex(unsigned int height)
-{
- unsigned int width = height * RADIX_TREE_MAP_SHIFT;
- int shift = RADIX_TREE_INDEX_BITS - width;
-
- if (shift < 0)
- return ~0UL;
- if (shift >= BITS_PER_LONG)
- return 0UL;
- return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxnodes(void)
-{
- unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
- unsigned int i, j;
-
- for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
- height_to_maxindex[i] = __maxindex(i);
- for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
- for (j = i; j > 0; j--)
- height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
- }
-}
-
static int radix_tree_cpu_dead(unsigned int cpu)
{
struct radix_tree_preload *rtp;
@@ -2266,8 +1610,6 @@ static int radix_tree_cpu_dead(unsigned int cpu)
kmem_cache_free(radix_tree_node_cachep, node);
rtp->nr--;
}
- kfree(per_cpu(ida_bitmap, cpu));
- per_cpu(ida_bitmap, cpu) = NULL;
return 0;
}
@@ -2277,11 +1619,11 @@ void __init radix_tree_init(void)
BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
+ BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
- radix_tree_init_maxnodes();
ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
NULL, radix_tree_cpu_dead);
WARN_ON(ret < 0);
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
new file mode 100644
index 000000000000..aa47754150ce
--- /dev/null
+++ b/lib/test_xarray.c
@@ -0,0 +1,1238 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * test_xarray.c: Test the XArray API
+ * Copyright (c) 2017-2018 Microsoft Corporation
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/xarray.h>
+#include <linux/module.h>
+
+static unsigned int tests_run;
+static unsigned int tests_passed;
+
+#ifndef XA_DEBUG
+# ifdef __KERNEL__
+void xa_dump(const struct xarray *xa) { }
+# endif
+#undef XA_BUG_ON
+#define XA_BUG_ON(xa, x) do { \
+ tests_run++; \
+ if (x) { \
+ printk("BUG at %s:%d\n", __func__, __LINE__); \
+ xa_dump(xa); \
+ dump_stack(); \
+ } else { \
+ tests_passed++; \
+ } \
+} while (0)
+#endif
+
+static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp)
+{
+ return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp);
+}
+
+static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp)
+{
+ u32 id = 0;
+
+ XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_value(index & LONG_MAX),
+ gfp) != 0);
+ XA_BUG_ON(xa, id != index);
+}
+
+static void xa_erase_index(struct xarray *xa, unsigned long index)
+{
+ XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX));
+ XA_BUG_ON(xa, xa_load(xa, index) != NULL);
+}
+
+/*
+ * If anyone needs this, please move it to xarray.c. We have no current
+ * users outside the test suite because all current multislot users want
+ * to use the advanced API.
+ */
+static void *xa_store_order(struct xarray *xa, unsigned long index,
+ unsigned order, void *entry, gfp_t gfp)
+{
+ XA_STATE_ORDER(xas, xa, index, order);
+ void *curr;
+
+ do {
+ xas_lock(&xas);
+ curr = xas_store(&xas, entry);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ return curr;
+}
+
+static noinline void check_xa_err(struct xarray *xa)
+{
+ XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0);
+ XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0);
+#ifndef __KERNEL__
+ /* The kernel does not fail GFP_NOWAIT allocations */
+ XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM);
+ XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM);
+#endif
+ XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_KERNEL)) != 0);
+ XA_BUG_ON(xa, xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) != 0);
+ XA_BUG_ON(xa, xa_err(xa_erase(xa, 1)) != 0);
+// kills the test-suite :-(
+// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
+}
+
+static noinline void check_xas_retry(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+ void *entry;
+
+ xa_store_index(xa, 0, GFP_KERNEL);
+ xa_store_index(xa, 1, GFP_KERNEL);
+
+ rcu_read_lock();
+ XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_value(0));
+ xa_erase_index(xa, 1);
+ XA_BUG_ON(xa, !xa_is_retry(xas_reload(&xas)));
+ XA_BUG_ON(xa, xas_retry(&xas, NULL));
+ XA_BUG_ON(xa, xas_retry(&xas, xa_mk_value(0)));
+ xas_reset(&xas);
+ XA_BUG_ON(xa, xas.xa_node != XAS_RESTART);
+ XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
+ XA_BUG_ON(xa, xas.xa_node != NULL);
+
+ XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
+ XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas)));
+ xas.xa_node = XAS_RESTART;
+ XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
+ rcu_read_unlock();
+
+ /* Make sure we can iterate through retry entries */
+ xas_lock(&xas);
+ xas_set(&xas, 0);
+ xas_store(&xas, XA_RETRY_ENTRY);
+ xas_set(&xas, 1);
+ xas_store(&xas, XA_RETRY_ENTRY);
+
+ xas_set(&xas, 0);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ xas_store(&xas, xa_mk_value(xas.xa_index));
+ }
+ xas_unlock(&xas);
+
+ xa_erase_index(xa, 0);
+ xa_erase_index(xa, 1);
+}
+
+static noinline void check_xa_load(struct xarray *xa)
+{
+ unsigned long i, j;
+
+ for (i = 0; i < 1024; i++) {
+ for (j = 0; j < 1024; j++) {
+ void *entry = xa_load(xa, j);
+ if (j < i)
+ XA_BUG_ON(xa, xa_to_value(entry) != j);
+ else
+ XA_BUG_ON(xa, entry);
+ }
+ XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
+ }
+
+ for (i = 0; i < 1024; i++) {
+ for (j = 0; j < 1024; j++) {
+ void *entry = xa_load(xa, j);
+ if (j >= i)
+ XA_BUG_ON(xa, xa_to_value(entry) != j);
+ else
+ XA_BUG_ON(xa, entry);
+ }
+ xa_erase_index(xa, i);
+ }
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index)
+{
+ unsigned int order;
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 8 : 1;
+
+ /* NULL elements have no marks set */
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
+ xa_set_mark(xa, index, XA_MARK_0);
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
+
+ /* Storing a pointer will not make a mark appear */
+ XA_BUG_ON(xa, xa_store_index(xa, index, GFP_KERNEL) != NULL);
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
+ xa_set_mark(xa, index, XA_MARK_0);
+ XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0));
+
+ /* Setting one mark will not set another mark */
+ XA_BUG_ON(xa, xa_get_mark(xa, index + 1, XA_MARK_0));
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1));
+
+ /* Storing NULL clears marks, and they can't be set again */
+ xa_erase_index(xa, index);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
+ xa_set_mark(xa, index, XA_MARK_0);
+ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0));
+
+ /*
+ * Storing a multi-index entry over entries with marks gives the
+ * entire entry the union of the marks
+ */
+ BUG_ON((index % 4) != 0);
+ for (order = 2; order < max_order; order++) {
+ unsigned long base = round_down(index, 1UL << order);
+ unsigned long next = base + (1UL << order);
+ unsigned long i;
+
+ XA_BUG_ON(xa, xa_store_index(xa, index + 1, GFP_KERNEL));
+ xa_set_mark(xa, index + 1, XA_MARK_0);
+ XA_BUG_ON(xa, xa_store_index(xa, index + 2, GFP_KERNEL));
+ xa_set_mark(xa, index + 2, XA_MARK_1);
+ XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL));
+ xa_store_order(xa, index, order, xa_mk_value(index),
+ GFP_KERNEL);
+ for (i = base; i < next; i++) {
+ XA_STATE(xas, xa, i);
+ unsigned int seen = 0;
+ void *entry;
+
+ XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0));
+ XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1));
+ XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2));
+
+ /* We should see two elements in the array */
+ xas_for_each(&xas, entry, ULONG_MAX)
+ seen++;
+ XA_BUG_ON(xa, seen != 2);
+
+ /* One of which is marked */
+ xas_set(&xas, 0);
+ seen = 0;
+ xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
+ seen++;
+ XA_BUG_ON(xa, seen != 1);
+ }
+ XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0));
+ XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1));
+ XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2));
+ xa_erase_index(xa, index);
+ xa_erase_index(xa, next);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_xa_mark_2(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+ unsigned long index;
+ unsigned int count = 0;
+ void *entry;
+
+ xa_store_index(xa, 0, GFP_KERNEL);
+ xa_set_mark(xa, 0, XA_MARK_0);
+ xas_lock(&xas);
+ xas_load(&xas);
+ xas_init_marks(&xas);
+ xas_unlock(&xas);
+ XA_BUG_ON(xa, !xa_get_mark(xa, 0, XA_MARK_0) == 0);
+
+ for (index = 3500; index < 4500; index++) {
+ xa_store_index(xa, index, GFP_KERNEL);
+ xa_set_mark(xa, index, XA_MARK_0);
+ }
+
+ xas_reset(&xas);
+ rcu_read_lock();
+ xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
+ count++;
+ rcu_read_unlock();
+ XA_BUG_ON(xa, count != 1000);
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ xas_init_marks(&xas);
+ XA_BUG_ON(xa, !xa_get_mark(xa, xas.xa_index, XA_MARK_0));
+ XA_BUG_ON(xa, !xas_get_mark(&xas, XA_MARK_0));
+ }
+ xas_unlock(&xas);
+
+ xa_destroy(xa);
+}
+
+static noinline void check_xa_mark(struct xarray *xa)
+{
+ unsigned long index;
+
+ for (index = 0; index < 16384; index += 4)
+ check_xa_mark_1(xa, index);
+
+ check_xa_mark_2(xa);
+}
+
+static noinline void check_xa_shrink(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 1);
+ struct xa_node *node;
+ unsigned int order;
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 15 : 1;
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+ XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL);
+ XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
+
+ /*
+ * Check that erasing the entry at 1 shrinks the tree and properly
+ * marks the node as being deleted.
+ */
+ xas_lock(&xas);
+ XA_BUG_ON(xa, xas_load(&xas) != xa_mk_value(1));
+ node = xas.xa_node;
+ XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != xa_mk_value(0));
+ XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1));
+ XA_BUG_ON(xa, xa_load(xa, 1) != NULL);
+ XA_BUG_ON(xa, xas.xa_node != XAS_BOUNDS);
+ XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != XA_RETRY_ENTRY);
+ XA_BUG_ON(xa, xas_load(&xas) != NULL);
+ xas_unlock(&xas);
+ XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
+ xa_erase_index(xa, 0);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ for (order = 0; order < max_order; order++) {
+ unsigned long max = (1UL << order) - 1;
+ xa_store_order(xa, 0, order, xa_mk_value(0), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, max) != xa_mk_value(0));
+ XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL);
+ rcu_read_lock();
+ node = xa_head(xa);
+ rcu_read_unlock();
+ XA_BUG_ON(xa, xa_store_index(xa, ULONG_MAX, GFP_KERNEL) !=
+ NULL);
+ rcu_read_lock();
+ XA_BUG_ON(xa, xa_head(xa) == node);
+ rcu_read_unlock();
+ XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL);
+ xa_erase_index(xa, ULONG_MAX);
+ XA_BUG_ON(xa, xa->xa_head != node);
+ xa_erase_index(xa, 0);
+ }
+}
+
+static noinline void check_cmpxchg(struct xarray *xa)
+{
+ void *FIVE = xa_mk_value(5);
+ void *SIX = xa_mk_value(6);
+ void *LOTS = xa_mk_value(12345678);
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+ XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_KERNEL) != NULL);
+ XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EEXIST);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, SIX, FIVE, GFP_KERNEL) != LOTS);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, LOTS, FIVE, GFP_KERNEL) != LOTS);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, FIVE, LOTS, GFP_KERNEL) != FIVE);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 5, FIVE, NULL, GFP_KERNEL) != NULL);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 5, NULL, FIVE, GFP_KERNEL) != NULL);
+ xa_erase_index(xa, 12345678);
+ xa_erase_index(xa, 5);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_reserve(struct xarray *xa)
+{
+ void *entry;
+ unsigned long index = 0;
+
+ /* An array with a reserved entry is not empty */
+ XA_BUG_ON(xa, !xa_empty(xa));
+ xa_reserve(xa, 12345678, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_empty(xa));
+ XA_BUG_ON(xa, xa_load(xa, 12345678));
+ xa_release(xa, 12345678);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* Releasing a used entry does nothing */
+ xa_reserve(xa, 12345678, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL);
+ xa_release(xa, 12345678);
+ xa_erase_index(xa, 12345678);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* cmpxchg sees a reserved entry as NULL */
+ xa_reserve(xa, 12345678, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678),
+ GFP_NOWAIT) != NULL);
+ xa_release(xa, 12345678);
+ xa_erase_index(xa, 12345678);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* Can iterate through a reserved entry */
+ xa_store_index(xa, 5, GFP_KERNEL);
+ xa_reserve(xa, 6, GFP_KERNEL);
+ xa_store_index(xa, 7, GFP_KERNEL);
+
+ xa_for_each(xa, entry, index, ULONG_MAX, XA_PRESENT) {
+ XA_BUG_ON(xa, index != 5 && index != 7);
+ }
+ xa_destroy(xa);
+}
+
+static noinline void check_xas_erase(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+ void *entry;
+ unsigned long i, j;
+
+ for (i = 0; i < 200; i++) {
+ for (j = i; j < 2 * i + 17; j++) {
+ xas_set(&xas, j);
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, xa_mk_value(j));
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+ }
+
+ xas_set(&xas, ULONG_MAX);
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, xa_mk_value(0));
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ xas_lock(&xas);
+ xas_store(&xas, NULL);
+
+ xas_set(&xas, 0);
+ j = i;
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ XA_BUG_ON(xa, entry != xa_mk_value(j));
+ xas_store(&xas, NULL);
+ j++;
+ }
+ xas_unlock(&xas);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+}
+
+#ifdef CONFIG_XARRAY_MULTI
+static noinline void check_multi_store_1(struct xarray *xa, unsigned long index,
+ unsigned int order)
+{
+ XA_STATE(xas, xa, index);
+ unsigned long min = index & ~((1UL << order) - 1);
+ unsigned long max = min + (1UL << order);
+
+ xa_store_order(xa, index, order, xa_mk_value(index), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(index));
+ XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(index));
+ XA_BUG_ON(xa, xa_load(xa, max) != NULL);
+ XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL);
+
+ XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(min)) != xa_mk_value(index));
+ XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(min));
+ XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(min));
+ XA_BUG_ON(xa, xa_load(xa, max) != NULL);
+ XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL);
+
+ xa_erase_index(xa, min);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_multi_store_2(struct xarray *xa, unsigned long index,
+ unsigned int order)
+{
+ XA_STATE(xas, xa, index);
+ xa_store_order(xa, index, order, xa_mk_value(0), GFP_KERNEL);
+
+ XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(1)) != xa_mk_value(0));
+ XA_BUG_ON(xa, xas.xa_index != index);
+ XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1));
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+#endif
+
+static noinline void check_multi_store(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned long i, j, k;
+ unsigned int max_order = (sizeof(long) == 4) ? 30 : 60;
+
+ /* Loading from any position returns the same value */
+ xa_store_order(xa, 0, 1, xa_mk_value(0), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
+ XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0));
+ XA_BUG_ON(xa, xa_load(xa, 2) != NULL);
+ rcu_read_lock();
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 2);
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2);
+ rcu_read_unlock();
+
+ /* Storing adjacent to the value does not alter the value */
+ xa_store(xa, 3, xa, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0));
+ XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0));
+ XA_BUG_ON(xa, xa_load(xa, 2) != NULL);
+ rcu_read_lock();
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 3);
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2);
+ rcu_read_unlock();
+
+ /* Overwriting multiple indexes works */
+ xa_store_order(xa, 0, 2, xa_mk_value(1), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(1));
+ XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1));
+ XA_BUG_ON(xa, xa_load(xa, 2) != xa_mk_value(1));
+ XA_BUG_ON(xa, xa_load(xa, 3) != xa_mk_value(1));
+ XA_BUG_ON(xa, xa_load(xa, 4) != NULL);
+ rcu_read_lock();
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 4);
+ XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 4);
+ rcu_read_unlock();
+
+ /* We can erase multiple values with a single store */
+ xa_store_order(xa, 0, 63, NULL, GFP_KERNEL);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* Even when the first slot is empty but the others aren't */
+ xa_store_index(xa, 1, GFP_KERNEL);
+ xa_store_index(xa, 2, GFP_KERNEL);
+ xa_store_order(xa, 0, 2, NULL, GFP_KERNEL);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ for (i = 0; i < max_order; i++) {
+ for (j = 0; j < max_order; j++) {
+ xa_store_order(xa, 0, i, xa_mk_value(i), GFP_KERNEL);
+ xa_store_order(xa, 0, j, xa_mk_value(j), GFP_KERNEL);
+
+ for (k = 0; k < max_order; k++) {
+ void *entry = xa_load(xa, (1UL << k) - 1);
+ if ((i < k) && (j < k))
+ XA_BUG_ON(xa, entry != NULL);
+ else
+ XA_BUG_ON(xa, entry != xa_mk_value(j));
+ }
+
+ xa_erase(xa, 0);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+ }
+
+ for (i = 0; i < 20; i++) {
+ check_multi_store_1(xa, 200, i);
+ check_multi_store_1(xa, 0, i);
+ check_multi_store_1(xa, (1UL << i) + 1, i);
+ }
+ check_multi_store_2(xa, 4095, 9);
+#endif
+}
+
+static DEFINE_XARRAY_ALLOC(xa0);
+
+static noinline void check_xa_alloc(void)
+{
+ int i;
+ u32 id;
+
+ /* An empty array should assign 0 to the first alloc */
+ xa_alloc_index(&xa0, 0, GFP_KERNEL);
+
+ /* Erasing it should make the array empty again */
+ xa_erase_index(&xa0, 0);
+ XA_BUG_ON(&xa0, !xa_empty(&xa0));
+
+ /* And it should assign 0 again */
+ xa_alloc_index(&xa0, 0, GFP_KERNEL);
+
+ /* The next assigned ID should be 1 */
+ xa_alloc_index(&xa0, 1, GFP_KERNEL);
+ xa_erase_index(&xa0, 1);
+
+ /* Storing a value should mark it used */
+ xa_store_index(&xa0, 1, GFP_KERNEL);
+ xa_alloc_index(&xa0, 2, GFP_KERNEL);
+
+ /* If we then erase 0, it should be free */
+ xa_erase_index(&xa0, 0);
+ xa_alloc_index(&xa0, 0, GFP_KERNEL);
+
+ xa_erase_index(&xa0, 1);
+ xa_erase_index(&xa0, 2);
+
+ for (i = 1; i < 5000; i++) {
+ xa_alloc_index(&xa0, i, GFP_KERNEL);
+ }
+
+ xa_destroy(&xa0);
+
+ id = 0xfffffffeU;
+ XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
+ GFP_KERNEL) != 0);
+ XA_BUG_ON(&xa0, id != 0xfffffffeU);
+ XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
+ GFP_KERNEL) != 0);
+ XA_BUG_ON(&xa0, id != 0xffffffffU);
+ XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0),
+ GFP_KERNEL) != -ENOSPC);
+ XA_BUG_ON(&xa0, id != 0xffffffffU);
+ xa_destroy(&xa0);
+}
+
+static noinline void __check_store_iter(struct xarray *xa, unsigned long start,
+ unsigned int order, unsigned int present)
+{
+ XA_STATE_ORDER(xas, xa, start, order);
+ void *entry;
+ unsigned int count = 0;
+
+retry:
+ xas_lock(&xas);
+ xas_for_each_conflict(&xas, entry) {
+ XA_BUG_ON(xa, !xa_is_value(entry));
+ XA_BUG_ON(xa, entry < xa_mk_value(start));
+ XA_BUG_ON(xa, entry > xa_mk_value(start + (1UL << order) - 1));
+ count++;
+ }
+ xas_store(&xas, xa_mk_value(start));
+ xas_unlock(&xas);
+ if (xas_nomem(&xas, GFP_KERNEL)) {
+ count = 0;
+ goto retry;
+ }
+ XA_BUG_ON(xa, xas_error(&xas));
+ XA_BUG_ON(xa, count != present);
+ XA_BUG_ON(xa, xa_load(xa, start) != xa_mk_value(start));
+ XA_BUG_ON(xa, xa_load(xa, start + (1UL << order) - 1) !=
+ xa_mk_value(start));
+ xa_erase_index(xa, start);
+}
+
+static noinline void check_store_iter(struct xarray *xa)
+{
+ unsigned int i, j;
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+
+ for (i = 0; i < max_order; i++) {
+ unsigned int min = 1 << i;
+ unsigned int max = (2 << i) - 1;
+ __check_store_iter(xa, 0, i, 0);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ __check_store_iter(xa, min, i, 0);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ xa_store_index(xa, min, GFP_KERNEL);
+ __check_store_iter(xa, min, i, 1);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ xa_store_index(xa, max, GFP_KERNEL);
+ __check_store_iter(xa, min, i, 1);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ for (j = 0; j < min; j++)
+ xa_store_index(xa, j, GFP_KERNEL);
+ __check_store_iter(xa, 0, i, min);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ for (j = 0; j < min; j++)
+ xa_store_index(xa, min + j, GFP_KERNEL);
+ __check_store_iter(xa, min, i, min);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+#ifdef CONFIG_XARRAY_MULTI
+ xa_store_index(xa, 63, GFP_KERNEL);
+ xa_store_index(xa, 65, GFP_KERNEL);
+ __check_store_iter(xa, 64, 2, 1);
+ xa_erase_index(xa, 63);
+#endif
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_multi_find(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned long index;
+
+ xa_store_order(xa, 12, 2, xa_mk_value(12), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_store_index(xa, 16, GFP_KERNEL) != NULL);
+
+ index = 0;
+ XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) !=
+ xa_mk_value(12));
+ XA_BUG_ON(xa, index != 12);
+ index = 13;
+ XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) !=
+ xa_mk_value(12));
+ XA_BUG_ON(xa, (index < 12) || (index >= 16));
+ XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT) !=
+ xa_mk_value(16));
+ XA_BUG_ON(xa, index != 16);
+
+ xa_erase_index(xa, 12);
+ xa_erase_index(xa, 16);
+ XA_BUG_ON(xa, !xa_empty(xa));
+#endif
+}
+
+static noinline void check_multi_find_2(struct xarray *xa)
+{
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 10 : 1;
+ unsigned int i, j;
+ void *entry;
+
+ for (i = 0; i < max_order; i++) {
+ unsigned long index = 1UL << i;
+ for (j = 0; j < index; j++) {
+ XA_STATE(xas, xa, j + index);
+ xa_store_index(xa, index - 1, GFP_KERNEL);
+ xa_store_order(xa, index, i, xa_mk_value(index),
+ GFP_KERNEL);
+ rcu_read_lock();
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ xa_erase_index(xa, index);
+ }
+ rcu_read_unlock();
+ xa_erase_index(xa, index - 1);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+ }
+}
+
+static noinline void check_find(struct xarray *xa)
+{
+ unsigned long i, j, k;
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /*
+ * Check xa_find with all pairs between 0 and 99 inclusive,
+ * starting at every index between 0 and 99
+ */
+ for (i = 0; i < 100; i++) {
+ XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
+ xa_set_mark(xa, i, XA_MARK_0);
+ for (j = 0; j < i; j++) {
+ XA_BUG_ON(xa, xa_store_index(xa, j, GFP_KERNEL) !=
+ NULL);
+ xa_set_mark(xa, j, XA_MARK_0);
+ for (k = 0; k < 100; k++) {
+ unsigned long index = k;
+ void *entry = xa_find(xa, &index, ULONG_MAX,
+ XA_PRESENT);
+ if (k <= j)
+ XA_BUG_ON(xa, index != j);
+ else if (k <= i)
+ XA_BUG_ON(xa, index != i);
+ else
+ XA_BUG_ON(xa, entry != NULL);
+
+ index = k;
+ entry = xa_find(xa, &index, ULONG_MAX,
+ XA_MARK_0);
+ if (k <= j)
+ XA_BUG_ON(xa, index != j);
+ else if (k <= i)
+ XA_BUG_ON(xa, index != i);
+ else
+ XA_BUG_ON(xa, entry != NULL);
+ }
+ xa_erase_index(xa, j);
+ XA_BUG_ON(xa, xa_get_mark(xa, j, XA_MARK_0));
+ XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0));
+ }
+ xa_erase_index(xa, i);
+ XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_0));
+ }
+ XA_BUG_ON(xa, !xa_empty(xa));
+ check_multi_find(xa);
+ check_multi_find_2(xa);
+}
+
+/* See find_swap_entry() in mm/shmem.c */
+static noinline unsigned long xa_find_entry(struct xarray *xa, void *item)
+{
+ XA_STATE(xas, xa, 0);
+ unsigned int checked = 0;
+ void *entry;
+
+ rcu_read_lock();
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (xas_retry(&xas, entry))
+ continue;
+ if (entry == item)
+ break;
+ checked++;
+ if ((checked % 4) != 0)
+ continue;
+ xas_pause(&xas);
+ }
+ rcu_read_unlock();
+
+ return entry ? xas.xa_index : -1;
+}
+
+static noinline void check_find_entry(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned int order;
+ unsigned long offset, index;
+
+ for (order = 0; order < 20; order++) {
+ for (offset = 0; offset < (1UL << (order + 3));
+ offset += (1UL << order)) {
+ for (index = 0; index < (1UL << (order + 5));
+ index += (1UL << order)) {
+ xa_store_order(xa, index, order,
+ xa_mk_value(index), GFP_KERNEL);
+ XA_BUG_ON(xa, xa_load(xa, index) !=
+ xa_mk_value(index));
+ XA_BUG_ON(xa, xa_find_entry(xa,
+ xa_mk_value(index)) != index);
+ }
+ XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
+ xa_destroy(xa);
+ }
+ }
+#endif
+
+ XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
+ xa_store_index(xa, ULONG_MAX, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1);
+ XA_BUG_ON(xa, xa_find_entry(xa, xa_mk_value(LONG_MAX)) != -1);
+ xa_erase_index(xa, ULONG_MAX);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_move_small(struct xarray *xa, unsigned long idx)
+{
+ XA_STATE(xas, xa, 0);
+ unsigned long i;
+
+ xa_store_index(xa, 0, GFP_KERNEL);
+ xa_store_index(xa, idx, GFP_KERNEL);
+
+ rcu_read_lock();
+ for (i = 0; i < idx * 4; i++) {
+ void *entry = xas_next(&xas);
+ if (i <= idx)
+ XA_BUG_ON(xa, xas.xa_node == XAS_RESTART);
+ XA_BUG_ON(xa, xas.xa_index != i);
+ if (i == 0 || i == idx)
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ else
+ XA_BUG_ON(xa, entry != NULL);
+ }
+ xas_next(&xas);
+ XA_BUG_ON(xa, xas.xa_index != i);
+
+ do {
+ void *entry = xas_prev(&xas);
+ i--;
+ if (i <= idx)
+ XA_BUG_ON(xa, xas.xa_node == XAS_RESTART);
+ XA_BUG_ON(xa, xas.xa_index != i);
+ if (i == 0 || i == idx)
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ else
+ XA_BUG_ON(xa, entry != NULL);
+ } while (i > 0);
+
+ xas_set(&xas, ULONG_MAX);
+ XA_BUG_ON(xa, xas_next(&xas) != NULL);
+ XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
+ XA_BUG_ON(xa, xas_next(&xas) != xa_mk_value(0));
+ XA_BUG_ON(xa, xas.xa_index != 0);
+ XA_BUG_ON(xa, xas_prev(&xas) != NULL);
+ XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
+ rcu_read_unlock();
+
+ xa_erase_index(xa, 0);
+ xa_erase_index(xa, idx);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_move(struct xarray *xa)
+{
+ XA_STATE(xas, xa, (1 << 16) - 1);
+ unsigned long i;
+
+ for (i = 0; i < (1 << 16); i++)
+ XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL);
+
+ rcu_read_lock();
+ do {
+ void *entry = xas_prev(&xas);
+ i--;
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ XA_BUG_ON(xa, i != xas.xa_index);
+ } while (i != 0);
+
+ XA_BUG_ON(xa, xas_prev(&xas) != NULL);
+ XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
+
+ do {
+ void *entry = xas_next(&xas);
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ XA_BUG_ON(xa, i != xas.xa_index);
+ i++;
+ } while (i < (1 << 16));
+ rcu_read_unlock();
+
+ for (i = (1 << 8); i < (1 << 15); i++)
+ xa_erase_index(xa, i);
+
+ i = xas.xa_index;
+
+ rcu_read_lock();
+ do {
+ void *entry = xas_prev(&xas);
+ i--;
+ if ((i < (1 << 8)) || (i >= (1 << 15)))
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ else
+ XA_BUG_ON(xa, entry != NULL);
+ XA_BUG_ON(xa, i != xas.xa_index);
+ } while (i != 0);
+
+ XA_BUG_ON(xa, xas_prev(&xas) != NULL);
+ XA_BUG_ON(xa, xas.xa_index != ULONG_MAX);
+
+ do {
+ void *entry = xas_next(&xas);
+ if ((i < (1 << 8)) || (i >= (1 << 15)))
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ else
+ XA_BUG_ON(xa, entry != NULL);
+ XA_BUG_ON(xa, i != xas.xa_index);
+ i++;
+ } while (i < (1 << 16));
+ rcu_read_unlock();
+
+ xa_destroy(xa);
+
+ for (i = 0; i < 16; i++)
+ check_move_small(xa, 1UL << i);
+
+ for (i = 2; i < 16; i++)
+ check_move_small(xa, (1UL << i) - 1);
+}
+
+static noinline void xa_store_many_order(struct xarray *xa,
+ unsigned long index, unsigned order)
+{
+ XA_STATE_ORDER(xas, xa, index, order);
+ unsigned int i = 0;
+
+ do {
+ xas_lock(&xas);
+ XA_BUG_ON(xa, xas_find_conflict(&xas));
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < (1U << order); i++) {
+ XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(index + i)));
+ xas_next(&xas);
+ }
+unlock:
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ XA_BUG_ON(xa, xas_error(&xas));
+}
+
+static noinline void check_create_range_1(struct xarray *xa,
+ unsigned long index, unsigned order)
+{
+ unsigned long i;
+
+ xa_store_many_order(xa, index, order);
+ for (i = index; i < index + (1UL << order); i++)
+ xa_erase_index(xa, i);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_create_range_2(struct xarray *xa, unsigned order)
+{
+ unsigned long i;
+ unsigned long nr = 1UL << order;
+
+ for (i = 0; i < nr * nr; i += nr)
+ xa_store_many_order(xa, i, order);
+ for (i = 0; i < nr * nr; i++)
+ xa_erase_index(xa, i);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_create_range_3(void)
+{
+ XA_STATE(xas, NULL, 0);
+ xas_set_err(&xas, -EEXIST);
+ xas_create_range(&xas);
+ XA_BUG_ON(NULL, xas_error(&xas) != -EEXIST);
+}
+
+static noinline void check_create_range_4(struct xarray *xa,
+ unsigned long index, unsigned order)
+{
+ XA_STATE_ORDER(xas, xa, index, order);
+ unsigned long base = xas.xa_index;
+ unsigned long i = 0;
+
+ xa_store_index(xa, index, GFP_KERNEL);
+ do {
+ xas_lock(&xas);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < (1UL << order); i++) {
+ void *old = xas_store(&xas, xa_mk_value(base + i));
+ if (xas.xa_index == index)
+ XA_BUG_ON(xa, old != xa_mk_value(base + i));
+ else
+ XA_BUG_ON(xa, old != NULL);
+ xas_next(&xas);
+ }
+unlock:
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ XA_BUG_ON(xa, xas_error(&xas));
+
+ for (i = base; i < base + (1UL << order); i++)
+ xa_erase_index(xa, i);
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_create_range(struct xarray *xa)
+{
+ unsigned int order;
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 12 : 1;
+
+ for (order = 0; order < max_order; order++) {
+ check_create_range_1(xa, 0, order);
+ check_create_range_1(xa, 1U << order, order);
+ check_create_range_1(xa, 2U << order, order);
+ check_create_range_1(xa, 3U << order, order);
+ check_create_range_1(xa, 1U << 24, order);
+ if (order < 10)
+ check_create_range_2(xa, order);
+
+ check_create_range_4(xa, 0, order);
+ check_create_range_4(xa, 1U << order, order);
+ check_create_range_4(xa, 2U << order, order);
+ check_create_range_4(xa, 3U << order, order);
+ check_create_range_4(xa, 1U << 24, order);
+
+ check_create_range_4(xa, 1, order);
+ check_create_range_4(xa, (1U << order) + 1, order);
+ check_create_range_4(xa, (2U << order) + 1, order);
+ check_create_range_4(xa, (2U << order) - 1, order);
+ check_create_range_4(xa, (3U << order) + 1, order);
+ check_create_range_4(xa, (3U << order) - 1, order);
+ check_create_range_4(xa, (1U << 24) + 1, order);
+ }
+
+ check_create_range_3();
+}
+
+static noinline void __check_store_range(struct xarray *xa, unsigned long first,
+ unsigned long last)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ xa_store_range(xa, first, last, xa_mk_value(first), GFP_KERNEL);
+
+ XA_BUG_ON(xa, xa_load(xa, first) != xa_mk_value(first));
+ XA_BUG_ON(xa, xa_load(xa, last) != xa_mk_value(first));
+ XA_BUG_ON(xa, xa_load(xa, first - 1) != NULL);
+ XA_BUG_ON(xa, xa_load(xa, last + 1) != NULL);
+
+ xa_store_range(xa, first, last, NULL, GFP_KERNEL);
+#endif
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+static noinline void check_store_range(struct xarray *xa)
+{
+ unsigned long i, j;
+
+ for (i = 0; i < 128; i++) {
+ for (j = i; j < 128; j++) {
+ __check_store_range(xa, i, j);
+ __check_store_range(xa, 128 + i, 128 + j);
+ __check_store_range(xa, 4095 + i, 4095 + j);
+ __check_store_range(xa, 4096 + i, 4096 + j);
+ __check_store_range(xa, 123456 + i, 123456 + j);
+ __check_store_range(xa, UINT_MAX + i, UINT_MAX + j);
+ }
+ }
+}
+
+static LIST_HEAD(shadow_nodes);
+
+static void test_update_node(struct xa_node *node)
+{
+ if (node->count && node->count == node->nr_values) {
+ if (list_empty(&node->private_list))
+ list_add(&shadow_nodes, &node->private_list);
+ } else {
+ if (!list_empty(&node->private_list))
+ list_del_init(&node->private_list);
+ }
+}
+
+static noinline void shadow_remove(struct xarray *xa)
+{
+ struct xa_node *node;
+
+ xa_lock(xa);
+ while ((node = list_first_entry_or_null(&shadow_nodes,
+ struct xa_node, private_list))) {
+ XA_STATE(xas, node->array, 0);
+ XA_BUG_ON(xa, node->array != xa);
+ list_del_init(&node->private_list);
+ xas.xa_node = xa_parent_locked(node->array, node);
+ xas.xa_offset = node->offset;
+ xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
+ xas_set_update(&xas, test_update_node);
+ xas_store(&xas, NULL);
+ }
+ xa_unlock(xa);
+}
+
+static noinline void check_workingset(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ xas_set_update(&xas, test_update_node);
+
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, xa_mk_value(0));
+ xas_next(&xas);
+ xas_store(&xas, xa_mk_value(1));
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ XA_BUG_ON(xa, list_empty(&shadow_nodes));
+
+ xas_lock(&xas);
+ xas_next(&xas);
+ xas_store(&xas, &xas);
+ XA_BUG_ON(xa, !list_empty(&shadow_nodes));
+
+ xas_store(&xas, xa_mk_value(2));
+ xas_unlock(&xas);
+ XA_BUG_ON(xa, list_empty(&shadow_nodes));
+
+ shadow_remove(xa);
+ XA_BUG_ON(xa, !list_empty(&shadow_nodes));
+ XA_BUG_ON(xa, !xa_empty(xa));
+}
+
+/*
+ * Check that the pointer / value / sibling entries are accounted the
+ * way we expect them to be.
+ */
+static noinline void check_account(struct xarray *xa)
+{
+#ifdef CONFIG_XARRAY_MULTI
+ unsigned int order;
+
+ for (order = 1; order < 12; order++) {
+ XA_STATE(xas, xa, 1 << order);
+
+ xa_store_order(xa, 0, order, xa, GFP_KERNEL);
+ xas_load(&xas);
+ XA_BUG_ON(xa, xas.xa_node->count == 0);
+ XA_BUG_ON(xa, xas.xa_node->count > (1 << order));
+ XA_BUG_ON(xa, xas.xa_node->nr_values != 0);
+
+ xa_store_order(xa, 1 << order, order, xa_mk_value(1 << order),
+ GFP_KERNEL);
+ XA_BUG_ON(xa, xas.xa_node->count != xas.xa_node->nr_values * 2);
+
+ xa_erase(xa, 1 << order);
+ XA_BUG_ON(xa, xas.xa_node->nr_values != 0);
+
+ xa_erase(xa, 0);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+#endif
+}
+
+static noinline void check_destroy(struct xarray *xa)
+{
+ unsigned long index;
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* Destroying an empty array is a no-op */
+ xa_destroy(xa);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ /* Destroying an array with a single entry */
+ for (index = 0; index < 1000; index++) {
+ xa_store_index(xa, index, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_empty(xa));
+ xa_destroy(xa);
+ XA_BUG_ON(xa, !xa_empty(xa));
+ }
+
+ /* Destroying an array with a single entry at ULONG_MAX */
+ xa_store(xa, ULONG_MAX, xa, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_empty(xa));
+ xa_destroy(xa);
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+#ifdef CONFIG_XARRAY_MULTI
+ /* Destroying an array with a multi-index entry */
+ xa_store_order(xa, 1 << 11, 11, xa, GFP_KERNEL);
+ XA_BUG_ON(xa, xa_empty(xa));
+ xa_destroy(xa);
+ XA_BUG_ON(xa, !xa_empty(xa));
+#endif
+}
+
+static DEFINE_XARRAY(array);
+
+static int xarray_checks(void)
+{
+ check_xa_err(&array);
+ check_xas_retry(&array);
+ check_xa_load(&array);
+ check_xa_mark(&array);
+ check_xa_shrink(&array);
+ check_xas_erase(&array);
+ check_cmpxchg(&array);
+ check_reserve(&array);
+ check_multi_store(&array);
+ check_xa_alloc();
+ check_find(&array);
+ check_find_entry(&array);
+ check_account(&array);
+ check_destroy(&array);
+ check_move(&array);
+ check_create_range(&array);
+ check_store_range(&array);
+ check_store_iter(&array);
+
+ check_workingset(&array, 0);
+ check_workingset(&array, 64);
+ check_workingset(&array, 4096);
+
+ printk("XArray: %u of %u tests passed\n", tests_passed, tests_run);
+ return (tests_run == tests_passed) ? 0 : -EINVAL;
+}
+
+static void xarray_exit(void)
+{
+}
+
+module_init(xarray_checks);
+module_exit(xarray_exit);
+MODULE_AUTHOR("Matthew Wilcox <willy@infradead.org>");
+MODULE_LICENSE("GPL");
diff --git a/lib/xarray.c b/lib/xarray.c
new file mode 100644
index 000000000000..8b176f009c08
--- /dev/null
+++ b/lib/xarray.c
@@ -0,0 +1,2036 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * XArray implementation
+ * Copyright (c) 2017 Microsoft Corporation
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/export.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+
+/*
+ * Coding conventions in this file:
+ *
+ * @xa is used to refer to the entire xarray.
+ * @xas is the 'xarray operation state'. It may be either a pointer to
+ * an xa_state, or an xa_state stored on the stack. This is an unfortunate
+ * ambiguity.
+ * @index is the index of the entry being operated on
+ * @mark is an xa_mark_t; a small number indicating one of the mark bits.
+ * @node refers to an xa_node; usually the primary one being operated on by
+ * this function.
+ * @offset is the index into the slots array inside an xa_node.
+ * @parent refers to the @xa_node closer to the head than @node.
+ * @entry refers to something stored in a slot in the xarray
+ */
+
+static inline unsigned int xa_lock_type(const struct xarray *xa)
+{
+ return (__force unsigned int)xa->xa_flags & 3;
+}
+
+static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
+{
+ if (lock_type == XA_LOCK_IRQ)
+ xas_lock_irq(xas);
+ else if (lock_type == XA_LOCK_BH)
+ xas_lock_bh(xas);
+ else
+ xas_lock(xas);
+}
+
+static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
+{
+ if (lock_type == XA_LOCK_IRQ)
+ xas_unlock_irq(xas);
+ else if (lock_type == XA_LOCK_BH)
+ xas_unlock_bh(xas);
+ else
+ xas_unlock(xas);
+}
+
+static inline bool xa_track_free(const struct xarray *xa)
+{
+ return xa->xa_flags & XA_FLAGS_TRACK_FREE;
+}
+
+static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
+{
+ if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
+ xa->xa_flags |= XA_FLAGS_MARK(mark);
+}
+
+static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
+{
+ if (xa->xa_flags & XA_FLAGS_MARK(mark))
+ xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
+}
+
+static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
+{
+ return node->marks[(__force unsigned)mark];
+}
+
+static inline bool node_get_mark(struct xa_node *node,
+ unsigned int offset, xa_mark_t mark)
+{
+ return test_bit(offset, node_marks(node, mark));
+}
+
+/* returns true if the bit was set */
+static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
+ xa_mark_t mark)
+{
+ return __test_and_set_bit(offset, node_marks(node, mark));
+}
+
+/* returns true if the bit was set */
+static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
+ xa_mark_t mark)
+{
+ return __test_and_clear_bit(offset, node_marks(node, mark));
+}
+
+static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
+{
+ return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE);
+}
+
+static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
+{
+ bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE);
+}
+
+#define mark_inc(mark) do { \
+ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
+} while (0)
+
+/*
+ * xas_squash_marks() - Merge all marks to the first entry
+ * @xas: Array operation state.
+ *
+ * Set a mark on the first entry if any entry has it set. Clear marks on
+ * all sibling entries.
+ */
+static void xas_squash_marks(const struct xa_state *xas)
+{
+ unsigned int mark = 0;
+ unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;
+
+ if (!xas->xa_sibs)
+ return;
+
+ do {
+ unsigned long *marks = xas->xa_node->marks[mark];
+ if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
+ continue;
+ __set_bit(xas->xa_offset, marks);
+ bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
+ } while (mark++ != (__force unsigned)XA_MARK_MAX);
+}
+
+/* extracts the offset within this node from the index */
+static unsigned int get_offset(unsigned long index, struct xa_node *node)
+{
+ return (index >> node->shift) & XA_CHUNK_MASK;
+}
+
+static void xas_set_offset(struct xa_state *xas)
+{
+ xas->xa_offset = get_offset(xas->xa_index, xas->xa_node);
+}
+
+/* move the index either forwards (find) or backwards (sibling slot) */
+static void xas_move_index(struct xa_state *xas, unsigned long offset)
+{
+ unsigned int shift = xas->xa_node->shift;
+ xas->xa_index &= ~XA_CHUNK_MASK << shift;
+ xas->xa_index += offset << shift;
+}
+
+static void xas_advance(struct xa_state *xas)
+{
+ xas->xa_offset++;
+ xas_move_index(xas, xas->xa_offset);
+}
+
+static void *set_bounds(struct xa_state *xas)
+{
+ xas->xa_node = XAS_BOUNDS;
+ return NULL;
+}
+
+/*
+ * Starts a walk. If the @xas is already valid, we assume that it's on
+ * the right path and just return where we've got to. If we're in an
+ * error state, return NULL. If the index is outside the current scope
+ * of the xarray, return NULL without changing @xas->xa_node. Otherwise
+ * set @xas->xa_node to NULL and return the current head of the array.
+ */
+static void *xas_start(struct xa_state *xas)
+{
+ void *entry;
+
+ if (xas_valid(xas))
+ return xas_reload(xas);
+ if (xas_error(xas))
+ return NULL;
+
+ entry = xa_head(xas->xa);
+ if (!xa_is_node(entry)) {
+ if (xas->xa_index)
+ return set_bounds(xas);
+ } else {
+ if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
+ return set_bounds(xas);
+ }
+
+ xas->xa_node = NULL;
+ return entry;
+}
+
+static void *xas_descend(struct xa_state *xas, struct xa_node *node)
+{
+ unsigned int offset = get_offset(xas->xa_index, node);
+ void *entry = xa_entry(xas->xa, node, offset);
+
+ xas->xa_node = node;
+ if (xa_is_sibling(entry)) {
+ offset = xa_to_sibling(entry);
+ entry = xa_entry(xas->xa, node, offset);
+ }
+
+ xas->xa_offset = offset;
+ return entry;
+}
+
+/**
+ * xas_load() - Load an entry from the XArray (advanced).
+ * @xas: XArray operation state.
+ *
+ * Usually walks the @xas to the appropriate state to load the entry
+ * stored at xa_index. However, it will do nothing and return %NULL if
+ * @xas is in an error state. xas_load() will never expand the tree.
+ *
+ * If the xa_state is set up to operate on a multi-index entry, xas_load()
+ * may return %NULL or an internal entry, even if there are entries
+ * present within the range specified by @xas.
+ *
+ * Context: Any context. The caller should hold the xa_lock or the RCU lock.
+ * Return: Usually an entry in the XArray, but see description for exceptions.
+ */
+void *xas_load(struct xa_state *xas)
+{
+ void *entry = xas_start(xas);
+
+ while (xa_is_node(entry)) {
+ struct xa_node *node = xa_to_node(entry);
+
+ if (xas->xa_shift > node->shift)
+ break;
+ entry = xas_descend(xas, node);
+ }
+ return entry;
+}
+EXPORT_SYMBOL_GPL(xas_load);
+
+/* Move the radix tree node cache here */
+extern struct kmem_cache *radix_tree_node_cachep;
+extern void radix_tree_node_rcu_free(struct rcu_head *head);
+
+#define XA_RCU_FREE ((struct xarray *)1)
+
+static void xa_node_free(struct xa_node *node)
+{
+ XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
+ node->array = XA_RCU_FREE;
+ call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
+}
+
+/*
+ * xas_destroy() - Free any resources allocated during the XArray operation.
+ * @xas: XArray operation state.
+ *
+ * This function is now internal-only.
+ */
+static void xas_destroy(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_alloc;
+
+ if (!node)
+ return;
+ XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
+ kmem_cache_free(radix_tree_node_cachep, node);
+ xas->xa_alloc = NULL;
+}
+
+/**
+ * xas_nomem() - Allocate memory if needed.
+ * @xas: XArray operation state.
+ * @gfp: Memory allocation flags.
+ *
+ * If we need to add new nodes to the XArray, we try to allocate memory
+ * with GFP_NOWAIT while holding the lock, which will usually succeed.
+ * If it fails, @xas is flagged as needing memory to continue. The caller
+ * should drop the lock and call xas_nomem(). If xas_nomem() succeeds,
+ * the caller should retry the operation.
+ *
+ * Forward progress is guaranteed as one node is allocated here and
+ * stored in the xa_state where it will be found by xas_alloc(). More
+ * nodes will likely be found in the slab allocator, but we do not tie
+ * them up here.
+ *
+ * Return: true if memory was needed, and was successfully allocated.
+ */
+bool xas_nomem(struct xa_state *xas, gfp_t gfp)
+{
+ if (xas->xa_node != XA_ERROR(-ENOMEM)) {
+ xas_destroy(xas);
+ return false;
+ }
+ xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
+ if (!xas->xa_alloc)
+ return false;
+ XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
+ xas->xa_node = XAS_RESTART;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xas_nomem);
+
+/*
+ * __xas_nomem() - Drop locks and allocate memory if needed.
+ * @xas: XArray operation state.
+ * @gfp: Memory allocation flags.
+ *
+ * Internal variant of xas_nomem().
+ *
+ * Return: true if memory was needed, and was successfully allocated.
+ */
+static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
+ __must_hold(xas->xa->xa_lock)
+{
+ unsigned int lock_type = xa_lock_type(xas->xa);
+
+ if (xas->xa_node != XA_ERROR(-ENOMEM)) {
+ xas_destroy(xas);
+ return false;
+ }
+ if (gfpflags_allow_blocking(gfp)) {
+ xas_unlock_type(xas, lock_type);
+ xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
+ xas_lock_type(xas, lock_type);
+ } else {
+ xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
+ }
+ if (!xas->xa_alloc)
+ return false;
+ XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
+ xas->xa_node = XAS_RESTART;
+ return true;
+}
+
+static void xas_update(struct xa_state *xas, struct xa_node *node)
+{
+ if (xas->xa_update)
+ xas->xa_update(node);
+ else
+ XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
+}
+
+static void *xas_alloc(struct xa_state *xas, unsigned int shift)
+{
+ struct xa_node *parent = xas->xa_node;
+ struct xa_node *node = xas->xa_alloc;
+
+ if (xas_invalid(xas))
+ return NULL;
+
+ if (node) {
+ xas->xa_alloc = NULL;
+ } else {
+ node = kmem_cache_alloc(radix_tree_node_cachep,
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!node) {
+ xas_set_err(xas, -ENOMEM);
+ return NULL;
+ }
+ }
+
+ if (parent) {
+ node->offset = xas->xa_offset;
+ parent->count++;
+ XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
+ xas_update(xas, parent);
+ }
+ XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
+ XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
+ node->shift = shift;
+ node->count = 0;
+ node->nr_values = 0;
+ RCU_INIT_POINTER(node->parent, xas->xa_node);
+ node->array = xas->xa;
+
+ return node;
+}
+
+#ifdef CONFIG_XARRAY_MULTI
+/* Returns the number of indices covered by a given xa_state */
+static unsigned long xas_size(const struct xa_state *xas)
+{
+ return (xas->xa_sibs + 1UL) << xas->xa_shift;
+}
+#endif
+
+/*
+ * Use this to calculate the maximum index that will need to be created
+ * in order to add the entry described by @xas. Because we cannot store a
+ * multiple-index entry at index 0, the calculation is a little more complex
+ * than you might expect.
+ */
+static unsigned long xas_max(struct xa_state *xas)
+{
+ unsigned long max = xas->xa_index;
+
+#ifdef CONFIG_XARRAY_MULTI
+ if (xas->xa_shift || xas->xa_sibs) {
+ unsigned long mask = xas_size(xas) - 1;
+ max |= mask;
+ if (mask == max)
+ max++;
+ }
+#endif
+
+ return max;
+}
+
+/* The maximum index that can be contained in the array without expanding it */
+static unsigned long max_index(void *entry)
+{
+ if (!xa_is_node(entry))
+ return 0;
+ return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
+}
+
+static void xas_shrink(struct xa_state *xas)
+{
+ struct xarray *xa = xas->xa;
+ struct xa_node *node = xas->xa_node;
+
+ for (;;) {
+ void *entry;
+
+ XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
+ if (node->count != 1)
+ break;
+ entry = xa_entry_locked(xa, node, 0);
+ if (!entry)
+ break;
+ if (!xa_is_node(entry) && node->shift)
+ break;
+ xas->xa_node = XAS_BOUNDS;
+
+ RCU_INIT_POINTER(xa->xa_head, entry);
+ if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
+ xa_mark_clear(xa, XA_FREE_MARK);
+
+ node->count = 0;
+ node->nr_values = 0;
+ if (!xa_is_node(entry))
+ RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
+ xas_update(xas, node);
+ xa_node_free(node);
+ if (!xa_is_node(entry))
+ break;
+ node = xa_to_node(entry);
+ node->parent = NULL;
+ }
+}
+
+/*
+ * xas_delete_node() - Attempt to delete an xa_node
+ * @xas: Array operation state.
+ *
+ * Attempts to delete the @xas->xa_node. This will fail if xa->node has
+ * a non-zero reference count.
+ */
+static void xas_delete_node(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ for (;;) {
+ struct xa_node *parent;
+
+ XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
+ if (node->count)
+ break;
+
+ parent = xa_parent_locked(xas->xa, node);
+ xas->xa_node = parent;
+ xas->xa_offset = node->offset;
+ xa_node_free(node);
+
+ if (!parent) {
+ xas->xa->xa_head = NULL;
+ xas->xa_node = XAS_BOUNDS;
+ return;
+ }
+
+ parent->slots[xas->xa_offset] = NULL;
+ parent->count--;
+ XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
+ node = parent;
+ xas_update(xas, node);
+ }
+
+ if (!node->parent)
+ xas_shrink(xas);
+}
+
+/**
+ * xas_free_nodes() - Free this node and all nodes that it references
+ * @xas: Array operation state.
+ * @top: Node to free
+ *
+ * This node has been removed from the tree. We must now free it and all
+ * of its subnodes. There may be RCU walkers with references into the tree,
+ * so we must replace all entries with retry markers.
+ */
+static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
+{
+ unsigned int offset = 0;
+ struct xa_node *node = top;
+
+ for (;;) {
+ void *entry = xa_entry_locked(xas->xa, node, offset);
+
+ if (xa_is_node(entry)) {
+ node = xa_to_node(entry);
+ offset = 0;
+ continue;
+ }
+ if (entry)
+ RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
+ offset++;
+ while (offset == XA_CHUNK_SIZE) {
+ struct xa_node *parent;
+
+ parent = xa_parent_locked(xas->xa, node);
+ offset = node->offset + 1;
+ node->count = 0;
+ node->nr_values = 0;
+ xas_update(xas, node);
+ xa_node_free(node);
+ if (node == top)
+ return;
+ node = parent;
+ }
+ }
+}
+
+/*
+ * xas_expand adds nodes to the head of the tree until it has reached
+ * sufficient height to be able to contain @xas->xa_index
+ */
+static int xas_expand(struct xa_state *xas, void *head)
+{
+ struct xarray *xa = xas->xa;
+ struct xa_node *node = NULL;
+ unsigned int shift = 0;
+ unsigned long max = xas_max(xas);
+
+ if (!head) {
+ if (max == 0)
+ return 0;
+ while ((max >> shift) >= XA_CHUNK_SIZE)
+ shift += XA_CHUNK_SHIFT;
+ return shift + XA_CHUNK_SHIFT;
+ } else if (xa_is_node(head)) {
+ node = xa_to_node(head);
+ shift = node->shift + XA_CHUNK_SHIFT;
+ }
+ xas->xa_node = NULL;
+
+ while (max > max_index(head)) {
+ xa_mark_t mark = 0;
+
+ XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
+ node = xas_alloc(xas, shift);
+ if (!node)
+ return -ENOMEM;
+
+ node->count = 1;
+ if (xa_is_value(head))
+ node->nr_values = 1;
+ RCU_INIT_POINTER(node->slots[0], head);
+
+ /* Propagate the aggregated mark info to the new child */
+ for (;;) {
+ if (xa_track_free(xa) && mark == XA_FREE_MARK) {
+ node_mark_all(node, XA_FREE_MARK);
+ if (!xa_marked(xa, XA_FREE_MARK)) {
+ node_clear_mark(node, 0, XA_FREE_MARK);
+ xa_mark_set(xa, XA_FREE_MARK);
+ }
+ } else if (xa_marked(xa, mark)) {
+ node_set_mark(node, 0, mark);
+ }
+ if (mark == XA_MARK_MAX)
+ break;
+ mark_inc(mark);
+ }
+
+ /*
+ * Now that the new node is fully initialised, we can add
+ * it to the tree
+ */
+ if (xa_is_node(head)) {
+ xa_to_node(head)->offset = 0;
+ rcu_assign_pointer(xa_to_node(head)->parent, node);
+ }
+ head = xa_mk_node(node);
+ rcu_assign_pointer(xa->xa_head, head);
+ xas_update(xas, node);
+
+ shift += XA_CHUNK_SHIFT;
+ }
+
+ xas->xa_node = node;
+ return shift;
+}
+
+/*
+ * xas_create() - Create a slot to store an entry in.
+ * @xas: XArray operation state.
+ *
+ * Most users will not need to call this function directly, as it is called
+ * by xas_store(). It is useful for doing conditional store operations
+ * (see the xa_cmpxchg() implementation for an example).
+ *
+ * Return: If the slot already existed, returns the contents of this slot.
+ * If the slot was newly created, returns NULL. If it failed to create the
+ * slot, returns NULL and indicates the error in @xas.
+ */
+static void *xas_create(struct xa_state *xas)
+{
+ struct xarray *xa = xas->xa;
+ void *entry;
+ void __rcu **slot;
+ struct xa_node *node = xas->xa_node;
+ int shift;
+ unsigned int order = xas->xa_shift;
+
+ if (xas_top(node)) {
+ entry = xa_head_locked(xa);
+ xas->xa_node = NULL;
+ shift = xas_expand(xas, entry);
+ if (shift < 0)
+ return NULL;
+ entry = xa_head_locked(xa);
+ slot = &xa->xa_head;
+ } else if (xas_error(xas)) {
+ return NULL;
+ } else if (node) {
+ unsigned int offset = xas->xa_offset;
+
+ shift = node->shift;
+ entry = xa_entry_locked(xa, node, offset);
+ slot = &node->slots[offset];
+ } else {
+ shift = 0;
+ entry = xa_head_locked(xa);
+ slot = &xa->xa_head;
+ }
+
+ while (shift > order) {
+ shift -= XA_CHUNK_SHIFT;
+ if (!entry) {
+ node = xas_alloc(xas, shift);
+ if (!node)
+ break;
+ if (xa_track_free(xa))
+ node_mark_all(node, XA_FREE_MARK);
+ rcu_assign_pointer(*slot, xa_mk_node(node));
+ } else if (xa_is_node(entry)) {
+ node = xa_to_node(entry);
+ } else {
+ break;
+ }
+ entry = xas_descend(xas, node);
+ slot = &node->slots[xas->xa_offset];
+ }
+
+ return entry;
+}
+
+/**
+ * xas_create_range() - Ensure that stores to this range will succeed
+ * @xas: XArray operation state.
+ *
+ * Creates all of the slots in the range covered by @xas. Sets @xas to
+ * create single-index entries and positions it at the beginning of the
+ * range. This is for the benefit of users which have not yet been
+ * converted to use multi-index entries.
+ */
+void xas_create_range(struct xa_state *xas)
+{
+ unsigned long index = xas->xa_index;
+ unsigned char shift = xas->xa_shift;
+ unsigned char sibs = xas->xa_sibs;
+
+ xas->xa_index |= ((sibs + 1) << shift) - 1;
+ if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
+ xas->xa_offset |= sibs;
+ xas->xa_shift = 0;
+ xas->xa_sibs = 0;
+
+ for (;;) {
+ xas_create(xas);
+ if (xas_error(xas))
+ goto restore;
+ if (xas->xa_index <= (index | XA_CHUNK_MASK))
+ goto success;
+ xas->xa_index -= XA_CHUNK_SIZE;
+
+ for (;;) {
+ struct xa_node *node = xas->xa_node;
+ xas->xa_node = xa_parent_locked(xas->xa, node);
+ xas->xa_offset = node->offset - 1;
+ if (node->offset != 0)
+ break;
+ }
+ }
+
+restore:
+ xas->xa_shift = shift;
+ xas->xa_sibs = sibs;
+ xas->xa_index = index;
+ return;
+success:
+ xas->xa_index = index;
+ if (xas->xa_node)
+ xas_set_offset(xas);
+}
+EXPORT_SYMBOL_GPL(xas_create_range);
+
+static void update_node(struct xa_state *xas, struct xa_node *node,
+ int count, int values)
+{
+ if (!node || (!count && !values))
+ return;
+
+ node->count += count;
+ node->nr_values += values;
+ XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
+ XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
+ xas_update(xas, node);
+ if (count < 0)
+ xas_delete_node(xas);
+}
+
+/**
+ * xas_store() - Store this entry in the XArray.
+ * @xas: XArray operation state.
+ * @entry: New entry.
+ *
+ * If @xas is operating on a multi-index entry, the entry returned by this
+ * function is essentially meaningless (it may be an internal entry or it
+ * may be %NULL, even if there are non-NULL entries at some of the indices
+ * covered by the range). This is not a problem for any current users,
+ * and can be changed if needed.
+ *
+ * Return: The old entry at this index.
+ */
+void *xas_store(struct xa_state *xas, void *entry)
+{
+ struct xa_node *node;
+ void __rcu **slot = &xas->xa->xa_head;
+ unsigned int offset, max;
+ int count = 0;
+ int values = 0;
+ void *first, *next;
+ bool value = xa_is_value(entry);
+
+ if (entry)
+ first = xas_create(xas);
+ else
+ first = xas_load(xas);
+
+ if (xas_invalid(xas))
+ return first;
+ node = xas->xa_node;
+ if (node && (xas->xa_shift < node->shift))
+ xas->xa_sibs = 0;
+ if ((first == entry) && !xas->xa_sibs)
+ return first;
+
+ next = first;
+ offset = xas->xa_offset;
+ max = xas->xa_offset + xas->xa_sibs;
+ if (node) {
+ slot = &node->slots[offset];
+ if (xas->xa_sibs)
+ xas_squash_marks(xas);
+ }
+ if (!entry)
+ xas_init_marks(xas);
+
+ for (;;) {
+ /*
+ * Must clear the marks before setting the entry to NULL,
+ * otherwise xas_for_each_marked may find a NULL entry and
+ * stop early. rcu_assign_pointer contains a release barrier
+ * so the mark clearing will appear to happen before the
+ * entry is set to NULL.
+ */
+ rcu_assign_pointer(*slot, entry);
+ if (xa_is_node(next))
+ xas_free_nodes(xas, xa_to_node(next));
+ if (!node)
+ break;
+ count += !next - !entry;
+ values += !xa_is_value(first) - !value;
+ if (entry) {
+ if (offset == max)
+ break;
+ if (!xa_is_sibling(entry))
+ entry = xa_mk_sibling(xas->xa_offset);
+ } else {
+ if (offset == XA_CHUNK_MASK)
+ break;
+ }
+ next = xa_entry_locked(xas->xa, node, ++offset);
+ if (!xa_is_sibling(next)) {
+ if (!entry && (offset > max))
+ break;
+ first = next;
+ }
+ slot++;
+ }
+
+ update_node(xas, node, count, values);
+ return first;
+}
+EXPORT_SYMBOL_GPL(xas_store);
+
+/**
+ * xas_get_mark() - Returns the state of this mark.
+ * @xas: XArray operation state.
+ * @mark: Mark number.
+ *
+ * Return: true if the mark is set, false if the mark is clear or @xas
+ * is in an error state.
+ */
+bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
+{
+ if (xas_invalid(xas))
+ return false;
+ if (!xas->xa_node)
+ return xa_marked(xas->xa, mark);
+ return node_get_mark(xas->xa_node, xas->xa_offset, mark);
+}
+EXPORT_SYMBOL_GPL(xas_get_mark);
+
+/**
+ * xas_set_mark() - Sets the mark on this entry and its parents.
+ * @xas: XArray operation state.
+ * @mark: Mark number.
+ *
+ * Sets the specified mark on this entry, and walks up the tree setting it
+ * on all the ancestor entries. Does nothing if @xas has not been walked to
+ * an entry, or is in an error state.
+ */
+void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+
+ if (xas_invalid(xas))
+ return;
+
+ while (node) {
+ if (node_set_mark(node, offset, mark))
+ return;
+ offset = node->offset;
+ node = xa_parent_locked(xas->xa, node);
+ }
+
+ if (!xa_marked(xas->xa, mark))
+ xa_mark_set(xas->xa, mark);
+}
+EXPORT_SYMBOL_GPL(xas_set_mark);
+
+/**
+ * xas_clear_mark() - Clears the mark on this entry and its parents.
+ * @xas: XArray operation state.
+ * @mark: Mark number.
+ *
+ * Clears the specified mark on this entry, and walks back to the head
+ * attempting to clear it on all the ancestor entries. Does nothing if
+ * @xas has not been walked to an entry, or is in an error state.
+ */
+void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
+{
+ struct xa_node *node = xas->xa_node;
+ unsigned int offset = xas->xa_offset;
+
+ if (xas_invalid(xas))
+ return;
+
+ while (node) {
+ if (!node_clear_mark(node, offset, mark))
+ return;
+ if (node_any_mark(node, mark))
+ return;
+
+ offset = node->offset;
+ node = xa_parent_locked(xas->xa, node);
+ }
+
+ if (xa_marked(xas->xa, mark))
+ xa_mark_clear(xas->xa, mark);
+}
+EXPORT_SYMBOL_GPL(xas_clear_mark);
+
+/**
+ * xas_init_marks() - Initialise all marks for the entry
+ * @xas: Array operations state.
+ *
+ * Initialise all marks for the entry specified by @xas. If we're tracking
+ * free entries with a mark, we need to set it on all entries. All other
+ * marks are cleared.
+ *
+ * This implementation is not as efficient as it could be; we may walk
+ * up the tree multiple times.
+ */
+void xas_init_marks(const struct xa_state *xas)
+{
+ xa_mark_t mark = 0;
+
+ for (;;) {
+ if (xa_track_free(xas->xa) && mark == XA_FREE_MARK)
+ xas_set_mark(xas, mark);
+ else
+ xas_clear_mark(xas, mark);
+ if (mark == XA_MARK_MAX)
+ break;
+ mark_inc(mark);
+ }
+}
+EXPORT_SYMBOL_GPL(xas_init_marks);
+
+/**
+ * xas_pause() - Pause a walk to drop a lock.
+ * @xas: XArray operation state.
+ *
+ * Some users need to pause a walk and drop the lock they're holding in
+ * order to yield to a higher priority thread or carry out an operation
+ * on an entry. Those users should call this function before they drop
+ * the lock. It resets the @xas to be suitable for the next iteration
+ * of the loop after the user has reacquired the lock. If most entries
+ * found during a walk require you to call xas_pause(), the xa_for_each()
+ * iterator may be more appropriate.
+ *
+ * Note that xas_pause() only works for forward iteration. If a user needs
+ * to pause a reverse iteration, we will need a xas_pause_rev().
+ */
+void xas_pause(struct xa_state *xas)
+{
+ struct xa_node *node = xas->xa_node;
+
+ if (xas_invalid(xas))
+ return;
+
+ if (node) {
+ unsigned int offset = xas->xa_offset;
+ while (++offset < XA_CHUNK_SIZE) {
+ if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
+ break;
+ }
+ xas->xa_index += (offset - xas->xa_offset) << node->shift;
+ } else {
+ xas->xa_index++;
+ }
+ xas->xa_node = XAS_RESTART;
+}
+EXPORT_SYMBOL_GPL(xas_pause);
+
+/*
+ * __xas_prev() - Find the previous entry in the XArray.
+ * @xas: XArray operation state.
+ *
+ * Helper function for xas_prev() which handles all the complex cases
+ * out of line.
+ */
+void *__xas_prev(struct xa_state *xas)
+{
+ void *entry;
+
+ if (!xas_frozen(xas->xa_node))
+ xas->xa_index--;
+ if (xas_not_node(xas->xa_node))
+ return xas_load(xas);
+
+ if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
+ xas->xa_offset--;
+
+ while (xas->xa_offset == 255) {
+ xas->xa_offset = xas->xa_node->offset - 1;
+ xas->xa_node = xa_parent(xas->xa, xas->xa_node);
+ if (!xas->xa_node)
+ return set_bounds(xas);
+ }
+
+ for (;;) {
+ entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+ if (!xa_is_node(entry))
+ return entry;
+
+ xas->xa_node = xa_to_node(entry);
+ xas_set_offset(xas);
+ }
+}
+EXPORT_SYMBOL_GPL(__xas_prev);
+
+/*
+ * __xas_next() - Find the next entry in the XArray.
+ * @xas: XArray operation state.
+ *
+ * Helper function for xas_next() which handles all the complex cases
+ * out of line.
+ */
+void *__xas_next(struct xa_state *xas)
+{
+ void *entry;
+
+ if (!xas_frozen(xas->xa_node))
+ xas->xa_index++;
+ if (xas_not_node(xas->xa_node))
+ return xas_load(xas);
+
+ if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
+ xas->xa_offset++;
+
+ while (xas->xa_offset == XA_CHUNK_SIZE) {
+ xas->xa_offset = xas->xa_node->offset + 1;
+ xas->xa_node = xa_parent(xas->xa, xas->xa_node);
+ if (!xas->xa_node)
+ return set_bounds(xas);
+ }
+
+ for (;;) {
+ entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+ if (!xa_is_node(entry))
+ return entry;
+
+ xas->xa_node = xa_to_node(entry);
+ xas_set_offset(xas);
+ }
+}
+EXPORT_SYMBOL_GPL(__xas_next);
+
+/**
+ * xas_find() - Find the next present entry in the XArray.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ *
+ * If the @xas has not yet been walked to an entry, return the entry
+ * which has an index >= xas.xa_index. If it has been walked, the entry
+ * currently being pointed at has been processed, and so we move to the
+ * next entry.
+ *
+ * If no entry is found and the array is smaller than @max, the iterator
+ * is set to the smallest index not yet in the array. This allows @xas
+ * to be immediately passed to xas_store().
+ *
+ * Return: The entry, if found, otherwise %NULL.
+ */
+void *xas_find(struct xa_state *xas, unsigned long max)
+{
+ void *entry;
+
+ if (xas_error(xas))
+ return NULL;
+
+ if (!xas->xa_node) {
+ xas->xa_index = 1;
+ return set_bounds(xas);
+ } else if (xas_top(xas->xa_node)) {
+ entry = xas_load(xas);
+ if (entry || xas_not_node(xas->xa_node))
+ return entry;
+ } else if (!xas->xa_node->shift &&
+ xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
+ xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
+ }
+
+ xas_advance(xas);
+
+ while (xas->xa_node && (xas->xa_index <= max)) {
+ if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
+ xas->xa_offset = xas->xa_node->offset + 1;
+ xas->xa_node = xa_parent(xas->xa, xas->xa_node);
+ continue;
+ }
+
+ entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+ if (xa_is_node(entry)) {
+ xas->xa_node = xa_to_node(entry);
+ xas->xa_offset = 0;
+ continue;
+ }
+ if (entry && !xa_is_sibling(entry))
+ return entry;
+
+ xas_advance(xas);
+ }
+
+ if (!xas->xa_node)
+ xas->xa_node = XAS_BOUNDS;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xas_find);
+
+/**
+ * xas_find_marked() - Find the next marked entry in the XArray.
+ * @xas: XArray operation state.
+ * @max: Highest index to return.
+ * @mark: Mark number to search for.
+ *
+ * If the @xas has not yet been walked to an entry, return the marked entry
+ * which has an index >= xas.xa_index. If it has been walked, the entry
+ * currently being pointed at has been processed, and so we return the
+ * first marked entry with an index > xas.xa_index.
+ *
+ * If no marked entry is found and the array is smaller than @max, @xas is
+ * set to the bounds state and xas->xa_index is set to the smallest index
+ * not yet in the array. This allows @xas to be immediately passed to
+ * xas_store().
+ *
+ * If no entry is found before @max is reached, @xas is set to the restart
+ * state.
+ *
+ * Return: The entry, if found, otherwise %NULL.
+ */
+void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
+{
+ bool advance = true;
+ unsigned int offset;
+ void *entry;
+
+ if (xas_error(xas))
+ return NULL;
+
+ if (!xas->xa_node) {
+ xas->xa_index = 1;
+ goto out;
+ } else if (xas_top(xas->xa_node)) {
+ advance = false;
+ entry = xa_head(xas->xa);
+ xas->xa_node = NULL;
+ if (xas->xa_index > max_index(entry))
+ goto bounds;
+ if (!xa_is_node(entry)) {
+ if (xa_marked(xas->xa, mark))
+ return entry;
+ xas->xa_index = 1;
+ goto out;
+ }
+ xas->xa_node = xa_to_node(entry);
+ xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
+ }
+
+ while (xas->xa_index <= max) {
+ if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
+ xas->xa_offset = xas->xa_node->offset + 1;
+ xas->xa_node = xa_parent(xas->xa, xas->xa_node);
+ if (!xas->xa_node)
+ break;
+ advance = false;
+ continue;
+ }
+
+ if (!advance) {
+ entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+ if (xa_is_sibling(entry)) {
+ xas->xa_offset = xa_to_sibling(entry);
+ xas_move_index(xas, xas->xa_offset);
+ }
+ }
+
+ offset = xas_find_chunk(xas, advance, mark);
+ if (offset > xas->xa_offset) {
+ advance = false;
+ xas_move_index(xas, offset);
+ /* Mind the wrap */
+ if ((xas->xa_index - 1) >= max)
+ goto max;
+ xas->xa_offset = offset;
+ if (offset == XA_CHUNK_SIZE)
+ continue;
+ }
+
+ entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+ if (!xa_is_node(entry))
+ return entry;
+ xas->xa_node = xa_to_node(entry);
+ xas_set_offset(xas);
+ }
+
+out:
+ if (!max)
+ goto max;
+bounds:
+ xas->xa_node = XAS_BOUNDS;
+ return NULL;
+max:
+ xas->xa_node = XAS_RESTART;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xas_find_marked);
+
+/**
+ * xas_find_conflict() - Find the next present entry in a range.
+ * @xas: XArray operation state.
+ *
+ * The @xas describes both a range and a position within that range.
+ *
+ * Context: Any context. Expects xa_lock to be held.
+ * Return: The next entry in the range covered by @xas or %NULL.
+ */
+void *xas_find_conflict(struct xa_state *xas)
+{
+ void *curr;
+
+ if (xas_error(xas))
+ return NULL;
+
+ if (!xas->xa_node)
+ return NULL;
+
+ if (xas_top(xas->xa_node)) {
+ curr = xas_start(xas);
+ if (!curr)
+ return NULL;
+ while (xa_is_node(curr)) {
+ struct xa_node *node = xa_to_node(curr);
+ curr = xas_descend(xas, node);
+ }
+ if (curr)
+ return curr;
+ }
+
+ if (xas->xa_node->shift > xas->xa_shift)
+ return NULL;
+
+ for (;;) {
+ if (xas->xa_node->shift == xas->xa_shift) {
+ if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
+ break;
+ } else if (xas->xa_offset == XA_CHUNK_MASK) {
+ xas->xa_offset = xas->xa_node->offset;
+ xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
+ if (!xas->xa_node)
+ break;
+ continue;
+ }
+ curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
+ if (xa_is_sibling(curr))
+ continue;
+ while (xa_is_node(curr)) {
+ xas->xa_node = xa_to_node(curr);
+ xas->xa_offset = 0;
+ curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
+ }
+ if (curr)
+ return curr;
+ }
+ xas->xa_offset -= xas->xa_sibs;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xas_find_conflict);
+
+/**
+ * xa_init_flags() - Initialise an empty XArray with flags.
+ * @xa: XArray.
+ * @flags: XA_FLAG values.
+ *
+ * If you need to initialise an XArray with special flags (eg you need
+ * to take the lock from interrupt context), use this function instead
+ * of xa_init().
+ *
+ * Context: Any context.
+ */
+void xa_init_flags(struct xarray *xa, gfp_t flags)
+{
+ unsigned int lock_type;
+ static struct lock_class_key xa_lock_irq;
+ static struct lock_class_key xa_lock_bh;
+
+ spin_lock_init(&xa->xa_lock);
+ xa->xa_flags = flags;
+ xa->xa_head = NULL;
+
+ lock_type = xa_lock_type(xa);
+ if (lock_type == XA_LOCK_IRQ)
+ lockdep_set_class(&xa->xa_lock, &xa_lock_irq);
+ else if (lock_type == XA_LOCK_BH)
+ lockdep_set_class(&xa->xa_lock, &xa_lock_bh);
+}
+EXPORT_SYMBOL(xa_init_flags);
+
+/**
+ * xa_load() - Load an entry from an XArray.
+ * @xa: XArray.
+ * @index: index into array.
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ * Return: The entry at @index in @xa.
+ */
+void *xa_load(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (xa_is_zero(entry))
+ entry = NULL;
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+
+ return entry;
+}
+EXPORT_SYMBOL(xa_load);
+
+static void *xas_result(struct xa_state *xas, void *curr)
+{
+ if (xa_is_zero(curr))
+ return NULL;
+ XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr));
+ if (xas_error(xas))
+ curr = xas->xa_node;
+ return curr;
+}
+
+/**
+ * __xa_erase() - Erase this entry from the XArray while locked.
+ * @xa: XArray.
+ * @index: Index into array.
+ *
+ * If the entry at this index is a multi-index entry then all indices will
+ * be erased, and the entry will no longer be a multi-index entry.
+ * This function expects the xa_lock to be held on entry.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry. May
+ * release and reacquire xa_lock if @gfp flags permit.
+ * Return: The old entry at this index.
+ */
+void *__xa_erase(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ return xas_result(&xas, xas_store(&xas, NULL));
+}
+EXPORT_SYMBOL_GPL(__xa_erase);
+
+/**
+ * xa_store() - Store this entry in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * After this function returns, loads from this index will return @entry.
+ * Storing into an existing multislot entry updates the entry of every index.
+ * The marks associated with @index are unaffected unless @entry is %NULL.
+ *
+ * Context: Process context. Takes and releases the xa_lock. May sleep
+ * if the @gfp flags permit.
+ * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
+ * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
+ * failed.
+ */
+void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, index);
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return XA_ERROR(-EINVAL);
+
+ do {
+ xas_lock(&xas);
+ curr = xas_store(&xas, entry);
+ if (xa_track_free(xa) && entry)
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ return xas_result(&xas, curr);
+}
+EXPORT_SYMBOL(xa_store);
+
+/**
+ * __xa_store() - Store this entry in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * You must already be holding the xa_lock when calling this function.
+ * It will drop the lock if needed to allocate memory, and then reacquire
+ * it afterwards.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry. May
+ * release and reacquire xa_lock if @gfp flags permit.
+ * Return: The old entry at this index or xa_err() if an error happened.
+ */
+void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, index);
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return XA_ERROR(-EINVAL);
+
+ do {
+ curr = xas_store(&xas, entry);
+ if (xa_track_free(xa) && entry)
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ } while (__xas_nomem(&xas, gfp));
+
+ return xas_result(&xas, curr);
+}
+EXPORT_SYMBOL(__xa_store);
+
+/**
+ * xa_cmpxchg() - Conditionally replace an entry in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @old: Old value to test against.
+ * @entry: New value to place in array.
+ * @gfp: Memory allocation flags.
+ *
+ * If the entry at @index is the same as @old, replace it with @entry.
+ * If the return value is equal to @old, then the exchange was successful.
+ *
+ * Context: Process context. Takes and releases the xa_lock. May sleep
+ * if the @gfp flags permit.
+ * Return: The old value at this index or xa_err() if an error happened.
+ */
+void *xa_cmpxchg(struct xarray *xa, unsigned long index,
+ void *old, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, index);
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return XA_ERROR(-EINVAL);
+
+ do {
+ xas_lock(&xas);
+ curr = xas_load(&xas);
+ if (curr == XA_ZERO_ENTRY)
+ curr = NULL;
+ if (curr == old) {
+ xas_store(&xas, entry);
+ if (xa_track_free(xa) && entry)
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ return xas_result(&xas, curr);
+}
+EXPORT_SYMBOL(xa_cmpxchg);
+
+/**
+ * __xa_cmpxchg() - Store this entry in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @old: Old value to test against.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * You must already be holding the xa_lock when calling this function.
+ * It will drop the lock if needed to allocate memory, and then reacquire
+ * it afterwards.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry. May
+ * release and reacquire xa_lock if @gfp flags permit.
+ * Return: The old entry at this index or xa_err() if an error happened.
+ */
+void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
+ void *old, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, index);
+ void *curr;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return XA_ERROR(-EINVAL);
+
+ do {
+ curr = xas_load(&xas);
+ if (curr == XA_ZERO_ENTRY)
+ curr = NULL;
+ if (curr == old) {
+ xas_store(&xas, entry);
+ if (xa_track_free(xa) && entry)
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ }
+ } while (__xas_nomem(&xas, gfp));
+
+ return xas_result(&xas, curr);
+}
+EXPORT_SYMBOL(__xa_cmpxchg);
+
+/**
+ * xa_reserve() - Reserve this index in the XArray.
+ * @xa: XArray.
+ * @index: Index into array.
+ * @gfp: Memory allocation flags.
+ *
+ * Ensures there is somewhere to store an entry at @index in the array.
+ * If there is already something stored at @index, this function does
+ * nothing. If there was nothing there, the entry is marked as reserved.
+ * Loads from @index will continue to see a %NULL pointer until a
+ * subsequent store to @index.
+ *
+ * If you do not use the entry that you have reserved, call xa_release()
+ * or xa_erase() to free any unnecessary memory.
+ *
+ * Context: Process context. Takes and releases the xa_lock, IRQ or BH safe
+ * if specified in XArray flags. May sleep if the @gfp flags permit.
+ * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
+ */
+int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
+{
+ XA_STATE(xas, xa, index);
+ unsigned int lock_type = xa_lock_type(xa);
+ void *curr;
+
+ do {
+ xas_lock_type(&xas, lock_type);
+ curr = xas_load(&xas);
+ if (!curr)
+ xas_store(&xas, XA_ZERO_ENTRY);
+ xas_unlock_type(&xas, lock_type);
+ } while (xas_nomem(&xas, gfp));
+
+ return xas_error(&xas);
+}
+EXPORT_SYMBOL(xa_reserve);
+
+#ifdef CONFIG_XARRAY_MULTI
+static void xas_set_range(struct xa_state *xas, unsigned long first,
+ unsigned long last)
+{
+ unsigned int shift = 0;
+ unsigned long sibs = last - first;
+ unsigned int offset = XA_CHUNK_MASK;
+
+ xas_set(xas, first);
+
+ while ((first & XA_CHUNK_MASK) == 0) {
+ if (sibs < XA_CHUNK_MASK)
+ break;
+ if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
+ break;
+ shift += XA_CHUNK_SHIFT;
+ if (offset == XA_CHUNK_MASK)
+ offset = sibs & XA_CHUNK_MASK;
+ sibs >>= XA_CHUNK_SHIFT;
+ first >>= XA_CHUNK_SHIFT;
+ }
+
+ offset = first & XA_CHUNK_MASK;
+ if (offset + sibs > XA_CHUNK_MASK)
+ sibs = XA_CHUNK_MASK - offset;
+ if ((((first + sibs + 1) << shift) - 1) > last)
+ sibs -= 1;
+
+ xas->xa_shift = shift;
+ xas->xa_sibs = sibs;
+}
+
+/**
+ * xa_store_range() - Store this entry at a range of indices in the XArray.
+ * @xa: XArray.
+ * @first: First index to affect.
+ * @last: Last index to affect.
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * After this function returns, loads from any index between @first and @last,
+ * inclusive will return @entry.
+ * Storing into an existing multislot entry updates the entry of every index.
+ * The marks associated with @index are unaffected unless @entry is %NULL.
+ *
+ * Context: Process context. Takes and releases the xa_lock. May sleep
+ * if the @gfp flags permit.
+ * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
+ * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
+ */
+void *xa_store_range(struct xarray *xa, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, 0);
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return XA_ERROR(-EINVAL);
+ if (last < first)
+ return XA_ERROR(-EINVAL);
+
+ do {
+ xas_lock(&xas);
+ if (entry) {
+ unsigned int order = (last == ~0UL) ? 64 :
+ ilog2(last + 1);
+ xas_set_order(&xas, last, order);
+ xas_create(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ }
+ do {
+ xas_set_range(&xas, first, last);
+ xas_store(&xas, entry);
+ if (xas_error(&xas))
+ goto unlock;
+ first += xas_size(&xas);
+ } while (first <= last);
+unlock:
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ return xas_result(&xas, NULL);
+}
+EXPORT_SYMBOL(xa_store_range);
+#endif /* CONFIG_XARRAY_MULTI */
+
+/**
+ * __xa_alloc() - Find somewhere to store this entry in the XArray.
+ * @xa: XArray.
+ * @id: Pointer to ID.
+ * @max: Maximum ID to allocate (inclusive).
+ * @entry: New entry.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates an unused ID in the range specified by @id and @max.
+ * Updates the @id pointer with the index, then stores the entry at that
+ * index. A concurrent lookup will not see an uninitialised @id.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry. May
+ * release and reacquire xa_lock if @gfp flags permit.
+ * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if
+ * there is no more space in the XArray.
+ */
+int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp)
+{
+ XA_STATE(xas, xa, 0);
+ int err;
+
+ if (WARN_ON_ONCE(xa_is_internal(entry)))
+ return -EINVAL;
+ if (WARN_ON_ONCE(!xa_track_free(xa)))
+ return -EINVAL;
+
+ if (!entry)
+ entry = XA_ZERO_ENTRY;
+
+ do {
+ xas.xa_index = *id;
+ xas_find_marked(&xas, max, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -ENOSPC);
+ xas_store(&xas, entry);
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ } while (__xas_nomem(&xas, gfp));
+
+ err = xas_error(&xas);
+ if (!err)
+ *id = xas.xa_index;
+ return err;
+}
+EXPORT_SYMBOL(__xa_alloc);
+
+/**
+ * __xa_set_mark() - Set this mark on this entry while locked.
+ * @xa: XArray.
+ * @index: Index of entry.
+ * @mark: Mark number.
+ *
+ * Attempting to set a mark on a NULL entry does not succeed.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry.
+ */
+void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
+{
+ XA_STATE(xas, xa, index);
+ void *entry = xas_load(&xas);
+
+ if (entry)
+ xas_set_mark(&xas, mark);
+}
+EXPORT_SYMBOL_GPL(__xa_set_mark);
+
+/**
+ * __xa_clear_mark() - Clear this mark on this entry while locked.
+ * @xa: XArray.
+ * @index: Index of entry.
+ * @mark: Mark number.
+ *
+ * Context: Any context. Expects xa_lock to be held on entry.
+ */
+void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
+{
+ XA_STATE(xas, xa, index);
+ void *entry = xas_load(&xas);
+
+ if (entry)
+ xas_clear_mark(&xas, mark);
+}
+EXPORT_SYMBOL_GPL(__xa_clear_mark);
+
+/**
+ * xa_get_mark() - Inquire whether this mark is set on this entry.
+ * @xa: XArray.
+ * @index: Index of entry.
+ * @mark: Mark number.
+ *
+ * This function uses the RCU read lock, so the result may be out of date
+ * by the time it returns. If you need the result to be stable, use a lock.
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ * Return: True if the entry at @index has this mark set, false if it doesn't.
+ */
+bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
+{
+ XA_STATE(xas, xa, index);
+ void *entry;
+
+ rcu_read_lock();
+ entry = xas_start(&xas);
+ while (xas_get_mark(&xas, mark)) {
+ if (!xa_is_node(entry))
+ goto found;
+ entry = xas_descend(&xas, xa_to_node(entry));
+ }
+ rcu_read_unlock();
+ return false;
+ found:
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(xa_get_mark);
+
+/**
+ * xa_set_mark() - Set this mark on this entry.
+ * @xa: XArray.
+ * @index: Index of entry.
+ * @mark: Mark number.
+ *
+ * Attempting to set a mark on a NULL entry does not succeed.
+ *
+ * Context: Process context. Takes and releases the xa_lock.
+ */
+void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
+{
+ xa_lock(xa);
+ __xa_set_mark(xa, index, mark);
+ xa_unlock(xa);
+}
+EXPORT_SYMBOL(xa_set_mark);
+
+/**
+ * xa_clear_mark() - Clear this mark on this entry.
+ * @xa: XArray.
+ * @index: Index of entry.
+ * @mark: Mark number.
+ *
+ * Clearing a mark always succeeds.
+ *
+ * Context: Process context. Takes and releases the xa_lock.
+ */
+void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
+{
+ xa_lock(xa);
+ __xa_clear_mark(xa, index, mark);
+ xa_unlock(xa);
+}
+EXPORT_SYMBOL(xa_clear_mark);
+
+/**
+ * xa_find() - Search the XArray for an entry.
+ * @xa: XArray.
+ * @indexp: Pointer to an index.
+ * @max: Maximum index to search to.
+ * @filter: Selection criterion.
+ *
+ * Finds the entry in @xa which matches the @filter, and has the lowest
+ * index that is at least @indexp and no more than @max.
+ * If an entry is found, @indexp is updated to be the index of the entry.
+ * This function is protected by the RCU read lock, so it may not find
+ * entries which are being simultaneously added. It will not return an
+ * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ * Return: The entry, if found, otherwise %NULL.
+ */
+void *xa_find(struct xarray *xa, unsigned long *indexp,
+ unsigned long max, xa_mark_t filter)
+{
+ XA_STATE(xas, xa, *indexp);
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ if ((__force unsigned int)filter < XA_MAX_MARKS)
+ entry = xas_find_marked(&xas, max, filter);
+ else
+ entry = xas_find(&xas, max);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+
+ if (entry)
+ *indexp = xas.xa_index;
+ return entry;
+}
+EXPORT_SYMBOL(xa_find);
+
+/**
+ * xa_find_after() - Search the XArray for a present entry.
+ * @xa: XArray.
+ * @indexp: Pointer to an index.
+ * @max: Maximum index to search to.
+ * @filter: Selection criterion.
+ *
+ * Finds the entry in @xa which matches the @filter and has the lowest
+ * index that is above @indexp and no more than @max.
+ * If an entry is found, @indexp is updated to be the index of the entry.
+ * This function is protected by the RCU read lock, so it may miss entries
+ * which are being simultaneously added. It will not return an
+ * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ * Return: The pointer, if found, otherwise %NULL.
+ */
+void *xa_find_after(struct xarray *xa, unsigned long *indexp,
+ unsigned long max, xa_mark_t filter)
+{
+ XA_STATE(xas, xa, *indexp + 1);
+ void *entry;
+
+ rcu_read_lock();
+ for (;;) {
+ if ((__force unsigned int)filter < XA_MAX_MARKS)
+ entry = xas_find_marked(&xas, max, filter);
+ else
+ entry = xas_find(&xas, max);
+ if (xas.xa_shift) {
+ if (xas.xa_index & ((1UL << xas.xa_shift) - 1))
+ continue;
+ } else {
+ if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK))
+ continue;
+ }
+ if (!xas_retry(&xas, entry))
+ break;
+ }
+ rcu_read_unlock();
+
+ if (entry)
+ *indexp = xas.xa_index;
+ return entry;
+}
+EXPORT_SYMBOL(xa_find_after);
+
+static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
+ unsigned long max, unsigned int n)
+{
+ void *entry;
+ unsigned int i = 0;
+
+ rcu_read_lock();
+ xas_for_each(xas, entry, max) {
+ if (xas_retry(xas, entry))
+ continue;
+ dst[i++] = entry;
+ if (i == n)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+
+static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
+ unsigned long max, unsigned int n, xa_mark_t mark)
+{
+ void *entry;
+ unsigned int i = 0;
+
+ rcu_read_lock();
+ xas_for_each_marked(xas, entry, max, mark) {
+ if (xas_retry(xas, entry))
+ continue;
+ dst[i++] = entry;
+ if (i == n)
+ break;
+ }
+ rcu_read_unlock();
+
+ return i;
+}
+
+/**
+ * xa_extract() - Copy selected entries from the XArray into a normal array.
+ * @xa: The source XArray to copy from.
+ * @dst: The buffer to copy entries into.
+ * @start: The first index in the XArray eligible to be selected.
+ * @max: The last index in the XArray eligible to be selected.
+ * @n: The maximum number of entries to copy.
+ * @filter: Selection criterion.
+ *
+ * Copies up to @n entries that match @filter from the XArray. The
+ * copied entries will have indices between @start and @max, inclusive.
+ *
+ * The @filter may be an XArray mark value, in which case entries which are
+ * marked with that mark will be copied. It may also be %XA_PRESENT, in
+ * which case all entries which are not NULL will be copied.
+ *
+ * The entries returned may not represent a snapshot of the XArray at a
+ * moment in time. For example, if another thread stores to index 5, then
+ * index 10, calling xa_extract() may return the old contents of index 5
+ * and the new contents of index 10. Indices not modified while this
+ * function is running will not be skipped.
+ *
+ * If you need stronger guarantees, holding the xa_lock across calls to this
+ * function will prevent concurrent modification.
+ *
+ * Context: Any context. Takes and releases the RCU lock.
+ * Return: The number of entries copied.
+ */
+unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
+ unsigned long max, unsigned int n, xa_mark_t filter)
+{
+ XA_STATE(xas, xa, start);
+
+ if (!n)
+ return 0;
+
+ if ((__force unsigned int)filter < XA_MAX_MARKS)
+ return xas_extract_marked(&xas, dst, max, n, filter);
+ return xas_extract_present(&xas, dst, max, n);
+}
+EXPORT_SYMBOL(xa_extract);
+
+/**
+ * xa_destroy() - Free all internal data structures.
+ * @xa: XArray.
+ *
+ * After calling this function, the XArray is empty and has freed all memory
+ * allocated for its internal data structures. You are responsible for
+ * freeing the objects referenced by the XArray.
+ *
+ * Context: Any context. Takes and releases the xa_lock, interrupt-safe.
+ */
+void xa_destroy(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+ unsigned long flags;
+ void *entry;
+
+ xas.xa_node = NULL;
+ xas_lock_irqsave(&xas, flags);
+ entry = xa_head_locked(xa);
+ RCU_INIT_POINTER(xa->xa_head, NULL);
+ xas_init_marks(&xas);
+ /* lockdep checks we're still holding the lock in xas_free_nodes() */
+ if (xa_is_node(entry))
+ xas_free_nodes(&xas, xa_to_node(entry));
+ xas_unlock_irqrestore(&xas, flags);
+}
+EXPORT_SYMBOL(xa_destroy);
+
+#ifdef XA_DEBUG
+void xa_dump_node(const struct xa_node *node)
+{
+ unsigned i, j;
+
+ if (!node)
+ return;
+ if ((unsigned long)node & 3) {
+ pr_cont("node %px\n", node);
+ return;
+ }
+
+ pr_cont("node %px %s %d parent %px shift %d count %d values %d "
+ "array %px list %px %px marks",
+ node, node->parent ? "offset" : "max", node->offset,
+ node->parent, node->shift, node->count, node->nr_values,
+ node->array, node->private_list.prev, node->private_list.next);
+ for (i = 0; i < XA_MAX_MARKS; i++)
+ for (j = 0; j < XA_MARK_LONGS; j++)
+ pr_cont(" %lx", node->marks[i][j]);
+ pr_cont("\n");
+}
+
+void xa_dump_index(unsigned long index, unsigned int shift)
+{
+ if (!shift)
+ pr_info("%lu: ", index);
+ else if (shift >= BITS_PER_LONG)
+ pr_info("0-%lu: ", ~0UL);
+ else
+ pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
+}
+
+void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
+{
+ if (!entry)
+ return;
+
+ xa_dump_index(index, shift);
+
+ if (xa_is_node(entry)) {
+ if (shift == 0) {
+ pr_cont("%px\n", entry);
+ } else {
+ unsigned long i;
+ struct xa_node *node = xa_to_node(entry);
+ xa_dump_node(node);
+ for (i = 0; i < XA_CHUNK_SIZE; i++)
+ xa_dump_entry(node->slots[i],
+ index + (i << node->shift), node->shift);
+ }
+ } else if (xa_is_value(entry))
+ pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
+ xa_to_value(entry), entry);
+ else if (!xa_is_internal(entry))
+ pr_cont("%px\n", entry);
+ else if (xa_is_retry(entry))
+ pr_cont("retry (%ld)\n", xa_to_internal(entry));
+ else if (xa_is_sibling(entry))
+ pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
+ else if (xa_is_zero(entry))
+ pr_cont("zero (%ld)\n", xa_to_internal(entry));
+ else
+ pr_cont("UNKNOWN ENTRY (%px)\n", entry);
+}
+
+void xa_dump(const struct xarray *xa)
+{
+ void *entry = xa->xa_head;
+ unsigned int shift = 0;
+
+ pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry,
+ xa->xa_flags, xa_marked(xa, XA_MARK_0),
+ xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));
+ if (xa_is_node(entry))
+ shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT;
+ xa_dump_entry(entry, 0, shift);
+}
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index de64ea658716..02301a89089e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -379,7 +379,7 @@ config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
- select RADIX_TREE_MULTIORDER
+ select XARRAY_MULTI
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -671,7 +671,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
depends on ARCH_HAS_ZONE_DEVICE
- select RADIX_TREE_MULTIORDER
+ select XARRAY_MULTI
help
Device memory hotplug support allows for establishing pmem,
diff --git a/mm/filemap.c b/mm/filemap.c
index 3968da1f7f5a..218d0b2ec82d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -113,60 +113,26 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
-static int page_cache_tree_insert(struct address_space *mapping,
- struct page *page, void **shadowp)
-{
- struct radix_tree_node *node;
- void **slot;
- int error;
-
- error = __radix_tree_create(&mapping->i_pages, page->index, 0,
- &node, &slot);
- if (error)
- return error;
- if (*slot) {
- void *p;
-
- p = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (!radix_tree_exceptional_entry(p))
- return -EEXIST;
-
- mapping->nrexceptional--;
- if (shadowp)
- *shadowp = p;
- }
- __radix_tree_replace(&mapping->i_pages, node, slot, page,
- workingset_lookup_update(mapping));
- mapping->nrpages++;
- return 0;
-}
-
-static void page_cache_tree_delete(struct address_space *mapping,
+static void page_cache_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr;
+ XA_STATE(xas, &mapping->i_pages, page->index);
+ unsigned int nr = 1;
+
+ mapping_set_update(&xas, mapping);
- /* hugetlb pages are represented by one entry in the radix tree */
- nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ /* hugetlb pages are represented by a single entry in the xarray */
+ if (!PageHuge(page)) {
+ xas_set_order(&xas, page->index, compound_order(page));
+ nr = 1U << compound_order(page);
+ }
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(nr != 1 && shadow, page);
- for (i = 0; i < nr; i++) {
- struct radix_tree_node *node;
- void **slot;
-
- __radix_tree_lookup(&mapping->i_pages, page->index + i,
- &node, &slot);
-
- VM_BUG_ON_PAGE(!node && nr != 1, page);
-
- radix_tree_clear_tags(&mapping->i_pages, node, slot);
- __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
- workingset_lookup_update(mapping));
- }
+ xas_store(&xas, shadow);
+ xas_init_marks(&xas);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
@@ -265,7 +231,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
trace_mm_filemap_delete_from_page_cache(page);
unaccount_page_cache_page(mapping, page);
- page_cache_tree_delete(mapping, page, shadow);
+ page_cache_delete(mapping, page, shadow);
}
static void page_cache_free_page(struct address_space *mapping,
@@ -308,7 +274,7 @@ void delete_from_page_cache(struct page *page)
EXPORT_SYMBOL(delete_from_page_cache);
/*
- * page_cache_tree_delete_batch - delete several pages from page cache
+ * page_cache_delete_batch - delete several pages from page cache
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
@@ -321,24 +287,19 @@ EXPORT_SYMBOL(delete_from_page_cache);
*
* The function expects the i_pages lock to be held.
*/
-static void
-page_cache_tree_delete_batch(struct address_space *mapping,
+static void page_cache_delete_batch(struct address_space *mapping,
struct pagevec *pvec)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
int i = 0, tail_pages = 0;
struct page *page;
- pgoff_t start;
- start = pvec->pages[0]->index;
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+ mapping_set_update(&xas, mapping);
+ xas_for_each(&xas, page, ULONG_MAX) {
if (i >= pagevec_count(pvec) && !tail_pages)
break;
- page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
if (!tail_pages) {
/*
@@ -346,8 +307,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
* have our pages locked so they are protected from
* being removed.
*/
- if (page != pvec->pages[i])
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index >
+ pvec->pages[i]->index, page);
continue;
+ }
WARN_ON_ONCE(!PageLocked(page));
if (PageTransHuge(page) && !PageHuge(page))
tail_pages = HPAGE_PMD_NR - 1;
@@ -358,11 +322,11 @@ page_cache_tree_delete_batch(struct address_space *mapping,
*/
i++;
} else {
+ VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
+ != pvec->pages[i]->index, page);
tail_pages--;
}
- radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
- __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
- workingset_lookup_update(mapping));
+ xas_store(&xas, NULL);
total_pages++;
}
mapping->nrpages -= total_pages;
@@ -383,7 +347,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
- page_cache_tree_delete_batch(mapping, pvec);
+ page_cache_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++)
@@ -493,20 +457,31 @@ EXPORT_SYMBOL(filemap_flush);
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- pgoff_t index = start_byte >> PAGE_SHIFT;
- pgoff_t end = end_byte >> PAGE_SHIFT;
struct page *page;
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
if (end_byte < start_byte)
return false;
- if (mapping->nrpages == 0)
- return false;
+ rcu_read_lock();
+ for (;;) {
+ page = xas_find(&xas, max);
+ if (xas_retry(&xas, page))
+ continue;
+ /* Shadow entries don't count */
+ if (xa_is_value(page))
+ continue;
+ /*
+ * We don't need to try to pin this page; we're about to
+ * release the RCU lock anyway. It is enough to know that
+ * there was a page here recently.
+ */
+ break;
+ }
+ rcu_read_unlock();
- if (!find_get_pages_range(mapping, &index, end, 1, &page))
- return false;
- put_page(page);
- return true;
+ return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -777,51 +752,44 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* locked. This function does not add the new page to the LRU, the
* caller must do that.
*
- * The remove + add is atomic. The only way this function can fail is
- * memory allocation failure.
+ * The remove + add is atomic. This function cannot fail.
*/
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
- int error;
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ pgoff_t offset = old->index;
+ XA_STATE(xas, &mapping->i_pages, offset);
+ unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
- error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (!error) {
- struct address_space *mapping = old->mapping;
- void (*freepage)(struct page *);
- unsigned long flags;
-
- pgoff_t offset = old->index;
- freepage = mapping->a_ops->freepage;
-
- get_page(new);
- new->mapping = mapping;
- new->index = offset;
+ get_page(new);
+ new->mapping = mapping;
+ new->index = offset;
- xa_lock_irqsave(&mapping->i_pages, flags);
- __delete_from_page_cache(old, NULL);
- error = page_cache_tree_insert(mapping, new, NULL);
- BUG_ON(error);
+ xas_lock_irqsave(&xas, flags);
+ xas_store(&xas, new);
- /*
- * hugetlb pages do not participate in page cache accounting.
- */
- if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
- mem_cgroup_migrate(old, new);
- radix_tree_preload_end();
- if (freepage)
- freepage(old);
- put_page(old);
- }
+ old->mapping = NULL;
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(old))
+ __dec_node_page_state(new, NR_FILE_PAGES);
+ if (!PageHuge(new))
+ __inc_node_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(old))
+ __dec_node_page_state(new, NR_SHMEM);
+ if (PageSwapBacked(new))
+ __inc_node_page_state(new, NR_SHMEM);
+ xas_unlock_irqrestore(&xas, flags);
+ mem_cgroup_migrate(old, new);
+ if (freepage)
+ freepage(old);
+ put_page(old);
- return error;
+ return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
@@ -830,12 +798,15 @@ static int __add_to_page_cache_locked(struct page *page,
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
+ XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
struct mem_cgroup *memcg;
int error;
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ mapping_set_update(&xas, mapping);
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
@@ -844,39 +815,47 @@ static int __add_to_page_cache_locked(struct page *page,
return error;
}
- error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (error) {
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
- xa_lock_irq(&mapping->i_pages);
- error = page_cache_tree_insert(mapping, page, shadowp);
- radix_tree_preload_end();
- if (unlikely(error))
- goto err_insert;
+ do {
+ xas_lock_irq(&xas);
+ old = xas_load(&xas);
+ if (old && !xa_is_value(old))
+ xas_set_err(&xas, -EEXIST);
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
+
+ if (xa_is_value(old)) {
+ mapping->nrexceptional--;
+ if (shadowp)
+ *shadowp = old;
+ }
+ mapping->nrpages++;
+
+ /* hugetlb pages do not participate in page cache accounting */
+ if (!huge)
+ __inc_node_page_state(page, NR_FILE_PAGES);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+
+ if (xas_error(&xas))
+ goto error;
- /* hugetlb pages do not participate in page cache accounting. */
- if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
- xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
-err_insert:
+error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return error;
+ return xas_error(&xas);
}
/**
@@ -1341,86 +1320,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
}
/**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
- *
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
+ * page_cache_next_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_miss covering both indices may
+ * return 10 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
*/
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- struct page *page;
+ XA_STATE(xas, &mapping->i_pages, index);
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_next(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index++;
- if (index == 0)
+ if (xas.xa_index == 0)
break;
}
- return index;
+ return xas.xa_index;
}
-EXPORT_SYMBOL(page_cache_next_hole);
+EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0), index] for
- * the first hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >=
- * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
- * will be returned.
- *
- * page_cache_prev_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 10, then subsequently a hole is created at
- * index 5, page_cache_prev_hole covering both indexes may return 5 if
- * called under rcu_read_lock.
+ * page_cache_prev_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
+ *
+ * Search the range [max(index - max_scan + 1, 0), index] for the
+ * gap with the highest index.
+ *
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 10, then subsequently a gap is
+ * created at index 5, page_cache_prev_miss() covering both indices may
+ * return 5 if called under the rcu_read_lock.
+ *
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'index - return >= max_scan' will be true).
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
*/
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- struct page *page;
+ XA_STATE(xas, &mapping->i_pages, index);
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_prev(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index--;
- if (index == ULONG_MAX)
+ if (xas.xa_index == ULONG_MAX)
break;
}
- return index;
+ return xas.xa_index;
}
-EXPORT_SYMBOL(page_cache_prev_hole);
+EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
@@ -1437,47 +1406,40 @@ EXPORT_SYMBOL(page_cache_prev_hole);
*/
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
- void **pagep;
+ XA_STATE(xas, &mapping->i_pages, offset);
struct page *head, *page;
rcu_read_lock();
repeat:
- page = NULL;
- pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
- if (pagep) {
- page = radix_tree_deref_slot(pagep);
- if (unlikely(!page))
- goto out;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto repeat;
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
- */
- goto out;
- }
+ xas_reset(&xas);
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page))
+ goto repeat;
+ /*
+ * A shadow entry of a recently evicted page, or a swap entry from
+ * shmem/tmpfs. Return it without attempting to raise page count.
+ */
+ if (!page || xa_is_value(page))
+ goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
+ goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(head);
+ goto repeat;
+ }
- /*
- * Has the page moved?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != *pagep)) {
- put_page(head);
- goto repeat;
- }
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(head);
+ goto repeat;
}
out:
rcu_read_unlock();
@@ -1508,7 +1470,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
repeat:
page = find_get_entry(mapping, offset);
- if (page && !radix_tree_exception(page)) {
+ if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page_mapping(page) != mapping)) {
@@ -1554,7 +1516,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
repeat:
page = find_get_entry(mapping, offset);
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
page = NULL;
if (!page)
goto no_page;
@@ -1640,53 +1602,48 @@ unsigned find_get_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices)
{
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
unsigned int ret = 0;
- struct radix_tree_iter iter;
if (!nr_entries)
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
goto export;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
export:
- indices[ret] = iter.index;
+ indices[ret] = xas.xa_index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -1717,64 +1674,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pgoff_t end, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, end) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Skip
- * over it.
- */
+ /* Skip over shadow, swap and DAX entries */
+ if (xa_is_value(page))
continue;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
pages[ret] = page;
if (++ret == nr_pages) {
- *start = pages[ret - 1]->index + 1;
+ *start = page->index + 1;
goto out;
}
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
}
/*
* We come here when there is no page beyond @end. We take care to not
* overflow the index @start as it confuses some of the callers. This
- * breaks the iteration when there is page at index -1 but that is
+ * breaks the iteration when there is a page at index -1 but that is
* already broken anyway.
*/
if (end == (pgoff_t)-1)
@@ -1802,57 +1745,43 @@ out:
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *page;
unsigned int ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- /* The hole, there no reason to continue */
- if (unlikely(!page))
- break;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Stop
- * looking for contiguous pages.
- */
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+ struct page *head;
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * If the entry has been swapped out, we can stop looking.
+ * No current caller is looking for DAX entries.
+ */
+ if (xa_is_value(page))
break;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
/*
* must check mapping and index after taking the ref.
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
+ if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
put_page(page);
break;
}
@@ -1860,6 +1789,11 @@ repeat:
pages[ret] = page;
if (++ret == nr_pages)
break;
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -1879,74 +1813,58 @@ EXPORT_SYMBOL(find_get_pages_contig);
* @tag. We update @index to index the next page for the traversal.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, int tag, unsigned int nr_pages,
+ pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *index);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each_marked(&xas, page, end, tag) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page.
- *
- * Those entries should never be tagged, but
- * this tree walk is lockless and the tags are
- * looked up in bulk, one radix tree node at a
- * time, so there is a sizable window for page
- * reclaim to evict a page we saw tagged.
- *
- * Skip over it.
- */
+ /*
+ * Shadow entries should never be tagged, but this iteration
+ * is lockless so there is a window for page reclaim to evict
+ * a page we saw tagged. Skip over it.
+ */
+ if (xa_is_value(page))
continue;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
pages[ret] = page;
if (++ret == nr_pages) {
- *index = pages[ret - 1]->index + 1;
+ *index = page->index + 1;
goto out;
}
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
}
/*
- * We come here when we got at @end. We take care to not overflow the
+ * We come here when we got to @end. We take care to not overflow the
* index @index as it confuses some of the callers. This breaks the
- * iteration when there is page at index -1 but that is already broken
- * anyway.
+ * iteration when there is a page at index -1 but that is already
+ * broken anyway.
*/
if (end == (pgoff_t)-1)
*index = (pgoff_t)-1;
@@ -1972,57 +1890,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag);
* @tag.
*/
unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- int tag, unsigned int nr_entries,
+ xa_mark_t tag, unsigned int nr_entries,
struct page **entries, pgoff_t *indices)
{
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
unsigned int ret = 0;
- struct radix_tree_iter iter;
if (!nr_entries)
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
+ struct page *head;
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
goto export;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto retry;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto put_page;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
export:
- indices[ret] = iter.index;
+ indices[ret] = xas.xa_index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -2626,45 +2538,31 @@ EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
- struct radix_tree_iter iter;
- void **slot;
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
+ XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
- if (iter.index > end_pgoff)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- goto next;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
+ xas_for_each(&xas, page, end_pgoff) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
goto next;
- }
head = compound_head(page);
if (!page_cache_get_speculative(head))
- goto repeat;
+ goto next;
/* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ if (compound_head(page) != head)
+ goto skip;
/* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto skip;
if (!PageUptodate(page) ||
PageReadahead(page) ||
@@ -2683,10 +2581,10 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
- vmf->pte += iter.index - last_pgoff;
- last_pgoff = iter.index;
+ vmf->pte += xas.xa_index - last_pgoff;
+ last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
@@ -2699,8 +2597,6 @@ next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
break;
- if (iter.index == end_pgoff)
- break;
}
rcu_read_unlock();
}
@@ -2810,7 +2706,7 @@ repeat:
put_page(page);
if (err == -EEXIST)
goto repeat;
- /* Presumably ENOMEM for radix tree node */
+ /* Presumably ENOMEM for xarray node */
return ERR_PTR(err);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25ef59b7ee34..4e4ef8fa479d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2450,13 +2450,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- /* Additional pin to radix tree of swap cache */
+ /* Additional pin to swap cache */
if (PageSwapCache(head))
page_ref_add(head, 2);
else
page_ref_inc(head);
} else {
- /* Additional pin to radix tree */
+ /* Additional pin to page cache */
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
@@ -2568,7 +2568,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
{
int extra_pins;
- /* Additional pins from radix tree */
+ /* Additional pins from page cache */
if (PageAnon(page))
extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
else
@@ -2664,17 +2664,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
if (mapping) {
- void **pslot;
+ XA_STATE(xas, &mapping->i_pages, page_index(head));
- xa_lock(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(head));
/*
- * Check if the head page is present in radix tree.
+ * Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- if (radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != head)
+ xa_lock(&mapping->i_pages);
+ if (xas_load(&xas) != head)
goto fail;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a31d740e6cd1..c13625c1ad5e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1288,17 +1288,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
*
* Basic scheme is simple, details are more complex:
* - allocate and freeze a new huge page;
- * - scan over radix tree replacing old pages the new one
+ * - scan page cache replacing old pages with the new one
* + swap in pages if necessary;
* + fill in gaps;
- * + keep old pages around in case if rollback is required;
- * - if replacing succeed:
+ * + keep old pages around in case rollback is required;
+ * - if replacing succeeds:
* + copy data over;
* + free old pages;
* + unfreeze huge page;
* - if replacing failed;
* + put all pages back and unfreeze them;
- * + restore gaps in the radix-tree;
+ * + restore gaps in the page cache;
* + free huge page;
*/
static void collapse_shmem(struct mm_struct *mm,
@@ -1306,12 +1306,11 @@ static void collapse_shmem(struct mm_struct *mm,
struct page **hpage, int node)
{
gfp_t gfp;
- struct page *page, *new_page, *tmp;
+ struct page *new_page;
struct mem_cgroup *memcg;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1336,48 +1335,49 @@ static void collapse_shmem(struct mm_struct *mm,
__SetPageLocked(new_page);
BUG_ON(!page_ref_freeze(new_page, 1));
-
/*
- * At this point the new_page is 'frozen' (page_count() is zero), locked
- * and not up-to-date. It's safe to insert it into radix tree, because
- * nobody would be able to map it or use it in other way until we
- * unfreeze it.
+ * At this point the new_page is 'frozen' (page_count() is zero),
+ * locked and not up-to-date. It's safe to insert it into the page
+ * cache, because nobody would be able to map it or use it in other
+ * way until we unfreeze it.
*/
- index = start;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- int n = min(iter.index, end) - index;
-
- /*
- * Handle holes in the radix tree: charge it from shmem and
- * insert relevant subpage of new_page into the radix-tree.
- */
- if (n && !shmem_charge(mapping->host, n)) {
- result = SCAN_FAIL;
+ /* This will be less messy when we use multi-index entries */
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (!xas_error(&xas))
break;
- }
- nr_none += n;
- for (; index < min(iter.index, end); index++) {
- radix_tree_insert(&mapping->i_pages, index,
- new_page + (index % HPAGE_PMD_NR));
- }
+ xas_unlock_irq(&xas);
+ if (!xas_nomem(&xas, GFP_KERNEL))
+ goto out;
+ } while (1);
- /* We are done. */
- if (index >= end)
- break;
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ struct page *page = xas_next(&xas);
+
+ VM_BUG_ON(index != xas.xa_index);
+ if (!page) {
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
+ break;
+ }
+ xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ nr_none++;
+ continue;
+ }
- page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
- xa_unlock_irq(&mapping->i_pages);
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
- goto tree_unlocked;
+ goto xa_unlocked;
}
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
} else if (trylock_page(page)) {
get_page(page);
} else {
@@ -1397,7 +1397,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_TRUNCATED;
goto out_unlock;
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
@@ -1407,17 +1407,16 @@ static void collapse_shmem(struct mm_struct *mm,
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
- slot = radix_tree_lookup_slot(&mapping->i_pages, index);
- VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock), page);
+ VM_BUG_ON_PAGE(page != xas_load(&xas), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* The page is expected to have page_count() == 3:
* - we hold a pin on it;
- * - one reference from radix tree;
+ * - one reference from page cache;
* - one from isolate_lru_page;
*/
if (!page_ref_freeze(page, 3)) {
@@ -1432,56 +1431,30 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(&mapping->i_pages, slot,
- new_page + (index % HPAGE_PMD_NR));
-
- slot = radix_tree_iter_resume(slot, &iter);
- index++;
+ xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
continue;
out_lru:
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
putback_lru_page(page);
out_isolate_failed:
unlock_page(page);
put_page(page);
- goto tree_unlocked;
+ goto xa_unlocked;
out_unlock:
unlock_page(page);
put_page(page);
break;
}
+ xas_unlock_irq(&xas);
- /*
- * Handle hole in radix tree at the end of the range.
- * This code only triggers if there's nothing in radix tree
- * beyond 'end'.
- */
- if (result == SCAN_SUCCEED && index < end) {
- int n = end - index;
-
- if (!shmem_charge(mapping->host, n)) {
- result = SCAN_FAIL;
- goto tree_locked;
- }
-
- for (; index < end; index++) {
- radix_tree_insert(&mapping->i_pages, index,
- new_page + (index % HPAGE_PMD_NR));
- }
- nr_none += n;
- }
-
-tree_locked:
- xa_unlock_irq(&mapping->i_pages);
-tree_unlocked:
-
+xa_unlocked:
if (result == SCAN_SUCCEED) {
- unsigned long flags;
+ struct page *page, *tmp;
struct zone *zone = page_zone(new_page);
/*
- * Replacing old pages with new one has succeed, now we need to
- * copy the content and free old pages.
+ * Replacing old pages with new one has succeeded, now we
+ * need to copy the content and free the old pages.
*/
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
@@ -1495,16 +1468,16 @@ tree_unlocked:
put_page(page);
}
- local_irq_save(flags);
+ local_irq_disable();
__inc_node_page_state(new_page, NR_SHMEM_THPS);
if (nr_none) {
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
}
- local_irq_restore(flags);
+ local_irq_enable();
/*
- * Remove pte page tables, so we can re-faulti
+ * Remove pte page tables, so we can re-fault
* the page as huge.
*/
retract_page_tables(mapping, start);
@@ -1521,37 +1494,37 @@ tree_unlocked:
khugepaged_pages_collapsed++;
} else {
- /* Something went wrong: rollback changes to the radix-tree */
+ struct page *page;
+ /* Something went wrong: roll back page cache changes */
shmem_uncharge(mapping->host, nr_none);
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= end)
- break;
+ xas_lock_irq(&xas);
+ xas_set(&xas, start);
+ xas_for_each(&xas, page, end - 1) {
page = list_first_entry_or_null(&pagelist,
struct page, lru);
- if (!page || iter.index < page->index) {
+ if (!page || xas.xa_index < page->index) {
if (!nr_none)
break;
nr_none--;
/* Put holes back where they were */
- radix_tree_delete(&mapping->i_pages, iter.index);
+ xas_store(&xas, NULL);
continue;
}
- VM_BUG_ON_PAGE(page->index != iter.index, page);
+ VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(&mapping->i_pages, slot, page);
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+ xas_store(&xas, page);
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
putback_lru_page(page);
unlock_page(page);
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
}
VM_BUG_ON(nr_none);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
/* Unfreeze new_page, caller would take care about freeing it */
page_ref_unfreeze(new_page, 1);
@@ -1569,8 +1542,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
@@ -1579,17 +1551,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= start + HPAGE_PMD_NR)
- break;
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+ if (xas_retry(&xas, page))
continue;
- }
- if (radix_tree_exception(page)) {
+ if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
break;
@@ -1628,7 +1594,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;
if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 71d21df2a3f3..6cb1ca93e290 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = find_get_entry(mapping, index);
- if (!radix_tree_exceptional_entry(page)) {
+ if (!xa_is_value(page)) {
if (page)
put_page(page);
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 10a9b554d69f..54920cbc46bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4728,7 +4728,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
/* shmem/tmpfs may report page out on swap: account for that too. */
if (shmem_mapping(mapping)) {
page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
diff --git a/mm/memfd.c b/mm/memfd.c
index 2bb5e257080e..97264c79d2cd 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -21,44 +21,36 @@
#include <uapi/linux/memfd.h>
/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
* or hugetlbfs because they are memory only filesystems.
*/
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
-static void memfd_tag_pins(struct address_space *mapping)
+static void memfd_tag_pins(struct xa_state *xas)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- pgoff_t start;
struct page *page;
+ unsigned int tagged = 0;
lru_add_drain();
- start = 0;
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_set(&mapping->i_pages, iter.index,
- MEMFD_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
- }
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ xas_lock_irq(xas);
+ xas_for_each(xas, page, ULONG_MAX) {
+ if (xa_is_value(page))
+ continue;
+ if (page_count(page) - page_mapcount(page) > 1)
+ xas_set_mark(xas, MEMFD_TAG_PINNED);
+
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(xas);
+ xas_unlock_irq(xas);
+ cond_resched();
+ xas_lock_irq(xas);
}
- rcu_read_unlock();
+ xas_unlock_irq(xas);
}
/*
@@ -72,17 +64,17 @@ static void memfd_tag_pins(struct address_space *mapping)
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- pgoff_t start;
+ XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
int error, scan;
- memfd_tag_pins(mapping);
+ memfd_tag_pins(&xas);
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+ unsigned int tagged = 0;
+
+ if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
if (!scan)
@@ -90,45 +82,34 @@ static int memfd_wait_for_pins(struct address_space *mapping)
else if (schedule_timeout_killable((HZ << scan) / 200))
scan = LAST_SCAN;
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
- start, MEMFD_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
+ xas_set(&xas, 0);
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ bool clear = true;
+ if (xa_is_value(page))
+ continue;
+ if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
* found pages pinned.
*/
- error = -EBUSY;
+ if (scan == LAST_SCAN)
+ error = -EBUSY;
+ else
+ clear = false;
}
+ if (clear)
+ xas_clear_mark(&xas, MEMFD_TAG_PINNED);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_clear(&mapping->i_pages,
- iter.index, MEMFD_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
- rcu_read_unlock();
+ xas_unlock_irq(&xas);
}
return error;
diff --git a/mm/migrate.c b/mm/migrate.c
index b6700f2962f3..f7e4bfdc13b7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -326,7 +326,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
page = migration_entry_to_page(entry);
/*
- * Once radix-tree replacement of page migration started, page_count
+ * Once page cache replacement of page migration started, page_count
* *must* be zero. And, we don't want to call wait_on_page_locked()
* against a page without get_page().
* So, we use get_page_unless_zero(), here. Even failed, page fault
@@ -441,10 +441,10 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct buffer_head *head, enum migrate_mode mode,
int extra_count)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
int expected_count = 1 + extra_count;
- void **pslot;
/*
* Device public or private pages have an extra refcount as they are
@@ -470,21 +470,16 @@ int migrate_page_move_mapping(struct address_space *mapping,
oldzone = page_zone(page);
newzone = page_zone(newpage);
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(page));
+ xas_lock_irq(&xas);
expected_count += hpage_nr_pages(page) + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -498,7 +493,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
page_ref_unfreeze(page, expected_count);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -526,16 +521,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
if (PageTransHuge(page)) {
int i;
- int index = page_index(page);
for (i = 1; i < HPAGE_PMD_NR; i++) {
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- index + i);
- radix_tree_replace_slot(&mapping->i_pages, pslot,
- newpage + i);
+ xas_next(&xas);
+ xas_store(&xas, newpage + i);
}
}
@@ -546,7 +538,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
/*
@@ -586,22 +578,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
int expected_count;
- void **pslot;
-
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
+ xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -610,11 +598,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
page_ref_unfreeze(page, expected_count - 1);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..4985965aa20a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* shmem/tmpfs may return swap: account for swapcache
* page too.
*/
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439a304a6c92..3f690bae6b78 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2097,34 +2097,25 @@ void __init page_writeback_init(void)
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
-/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
- * latency.
- */
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
-#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
- PAGECACHE_TAG_DIRTY) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(&mapping->i_pages, &iter,
- PAGECACHE_TAG_TOWRITE);
- tagged++;
- if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
continue;
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
cond_resched();
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
@@ -2170,7 +2161,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
@@ -2442,7 +2433,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
@@ -2468,7 +2459,7 @@ int __set_page_dirty_nobuffers(struct page *page)
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
+ __xa_set_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
@@ -2631,13 +2622,13 @@ EXPORT_SYMBOL(__cancel_dirty_page);
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * at which stage we bring the page's dirty flag and xarray dirty tag
* back into sync.
*
- * This incoherency between the page's dirty flag and radix-tree tag is
+ * This incoherency between the page's dirty flag and xarray tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
@@ -2718,7 +2709,7 @@ int test_clear_page_writeback(struct page *page)
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -2758,11 +2749,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
@@ -2770,8 +2763,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_WRITEBACK);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
@@ -2784,12 +2776,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_TOWRITE);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page);
}
@@ -2803,16 +2793,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
}
EXPORT_SYMBOL(__test_set_page_writeback);
-/*
- * Return true if any of the pages in the mapping are marked with the
- * passed tag.
- */
-int mapping_tagged(struct address_space *mapping, int tag)
-{
- return radix_tree_tagged(&mapping->i_pages, tag);
-}
-EXPORT_SYMBOL(mapping_tagged);
-
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
diff --git a/mm/readahead.c b/mm/readahead.c
index 4e630143a0ba..f3d6f9656a3c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -176,10 +176,8 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
if (page_offset > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, page_offset);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch of
* contiguous pages before continuing with the next
@@ -336,7 +334,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_hole(mapping, offset - 1, max);
+ head = page_cache_prev_miss(mapping, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -425,7 +423,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max_pages);
+ start = page_cache_next_miss(mapping, offset + 1, max_pages);
rcu_read_unlock();
if (!start || start - offset > max_pages)
diff --git a/mm/shmem.c b/mm/shmem.c
index 446942677cd4..56bf122e0bb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -322,24 +322,20 @@ void shmem_uncharge(struct inode *inode, long pages)
}
/*
- * Replace item expected in radix tree by a new item, while holding tree lock.
+ * Replace item expected in xarray by a new item, while holding xa_lock.
*/
-static int shmem_radix_tree_replace(struct address_space *mapping,
+static int shmem_replace_entry(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
- struct radix_tree_node *node;
- void __rcu **pslot;
+ XA_STATE(xas, &mapping->i_pages, index);
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
- if (!item)
- return -ENOENT;
+ item = xas_load(&xas);
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->i_pages, node, pslot,
- replacement, NULL);
+ xas_store(&xas, replacement);
return 0;
}
@@ -353,12 +349,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
static bool shmem_confirm_swap(struct address_space *mapping,
pgoff_t index, swp_entry_t swap)
{
- void *item;
-
- rcu_read_lock();
- item = radix_tree_lookup(&mapping->i_pages, index);
- rcu_read_unlock();
- return item == swp_to_radix_entry(swap);
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}
/*
@@ -586,9 +577,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected)
+ pgoff_t index, void *expected, gfp_t gfp)
{
- int error, nr = hpage_nr_pages(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+ unsigned long i = 0;
+ unsigned long nr = 1UL << compound_order(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -600,47 +593,39 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
- xa_lock_irq(&mapping->i_pages);
- if (PageTransHuge(page)) {
- void __rcu **results;
- pgoff_t idx;
- int i;
-
- error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->i_pages,
- &results, &idx, index, 1) &&
- idx < index + HPAGE_PMD_NR) {
- error = -EEXIST;
+ do {
+ void *entry;
+ xas_lock_irq(&xas);
+ entry = xas_find_conflict(&xas);
+ if (entry != expected)
+ xas_set_err(&xas, -EEXIST);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+next:
+ xas_store(&xas, page + i);
+ if (++i < nr) {
+ xas_next(&xas);
+ goto next;
}
-
- if (!error) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->i_pages,
- index + i, page + i);
- VM_BUG_ON(error);
- }
+ if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
+ __inc_node_page_state(page, NR_SHMEM_THPS);
}
- } else if (!expected) {
- error = radix_tree_insert(&mapping->i_pages, index, page);
- } else {
- error = shmem_radix_tree_replace(mapping, index, expected,
- page);
- }
-
- if (!error) {
mapping->nrpages += nr;
- if (PageTransHuge(page))
- __inc_node_page_state(page, NR_SHMEM_THPS);
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
- xa_unlock_irq(&mapping->i_pages);
- } else {
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (xas_error(&xas)) {
page->mapping = NULL;
- xa_unlock_irq(&mapping->i_pages);
page_ref_sub(page, nr);
+ return xas_error(&xas);
}
- return error;
+
+ return 0;
}
/*
@@ -654,7 +639,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
VM_BUG_ON_PAGE(PageCompound(page), page);
xa_lock_irq(&mapping->i_pages);
- error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
__dec_node_page_state(page, NR_FILE_PAGES);
@@ -665,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
}
/*
- * Remove swap entry from radix tree, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
@@ -673,7 +658,7 @@ static int shmem_free_swap(struct address_space *mapping,
void *old;
xa_lock_irq(&mapping->i_pages);
- old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
+ old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
xa_unlock_irq(&mapping->i_pages);
if (old != radswap)
return -ENOENT;
@@ -691,29 +676,19 @@ static int shmem_free_swap(struct address_space *mapping,
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= end)
- break;
-
- page = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, end - 1) {
+ if (xas_retry(&xas, page))
continue;
- }
-
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
swapped++;
if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
}
@@ -788,7 +763,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
/*
- * Remove range of pages and swap entries from radix tree, and free them.
+ * Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -824,7 +799,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
@@ -921,7 +896,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
if (shmem_free_swap(mapping, index, page)) {
@@ -1110,34 +1085,27 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+static unsigned long find_swap_entry(struct xarray *xa, void *item)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- unsigned long found = -1;
+ XA_STATE(xas, xa, 0);
unsigned int checked = 0;
+ void *entry;
rcu_read_lock();
- radix_tree_for_each_slot(slot, root, &iter, 0) {
- void *entry = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(entry)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (xas_retry(&xas, entry))
continue;
- }
- if (entry == item) {
- found = iter.index;
+ if (entry == item)
break;
- }
checked++;
- if ((checked % 4096) != 0)
+ if ((checked % XA_CHECK_SCHED) != 0)
continue;
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
-
rcu_read_unlock();
- return found;
+
+ return entry ? xas.xa_index : -1;
}
/*
@@ -1175,10 +1143,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
* We needed to drop mutex to make that restrictive page
* allocation, but the inode might have been freed while we
* dropped it: although a racing shmem_evict_inode() cannot
- * complete without emptying the radix_tree, our page lock
+ * complete without emptying the page cache, our page lock
* on this swapcache page is not enough to prevent that -
* free_swap_and_cache() of our swap entry will only
- * trylock_page(), removing swap from radix_tree whatever.
+ * trylock_page(), removing swap from page cache whatever.
*
* We must not proceed to shmem_add_to_page_cache() if the
* inode has been freed, but of course we cannot rely on
@@ -1200,7 +1168,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
*/
if (!error)
error = shmem_add_to_page_cache(*pagep, mapping, index,
- radswap);
+ radswap, gfp);
if (error != -ENOMEM) {
/*
* Truncation and eviction use free_swap_and_cache(), which
@@ -1244,7 +1212,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
&memcg, false);
if (error)
goto out;
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
+ /* No memory allocation: swap entry occupies the slot for the page */
error = -EAGAIN;
mutex_lock(&shmem_swaplist_mutex);
@@ -1453,23 +1421,17 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct inode *inode = &info->vfs_inode;
- struct address_space *mapping = inode->i_mapping;
- pgoff_t idx, hindex;
- void __rcu **results;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ pgoff_t hindex;
struct page *page;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return NULL;
hindex = round_down(index, HPAGE_PMD_NR);
- rcu_read_lock();
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
- rcu_read_unlock();
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
+ XA_PRESENT))
return NULL;
- }
- rcu_read_unlock();
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1578,8 +1540,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
- newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
__inc_node_page_state(newpage, NR_FILE_PAGES);
__dec_node_page_state(oldpage, NR_FILE_PAGES);
@@ -1643,7 +1604,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
repeat:
swap.val = 0;
page = find_lock_entry(mapping, index);
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
}
@@ -1718,7 +1679,7 @@ repeat:
false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap));
+ swp_to_radix_entry(swap), gfp);
/*
* We already confirmed swap under page lock, and make
* no memory allocation here, so usually no possibility
@@ -1824,13 +1785,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
PageTransHuge(page));
if (error)
goto unacct;
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
- compound_order(page));
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL);
- radix_tree_preload_end();
- }
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK);
if (error) {
mem_cgroup_cancel_charge(page, memcg,
PageTransHuge(page));
@@ -1931,7 +1887,7 @@ unlock:
spin_unlock_irq(&info->lock);
goto repeat;
}
- if (error == -EEXIST) /* from above or from radix_tree_insert */
+ if (error == -EEXIST)
goto repeat;
return error;
}
@@ -2299,11 +2255,8 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (ret)
goto out_release;
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
- if (!ret) {
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
- radix_tree_preload_end();
- }
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK);
if (ret)
goto out_release_uncharge;
@@ -2548,7 +2501,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
*/
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pgoff_t index, pgoff_t end, int whence)
@@ -2578,7 +2531,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
index = indices[i];
}
page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
+ if (page && !xa_is_value(page)) {
if (!PageUptodate(page))
page = NULL;
}
diff --git a/mm/swap.c b/mm/swap.c
index 87a54c8dee34..aa483719922e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -964,7 +964,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
+ if (!xa_is_value(page))
pvec->pages[j++] = page;
}
pvec->nr = j;
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(pagevec_lookup_range);
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag)
+ xa_mark_t tag)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
PAGEVEC_SIZE, pvec->pages);
@@ -1011,7 +1011,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag);
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag, unsigned max_pages)
+ xa_mark_t tag, unsigned max_pages)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0d6a7f268d2e..fd2f21e1c60a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -107,14 +107,15 @@ void show_swap_cache_info(void)
}
/*
- * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
{
- int error, i, nr = hpage_nr_pages(page);
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
+ unsigned long i, nr = 1UL << compound_order(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -123,73 +124,52 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
page_ref_add(page, nr);
SetPageSwapCache(page);
- address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- for (i = 0; i < nr; i++) {
- set_page_private(page + i, entry.val + i);
- error = radix_tree_insert(&address_space->i_pages,
- idx + i, page + i);
- if (unlikely(error))
- break;
- }
- if (likely(!error)) {
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < nr; i++) {
+ VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ set_page_private(page + i, entry.val + i);
+ xas_store(&xas, page + i);
+ xas_next(&xas);
+ }
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
- } else {
- /*
- * Only the context which have set SWAP_HAS_CACHE flag
- * would call add_to_swap_cache().
- * So add_to_swap_cache() doesn't returns -EEXIST.
- */
- VM_BUG_ON(error == -EEXIST);
- set_page_private(page + i, 0UL);
- while (i--) {
- radix_tree_delete(&address_space->i_pages, idx + i);
- set_page_private(page + i, 0UL);
- }
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
- }
- xa_unlock_irq(&address_space->i_pages);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
- return error;
-}
-
-
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
-{
- int error;
+ if (!xas_error(&xas))
+ return 0;
- error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
- if (!error) {
- error = __add_to_swap_cache(page, entry);
- radix_tree_preload_end();
- }
- return error;
+ ClearPageSwapCache(page);
+ page_ref_sub(page, nr);
+ return xas_error(&xas);
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
{
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
- swp_entry_t entry;
- pgoff_t idx;
+ pgoff_t idx = swp_offset(entry);
+ XA_STATE(xas, &address_space->i_pages, idx);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(PageWriteback(page), page);
- entry.val = page_private(page);
- address_space = swap_address_space(entry);
- idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
- radix_tree_delete(&address_space->i_pages, idx + i);
+ void *entry = xas_store(&xas, NULL);
+ VM_BUG_ON_PAGE(entry != page + i, entry);
set_page_private(page + i, 0);
+ xas_next(&xas);
}
ClearPageSwapCache(page);
address_space->nrpages -= nr;
@@ -217,7 +197,7 @@ int add_to_swap(struct page *page)
return 0;
/*
- * Radix-tree node allocations from PF_MEMALLOC contexts could
+ * XArray node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
* stops emergency reserves from being allocated.
*
@@ -229,7 +209,6 @@ int add_to_swap(struct page *page)
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- /* -ENOMEM radix-tree allocation failure */
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -263,14 +242,11 @@ fail:
*/
void delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
- struct address_space *address_space;
+ swp_entry_t entry = { .val = page_private(page) };
+ struct address_space *address_space = swap_address_space(entry);
- entry.val = page_private(page);
-
- address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, entry);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
@@ -414,18 +390,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
- if (err)
- break;
-
- /*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
if (err == -EEXIST) {
- radix_tree_preload_end();
/*
* We might race against get_swap_page() and stumble
* across a SWAP_HAS_CACHE swap_map entry whose page
@@ -433,27 +401,20 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
cond_resched();
continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
+ } else if (err) /* swp entry is obsolete ? */
break;
- }
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
- radix_tree_preload_end();
- /*
- * Initiate read into locked page and return.
- */
+ /* Initiate read into locked page */
SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
}
- radix_tree_preload_end();
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -626,7 +587,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
return -ENOMEM;
for (i = 0; i < nr; i++) {
space = spaces + i;
- INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
+ xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
atomic_set(&space->i_mmap_writable, 0);
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
diff --git a/mm/truncate.c b/mm/truncate.c
index 1d2fb2dca96f..45d68e90b703 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -33,15 +33,12 @@
static inline void __clear_shadow_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- struct radix_tree_node *node;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, index);
- if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot))
+ xas_set_update(&xas, workingset_update_node);
+ if (xas_load(&xas) != entry)
return;
- if (*slot != entry)
- return;
- __radix_tree_replace(&mapping->i_pages, node, slot, NULL,
- workingset_update_node);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
}
@@ -70,7 +67,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
return;
for (j = 0; j < pagevec_count(pvec); j++)
- if (radix_tree_exceptional_entry(pvec->pages[j]))
+ if (xa_is_value(pvec->pages[j]))
break;
if (j == pagevec_count(pvec))
@@ -85,7 +82,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
struct page *page = pvec->pages[i];
pgoff_t index = indices[i];
- if (!radix_tree_exceptional_entry(page)) {
+ if (!xa_is_value(page)) {
pvec->pages[j++] = page;
continue;
}
@@ -347,7 +344,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
if (!trylock_page(page))
@@ -442,7 +439,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
}
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
lock_page(page);
@@ -561,7 +558,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
invalidate_exceptional_entry(mapping, index,
page);
continue;
@@ -692,7 +689,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (!invalidate_exceptional_entry2(mapping,
index, page))
ret = -EBUSY;
@@ -738,10 +735,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
index++;
}
/*
- * For DAX we invalidate page tables after invalidating radix tree. We
+ * For DAX we invalidate page tables after invalidating page cache. We
* could invalidate page tables while invalidating each entry however
* that would be expensive. And doing range unmapping before doesn't
- * work as we have no cheap way to find whether radix tree entry didn't
+ * work as we have no cheap way to find whether page cache entry didn't
* get remapped later.
*/
if (dax_mapping(mapping)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 28c9ae5633b9..62ac0c488624 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -751,12 +751,12 @@ static inline int is_page_cache_freeable(struct page *page)
{
/*
* A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache radix tree and
- * optional buffer heads at page->private.
+ * that isolated the page, the page cache and optional buffer
+ * heads at page->private.
*/
- int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
HPAGE_PMD_NR : 1;
- return page_count(page) - page_has_private(page) == 1 + radix_pins;
+ return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -932,7 +932,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
diff --git a/mm/workingset.c b/mm/workingset.c
index cbc13d4dfa79..d46f8c92aa2f 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -160,20 +160,20 @@
* and activations is maintained (node->inactive_age).
*
* On eviction, a snapshot of this counter (along with some bits to
- * identify the node) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache
* slot of the evicted page. This is called a shadow entry.
*
* On cache misses for which there are shadow entries, an eligible
* refault distance will immediately activate the refaulting page.
*/
-#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
* Eviction timestamps need to be able to cover the full range of
- * actionable refaults. However, bits are tight in the radix tree
+ * actionable refaults. However, bits are tight in the xarray
* entry, and after storing the identifier for the lruvec there might
* not be enough left to represent every single actionable refault. In
* that case, we have to sacrifice granularity for distance, and group
@@ -185,22 +185,21 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << 1) | workingset;
- eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
- return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ return xa_mk_value(eviction);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
unsigned long *evictionp, bool *workingsetp)
{
- unsigned long entry = (unsigned long)shadow;
+ unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
bool workingset;
- entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
workingset = entry & 1;
entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
@@ -367,7 +366,7 @@ out:
static struct list_lru shadow_nodes;
-void workingset_update_node(struct radix_tree_node *node)
+void workingset_update_node(struct xa_node *node)
{
/*
* Track non-empty nodes that contain only shadow entries;
@@ -379,7 +378,7 @@ void workingset_update_node(struct radix_tree_node *node)
*/
VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
- if (node->count && node->count == node->exceptional) {
+ if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
__inc_lruvec_page_state(virt_to_page(node),
@@ -404,7 +403,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
nodes = list_lru_shrink_count(&shadow_nodes, sc);
/*
- * Approximate a reasonable limit for the radix tree nodes
+ * Approximate a reasonable limit for the nodes
* containing shadow entries. We don't need to keep more
* shadow entries than possible pages on the active list,
* since refault distances bigger than that are dismissed.
@@ -419,11 +418,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
* worst-case density of 1/8th. Below that, not all eligible
* refaults can be detected anymore.
*
- * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * On 64-bit with 7 xa_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
* ~1.8% of available memory:
*
- * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
+ * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
*/
#ifdef CONFIG_MEMCG
if (sc->memcg) {
@@ -438,7 +437,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#endif
pages = node_present_pages(sc->nid);
- max_nodes = pages >> (RADIX_TREE_MAP_SHIFT - 3);
+ max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
if (!nodes)
return SHRINK_EMPTY;
@@ -451,11 +450,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
- void *arg)
+ void *arg) __must_hold(lru_lock)
{
+ struct xa_node *node = container_of(item, struct xa_node, private_list);
+ XA_STATE(xas, node->array, 0);
struct address_space *mapping;
- struct radix_tree_node *node;
- unsigned int i;
int ret;
/*
@@ -463,15 +462,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
- * address_space that has radix tree nodes on the LRU.
+ * address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
- node = container_of(item, struct radix_tree_node, private_list);
- mapping = container_of(node->root, struct address_space, i_pages);
+ mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
@@ -490,29 +488,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- if (WARN_ON_ONCE(!node->exceptional))
+ if (WARN_ON_ONCE(!node->nr_values))
goto out_invalid;
- if (WARN_ON_ONCE(node->count != node->exceptional))
- goto out_invalid;
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[i]) {
- if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
- goto out_invalid;
- if (WARN_ON_ONCE(!node->exceptional))
- goto out_invalid;
- if (WARN_ON_ONCE(!mapping->nrexceptional))
- goto out_invalid;
- node->slots[i] = NULL;
- node->exceptional--;
- node->count--;
- mapping->nrexceptional--;
- }
- }
- if (WARN_ON_ONCE(node->exceptional))
+ if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
+ mapping->nrexceptional -= node->nr_values;
+ xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
+ xas.xa_offset = node->offset;
+ xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
+ xas_set_update(&xas, workingset_update_node);
+ /*
+ * We could store a shadow entry here which was the minimum of the
+ * shadow entries we were tracking ...
+ */
+ xas_store(&xas, NULL);
__inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->i_pages, node,
- workingset_lookup_update(mapping));
out_invalid:
xa_unlock_irq(&mapping->i_pages);
diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h
index 9bce3b56b5e7..5d2ab38965cc 100644
--- a/tools/include/asm-generic/bitops.h
+++ b/tools/include/asm-generic/bitops.h
@@ -27,5 +27,6 @@
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/atomic.h>
+#include <asm-generic/bitops/non-atomic.h>
#endif /* __TOOLS_ASM_GENERIC_BITOPS_H */
diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h
index 21c41ccd1266..2f6ea28764a7 100644
--- a/tools/include/asm-generic/bitops/atomic.h
+++ b/tools/include/asm-generic/bitops/atomic.h
@@ -15,13 +15,4 @@ static inline void clear_bit(int nr, unsigned long *addr)
addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG));
}
-static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
-{
- return ((1UL << (nr % __BITS_PER_LONG)) &
- (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0;
-}
-
-#define __set_bit(nr, addr) set_bit(nr, addr)
-#define __clear_bit(nr, addr) clear_bit(nr, addr)
-
#endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */
diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h
new file mode 100644
index 000000000000..7e10c4b50c5d
--- /dev/null
+++ b/tools/include/asm-generic/bitops/non-atomic.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
+
+#include <asm/types.h>
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ *p |= mask;
+}
+
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ *p &= ~mask;
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static inline void __change_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+ *p ^= mask;
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old | mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old & ~mask;
+ return (old & mask) != 0;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static inline int __test_and_change_bit(int nr,
+ volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+ unsigned long old = *p;
+
+ *p = old ^ mask;
+ return (old & mask) != 0;
+}
+
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index e63662db131b..05dca5c203f3 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -15,6 +15,7 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits);
+void bitmap_clear(unsigned long *map, unsigned int start, int len);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 0ad884452c5c..6935ef94e77a 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -70,6 +70,7 @@
#define BUG_ON(cond) assert(!(cond))
#endif
#endif
+#define BUG() BUG_ON(1)
#if __BYTE_ORDER == __BIG_ENDIAN
#define cpu_to_le16 bswap_16
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 1738c0391da4..c934572d935c 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -8,8 +8,14 @@
#define spinlock_t pthread_mutex_t
#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER
#define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER
-#define spin_lock_init(x) pthread_mutex_init(x, NULL)
-
+#define spin_lock_init(x) pthread_mutex_init(x, NULL)
+
+#define spin_lock(x) pthread_mutex_lock(x)
+#define spin_unlock(x) pthread_mutex_unlock(x)
+#define spin_lock_bh(x) pthread_mutex_lock(x)
+#define spin_unlock_bh(x) pthread_mutex_unlock(x)
+#define spin_lock_irq(x) pthread_mutex_lock(x)
+#define spin_unlock_irq(x) pthread_mutex_unlock(x)
#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x)
#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x)
@@ -31,4 +37,6 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
return true;
}
+#include <linux/lockdep.h>
+
#endif
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
index d4706c0ffceb..3834899b6693 100644
--- a/tools/testing/radix-tree/.gitignore
+++ b/tools/testing/radix-tree/.gitignore
@@ -4,3 +4,4 @@ idr-test
main
multiorder
radix-tree.c
+xarray
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 37baecc3766f..acf1afa01c5b 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -4,8 +4,8 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \
-fsanitize=undefined
LDFLAGS += -fsanitize=address -fsanitize=undefined
LDLIBS+= -lpthread -lurcu
-TARGETS = main idr-test multiorder
-CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o
+TARGETS = main idr-test multiorder xarray
+CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o
OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
@@ -25,6 +25,8 @@ main: $(OFILES)
idr-test.o: ../../../lib/test_ida.c
idr-test: idr-test.o $(CORE_OFILES)
+xarray: $(CORE_OFILES)
+
multiorder: multiorder.o $(CORE_OFILES)
clean:
@@ -35,6 +37,7 @@ vpath %.c ../../lib
$(OFILES): Makefile *.h */*.h generated/map-shift.h \
../../include/linux/*.h \
../../include/asm/*.h \
+ ../../../include/linux/xarray.h \
../../../include/linux/radix-tree.h \
../../../include/linux/idr.h
@@ -44,8 +47,10 @@ radix-tree.c: ../../../lib/radix-tree.c
idr.c: ../../../lib/idr.c
sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
+xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c
+
generated/map-shift.h:
@if ! grep -qws $(SHIFT) generated/map-shift.h; then \
- echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" > \
+ echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \
generated/map-shift.h; \
fi
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c
index 99c40f3ed133..7e195ed8e92d 100644
--- a/tools/testing/radix-tree/benchmark.c
+++ b/tools/testing/radix-tree/benchmark.c
@@ -17,9 +17,6 @@
#include <time.h>
#include "test.h"
-#define for_each_index(i, base, order) \
- for (i = base; i < base + (1 << order); i++)
-
#define NSEC_PER_SEC 1000000000L
static long long benchmark_iter(struct radix_tree_root *root, bool tagged)
@@ -61,7 +58,7 @@ again:
}
static void benchmark_insert(struct radix_tree_root *root,
- unsigned long size, unsigned long step, int order)
+ unsigned long size, unsigned long step)
{
struct timespec start, finish;
unsigned long index;
@@ -70,19 +67,19 @@ static void benchmark_insert(struct radix_tree_root *root,
clock_gettime(CLOCK_MONOTONIC, &start);
for (index = 0 ; index < size ; index += step)
- item_insert_order(root, index, order);
+ item_insert(root, index);
clock_gettime(CLOCK_MONOTONIC, &finish);
nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
(finish.tv_nsec - start.tv_nsec);
- printv(2, "Size: %8ld, step: %8ld, order: %d, insertion: %15lld ns\n",
- size, step, order, nsec);
+ printv(2, "Size: %8ld, step: %8ld, insertion: %15lld ns\n",
+ size, step, nsec);
}
static void benchmark_tagging(struct radix_tree_root *root,
- unsigned long size, unsigned long step, int order)
+ unsigned long size, unsigned long step)
{
struct timespec start, finish;
unsigned long index;
@@ -98,138 +95,53 @@ static void benchmark_tagging(struct radix_tree_root *root,
nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
(finish.tv_nsec - start.tv_nsec);
- printv(2, "Size: %8ld, step: %8ld, order: %d, tagging: %17lld ns\n",
- size, step, order, nsec);
+ printv(2, "Size: %8ld, step: %8ld, tagging: %17lld ns\n",
+ size, step, nsec);
}
static void benchmark_delete(struct radix_tree_root *root,
- unsigned long size, unsigned long step, int order)
+ unsigned long size, unsigned long step)
{
struct timespec start, finish;
- unsigned long index, i;
+ unsigned long index;
long long nsec;
clock_gettime(CLOCK_MONOTONIC, &start);
for (index = 0 ; index < size ; index += step)
- for_each_index(i, index, order)
- item_delete(root, i);
+ item_delete(root, index);
clock_gettime(CLOCK_MONOTONIC, &finish);
nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
(finish.tv_nsec - start.tv_nsec);
- printv(2, "Size: %8ld, step: %8ld, order: %d, deletion: %16lld ns\n",
- size, step, order, nsec);
+ printv(2, "Size: %8ld, step: %8ld, deletion: %16lld ns\n",
+ size, step, nsec);
}
-static void benchmark_size(unsigned long size, unsigned long step, int order)
+static void benchmark_size(unsigned long size, unsigned long step)
{
RADIX_TREE(tree, GFP_KERNEL);
long long normal, tagged;
- benchmark_insert(&tree, size, step, order);
- benchmark_tagging(&tree, size, step, order);
+ benchmark_insert(&tree, size, step);
+ benchmark_tagging(&tree, size, step);
tagged = benchmark_iter(&tree, true);
normal = benchmark_iter(&tree, false);
- printv(2, "Size: %8ld, step: %8ld, order: %d, tagged iteration: %8lld ns\n",
- size, step, order, tagged);
- printv(2, "Size: %8ld, step: %8ld, order: %d, normal iteration: %8lld ns\n",
- size, step, order, normal);
+ printv(2, "Size: %8ld, step: %8ld, tagged iteration: %8lld ns\n",
+ size, step, tagged);
+ printv(2, "Size: %8ld, step: %8ld, normal iteration: %8lld ns\n",
+ size, step, normal);
- benchmark_delete(&tree, size, step, order);
+ benchmark_delete(&tree, size, step);
item_kill_tree(&tree);
rcu_barrier();
}
-static long long __benchmark_split(unsigned long index,
- int old_order, int new_order)
-{
- struct timespec start, finish;
- long long nsec;
- RADIX_TREE(tree, GFP_ATOMIC);
-
- item_insert_order(&tree, index, old_order);
-
- clock_gettime(CLOCK_MONOTONIC, &start);
- radix_tree_split(&tree, index, new_order);
- clock_gettime(CLOCK_MONOTONIC, &finish);
- nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
- (finish.tv_nsec - start.tv_nsec);
-
- item_kill_tree(&tree);
-
- return nsec;
-
-}
-
-static void benchmark_split(unsigned long size, unsigned long step)
-{
- int i, j, idx;
- long long nsec = 0;
-
-
- for (idx = 0; idx < size; idx += step) {
- for (i = 3; i < 11; i++) {
- for (j = 0; j < i; j++) {
- nsec += __benchmark_split(idx, i, j);
- }
- }
- }
-
- printv(2, "Size %8ld, step %8ld, split time %10lld ns\n",
- size, step, nsec);
-
-}
-
-static long long __benchmark_join(unsigned long index,
- unsigned order1, unsigned order2)
-{
- unsigned long loc;
- struct timespec start, finish;
- long long nsec;
- void *item, *item2 = item_create(index + 1, order1);
- RADIX_TREE(tree, GFP_KERNEL);
-
- item_insert_order(&tree, index, order2);
- item = radix_tree_lookup(&tree, index);
-
- clock_gettime(CLOCK_MONOTONIC, &start);
- radix_tree_join(&tree, index + 1, order1, item2);
- clock_gettime(CLOCK_MONOTONIC, &finish);
- nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
- (finish.tv_nsec - start.tv_nsec);
-
- loc = find_item(&tree, item);
- if (loc == -1)
- free(item);
-
- item_kill_tree(&tree);
-
- return nsec;
-}
-
-static void benchmark_join(unsigned long step)
-{
- int i, j, idx;
- long long nsec = 0;
-
- for (idx = 0; idx < 1 << 10; idx += step) {
- for (i = 1; i < 15; i++) {
- for (j = 0; j < i; j++) {
- nsec += __benchmark_join(idx, i, j);
- }
- }
- }
-
- printv(2, "Size %8d, step %8ld, join time %10lld ns\n",
- 1 << 10, step, nsec);
-}
-
void benchmark(void)
{
unsigned long size[] = {1 << 10, 1 << 20, 0};
@@ -242,16 +154,5 @@ void benchmark(void)
for (c = 0; size[c]; c++)
for (s = 0; step[s]; s++)
- benchmark_size(size[c], step[s], 0);
-
- for (c = 0; size[c]; c++)
- for (s = 0; step[s]; s++)
- benchmark_size(size[c], step[s] << 9, 9);
-
- for (c = 0; size[c]; c++)
- for (s = 0; step[s]; s++)
- benchmark_split(size[c], step[s]);
-
- for (s = 0; step[s]; s++)
- benchmark_join(step[s]);
+ benchmark_size(size[c], step[s]);
}
diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c
new file mode 100644
index 000000000000..66ec4a24a203
--- /dev/null
+++ b/tools/testing/radix-tree/bitmap.c
@@ -0,0 +1,23 @@
+/* lib/bitmap.c pulls in at least two other files. */
+
+#include <linux/bitmap.h>
+
+void bitmap_clear(unsigned long *map, unsigned int start, int len)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const unsigned int size = start + len;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ while (len - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+ if (len) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ *p &= ~mask_to_clear;
+ }
+}
diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h
index cf88dc5b8832..2218b3cc184e 100644
--- a/tools/testing/radix-tree/generated/autoconf.h
+++ b/tools/testing/radix-tree/generated/autoconf.h
@@ -1 +1 @@
-#define CONFIG_RADIX_TREE_MULTIORDER 1
+#define CONFIG_XARRAY_MULTI 1
diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c
index 321ba92c70d2..1b63bdb7688f 100644
--- a/tools/testing/radix-tree/idr-test.c
+++ b/tools/testing/radix-tree/idr-test.c
@@ -19,7 +19,7 @@
#include "test.h"
-#define DUMMY_PTR ((void *)0x12)
+#define DUMMY_PTR ((void *)0x10)
int item_idr_free(int id, void *p, void *data)
{
@@ -227,6 +227,66 @@ void idr_u32_test(int base)
idr_u32_test1(&idr, 0xffffffff);
}
+static void idr_align_test(struct idr *idr)
+{
+ char name[] = "Motorola 68000";
+ int i, id;
+ void *entry;
+
+ for (i = 0; i < 9; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i);
+ idr_for_each_entry(idr, entry, id);
+ }
+ idr_destroy(idr);
+
+ for (i = 1; i < 10; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 1);
+ idr_for_each_entry(idr, entry, id);
+ }
+ idr_destroy(idr);
+
+ for (i = 2; i < 11; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 2);
+ idr_for_each_entry(idr, entry, id);
+ }
+ idr_destroy(idr);
+
+ for (i = 3; i < 12; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 3);
+ idr_for_each_entry(idr, entry, id);
+ }
+ idr_destroy(idr);
+
+ for (i = 0; i < 8; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0);
+ BUG_ON(idr_alloc(idr, &name[i + 1], 0, 0, GFP_KERNEL) != 1);
+ idr_for_each_entry(idr, entry, id);
+ idr_remove(idr, 1);
+ idr_for_each_entry(idr, entry, id);
+ idr_remove(idr, 0);
+ BUG_ON(!idr_is_empty(idr));
+ }
+
+ for (i = 0; i < 8; i++) {
+ BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 0);
+ idr_for_each_entry(idr, entry, id);
+ idr_replace(idr, &name[i], 0);
+ idr_for_each_entry(idr, entry, id);
+ BUG_ON(idr_find(idr, 0) != &name[i]);
+ idr_remove(idr, 0);
+ }
+
+ for (i = 0; i < 8; i++) {
+ BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0);
+ BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 1);
+ idr_remove(idr, 1);
+ idr_for_each_entry(idr, entry, id);
+ idr_replace(idr, &name[i + 1], 0);
+ idr_for_each_entry(idr, entry, id);
+ idr_remove(idr, 0);
+ }
+}
+
void idr_checks(void)
{
unsigned long i;
@@ -307,6 +367,7 @@ void idr_checks(void)
idr_u32_test(4);
idr_u32_test(1);
idr_u32_test(0);
+ idr_align_test(&idr);
}
#define module_init(x)
@@ -344,16 +405,16 @@ void ida_check_conv_user(void)
DEFINE_IDA(ida);
unsigned long i;
- radix_tree_cpu_dead(1);
for (i = 0; i < 1000000; i++) {
int id = ida_alloc(&ida, GFP_NOWAIT);
if (id == -ENOMEM) {
- IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) !=
- BITS_PER_LONG - 2);
+ IDA_BUG_ON(&ida, ((i % IDA_BITMAP_BITS) !=
+ BITS_PER_XA_VALUE) &&
+ ((i % IDA_BITMAP_BITS) != 0));
id = ida_alloc(&ida, GFP_KERNEL);
} else {
IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) ==
- BITS_PER_LONG - 2);
+ BITS_PER_XA_VALUE);
}
IDA_BUG_ON(&ida, id != i);
}
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
index a92bab513701..238db187aa15 100644
--- a/tools/testing/radix-tree/iteration_check.c
+++ b/tools/testing/radix-tree/iteration_check.c
@@ -1,5 +1,5 @@
/*
- * iteration_check.c: test races having to do with radix tree iteration
+ * iteration_check.c: test races having to do with xarray iteration
* Copyright (c) 2016 Intel Corporation
* Author: Ross Zwisler <ross.zwisler@linux.intel.com>
*
@@ -12,41 +12,54 @@
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
-#include <linux/radix-tree.h>
#include <pthread.h>
#include "test.h"
#define NUM_THREADS 5
#define MAX_IDX 100
-#define TAG 0
-#define NEW_TAG 1
+#define TAG XA_MARK_0
+#define NEW_TAG XA_MARK_1
-static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t threads[NUM_THREADS];
static unsigned int seeds[3];
-static RADIX_TREE(tree, GFP_KERNEL);
+static DEFINE_XARRAY(array);
static bool test_complete;
static int max_order;
-/* relentlessly fill the tree with tagged entries */
+void my_item_insert(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ struct item *item = item_create(index, 0);
+ int order;
+
+retry:
+ xas_lock(&xas);
+ for (order = max_order; order >= 0; order--) {
+ xas_set_order(&xas, index, order);
+ item->order = order;
+ if (xas_find_conflict(&xas))
+ continue;
+ xas_store(&xas, item);
+ xas_set_mark(&xas, TAG);
+ break;
+ }
+ xas_unlock(&xas);
+ if (xas_nomem(&xas, GFP_KERNEL))
+ goto retry;
+ if (order < 0)
+ free(item);
+}
+
+/* relentlessly fill the array with tagged entries */
static void *add_entries_fn(void *arg)
{
rcu_register_thread();
while (!test_complete) {
unsigned long pgoff;
- int order;
for (pgoff = 0; pgoff < MAX_IDX; pgoff++) {
- pthread_mutex_lock(&tree_lock);
- for (order = max_order; order >= 0; order--) {
- if (item_insert_order(&tree, pgoff, order)
- == 0) {
- item_tag_set(&tree, pgoff, TAG);
- break;
- }
- }
- pthread_mutex_unlock(&tree_lock);
+ my_item_insert(&array, pgoff);
}
}
@@ -56,33 +69,25 @@ static void *add_entries_fn(void *arg)
}
/*
- * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find
- * things that have been removed and randomly resetting our iteration to the
- * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
- * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
- * NULL 'slot' variable.
+ * Iterate over tagged entries, retrying when we find ourselves in a deleted
+ * node and randomly pausing the iteration.
*/
static void *tagged_iteration_fn(void *arg)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &array, 0);
+ void *entry;
rcu_register_thread();
while (!test_complete) {
+ xas_set(&xas, 0);
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) {
- void *entry = radix_tree_deref_slot(slot);
- if (unlikely(!entry))
+ xas_for_each_marked(&xas, entry, ULONG_MAX, TAG) {
+ if (xas_retry(&xas, entry))
continue;
- if (radix_tree_deref_retry(entry)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
if (rand_r(&seeds[0]) % 50 == 0) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
rcu_read_unlock();
rcu_barrier();
rcu_read_lock();
@@ -97,33 +102,25 @@ static void *tagged_iteration_fn(void *arg)
}
/*
- * Iterate over the entries, doing a radix_tree_iter_retry() as we find things
- * that have been removed and randomly resetting our iteration to the next
- * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
- * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
- * NULL 'slot' variable.
+ * Iterate over the entries, retrying when we find ourselves in a deleted
+ * node and randomly pausing the iteration.
*/
static void *untagged_iteration_fn(void *arg)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &array, 0);
+ void *entry;
rcu_register_thread();
while (!test_complete) {
+ xas_set(&xas, 0);
rcu_read_lock();
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- void *entry = radix_tree_deref_slot(slot);
- if (unlikely(!entry))
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (xas_retry(&xas, entry))
continue;
- if (radix_tree_deref_retry(entry)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
if (rand_r(&seeds[1]) % 50 == 0) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
rcu_read_unlock();
rcu_barrier();
rcu_read_lock();
@@ -138,7 +135,7 @@ static void *untagged_iteration_fn(void *arg)
}
/*
- * Randomly remove entries to help induce radix_tree_iter_retry() calls in the
+ * Randomly remove entries to help induce retries in the
* two iteration functions.
*/
static void *remove_entries_fn(void *arg)
@@ -147,12 +144,13 @@ static void *remove_entries_fn(void *arg)
while (!test_complete) {
int pgoff;
+ struct item *item;
pgoff = rand_r(&seeds[2]) % MAX_IDX;
- pthread_mutex_lock(&tree_lock);
- item_delete(&tree, pgoff);
- pthread_mutex_unlock(&tree_lock);
+ item = xa_erase(&array, pgoff);
+ if (item)
+ item_free(item, pgoff);
}
rcu_unregister_thread();
@@ -165,8 +163,7 @@ static void *tag_entries_fn(void *arg)
rcu_register_thread();
while (!test_complete) {
- tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG,
- NEW_TAG);
+ tag_tagged_items(&array, 0, MAX_IDX, 10, TAG, NEW_TAG);
}
rcu_unregister_thread();
return NULL;
@@ -217,5 +214,5 @@ void iteration_test(unsigned order, unsigned test_duration)
}
}
- item_kill_tree(&tree);
+ item_kill_tree(&array);
}
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
index 23b8ed52f8c8..03dc8a57eb99 100644
--- a/tools/testing/radix-tree/linux/bug.h
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -1 +1,2 @@
+#include <stdio.h>
#include "asm/bug.h"
diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/radix-tree/linux/kconfig.h
new file mode 100644
index 000000000000..6c8675859913
--- /dev/null
+++ b/tools/testing/radix-tree/linux/kconfig.h
@@ -0,0 +1 @@
+#include "../../../../include/linux/kconfig.h"
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 426f32f28547..4568248222ae 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -14,7 +14,12 @@
#include "../../../include/linux/kconfig.h"
#define printk printf
+#define pr_info printk
#define pr_debug printk
#define pr_cont printk
+#define __acquires(x)
+#define __releases(x)
+#define __must_hold(x)
+
#endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h
new file mode 100644
index 000000000000..565fccdfe6e9
--- /dev/null
+++ b/tools/testing/radix-tree/linux/lockdep.h
@@ -0,0 +1,11 @@
+#ifndef _LINUX_LOCKDEP_H
+#define _LINUX_LOCKDEP_H
+struct lock_class_key {
+ unsigned int a;
+};
+
+static inline void lockdep_set_class(spinlock_t *lock,
+ struct lock_class_key *key)
+{
+}
+#endif /* _LINUX_LOCKDEP_H */
diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h
index 24f13d27a8da..d1635a5bef02 100644
--- a/tools/testing/radix-tree/linux/radix-tree.h
+++ b/tools/testing/radix-tree/linux/radix-tree.h
@@ -2,7 +2,6 @@
#ifndef _TEST_RADIX_TREE_H
#define _TEST_RADIX_TREE_H
-#include "generated/map-shift.h"
#include "../../../../include/linux/radix-tree.h"
extern int kmalloc_verbose;
diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h
index 73ed33658203..fd280b070fdb 100644
--- a/tools/testing/radix-tree/linux/rcupdate.h
+++ b/tools/testing/radix-tree/linux/rcupdate.h
@@ -6,5 +6,7 @@
#define rcu_dereference_raw(p) rcu_dereference(p)
#define rcu_dereference_protected(p, cond) rcu_dereference(p)
+#define rcu_dereference_check(p, cond) rcu_dereference(p)
+#define RCU_INIT_POINTER(p, v) (p) = (v)
#endif
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index b741686e53d6..77a44c54998f 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -214,7 +214,7 @@ void copy_tag_check(void)
}
// printf("\ncopying tags...\n");
- tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1);
+ tagged = tag_tagged_items(&tree, start, end, ITEMS, XA_MARK_0, XA_MARK_1);
// printf("checking copied tags\n");
assert(tagged == count);
@@ -223,7 +223,7 @@ void copy_tag_check(void)
/* Copy tags in several rounds */
// printf("\ncopying tags...\n");
tmp = rand() % (count / 10 + 2);
- tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2);
+ tagged = tag_tagged_items(&tree, start, end, tmp, XA_MARK_0, XA_MARK_2);
assert(tagged == count);
// printf("%lu %lu %lu\n", tagged, tmp, count);
@@ -236,63 +236,6 @@ void copy_tag_check(void)
item_kill_tree(&tree);
}
-static void __locate_check(struct radix_tree_root *tree, unsigned long index,
- unsigned order)
-{
- struct item *item;
- unsigned long index2;
-
- item_insert_order(tree, index, order);
- item = item_lookup(tree, index);
- index2 = find_item(tree, item);
- if (index != index2) {
- printv(2, "index %ld order %d inserted; found %ld\n",
- index, order, index2);
- abort();
- }
-}
-
-static void __order_0_locate_check(void)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- int i;
-
- for (i = 0; i < 50; i++)
- __locate_check(&tree, rand() % INT_MAX, 0);
-
- item_kill_tree(&tree);
-}
-
-static void locate_check(void)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- unsigned order;
- unsigned long offset, index;
-
- __order_0_locate_check();
-
- for (order = 0; order < 20; order++) {
- for (offset = 0; offset < (1 << (order + 3));
- offset += (1UL << order)) {
- for (index = 0; index < (1UL << (order + 5));
- index += (1UL << order)) {
- __locate_check(&tree, index + offset, order);
- }
- if (find_item(&tree, &tree) != -1)
- abort();
-
- item_kill_tree(&tree);
- }
- }
-
- if (find_item(&tree, &tree) != -1)
- abort();
- __locate_check(&tree, -1, 0);
- if (find_item(&tree, &tree) != -1)
- abort();
- item_kill_tree(&tree);
-}
-
static void single_thread_tests(bool long_run)
{
int i;
@@ -303,10 +246,6 @@ static void single_thread_tests(bool long_run)
rcu_barrier();
printv(2, "after multiorder_check: %d allocated, preempt %d\n",
nr_allocated, preempt_count);
- locate_check();
- rcu_barrier();
- printv(2, "after locate_check: %d allocated, preempt %d\n",
- nr_allocated, preempt_count);
tag_check();
rcu_barrier();
printv(2, "after tag_check: %d allocated, preempt %d\n",
@@ -365,6 +304,7 @@ int main(int argc, char **argv)
rcu_register_thread();
radix_tree_init();
+ xarray_tests();
regression1_test();
regression2_test();
regression3_test();
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 7bf405638b0b..ff27a74d9762 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -20,230 +20,39 @@
#include "test.h"
-#define for_each_index(i, base, order) \
- for (i = base; i < base + (1 << order); i++)
-
-static void __multiorder_tag_test(int index, int order)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- int base, err, i;
-
- /* our canonical entry */
- base = index & ~((1 << order) - 1);
-
- printv(2, "Multiorder tag test with index %d, canonical entry %d\n",
- index, base);
-
- err = item_insert_order(&tree, index, order);
- assert(!err);
-
- /*
- * Verify we get collisions for covered indices. We try and fail to
- * insert an exceptional entry so we don't leak memory via
- * item_insert_order().
- */
- for_each_index(i, base, order) {
- err = __radix_tree_insert(&tree, i, order,
- (void *)(0xA0 | RADIX_TREE_EXCEPTIONAL_ENTRY));
- assert(err == -EEXIST);
- }
-
- for_each_index(i, base, order) {
- assert(!radix_tree_tag_get(&tree, i, 0));
- assert(!radix_tree_tag_get(&tree, i, 1));
- }
-
- assert(radix_tree_tag_set(&tree, index, 0));
-
- for_each_index(i, base, order) {
- assert(radix_tree_tag_get(&tree, i, 0));
- assert(!radix_tree_tag_get(&tree, i, 1));
- }
-
- assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1);
- assert(radix_tree_tag_clear(&tree, index, 0));
-
- for_each_index(i, base, order) {
- assert(!radix_tree_tag_get(&tree, i, 0));
- assert(radix_tree_tag_get(&tree, i, 1));
- }
-
- assert(radix_tree_tag_clear(&tree, index, 1));
-
- assert(!radix_tree_tagged(&tree, 0));
- assert(!radix_tree_tagged(&tree, 1));
-
- item_kill_tree(&tree);
-}
-
-static void __multiorder_tag_test2(unsigned order, unsigned long index2)
+static int item_insert_order(struct xarray *xa, unsigned long index,
+ unsigned order)
{
- RADIX_TREE(tree, GFP_KERNEL);
- unsigned long index = (1 << order);
- index2 += index;
-
- assert(item_insert_order(&tree, 0, order) == 0);
- assert(item_insert(&tree, index2) == 0);
-
- assert(radix_tree_tag_set(&tree, 0, 0));
- assert(radix_tree_tag_set(&tree, index2, 0));
-
- assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2);
-
- item_kill_tree(&tree);
-}
-
-static void multiorder_tag_tests(void)
-{
- int i, j;
-
- /* test multi-order entry for indices 0-7 with no sibling pointers */
- __multiorder_tag_test(0, 3);
- __multiorder_tag_test(5, 3);
-
- /* test multi-order entry for indices 8-15 with no sibling pointers */
- __multiorder_tag_test(8, 3);
- __multiorder_tag_test(15, 3);
-
- /*
- * Our order 5 entry covers indices 0-31 in a tree with height=2.
- * This is broken up as follows:
- * 0-7: canonical entry
- * 8-15: sibling 1
- * 16-23: sibling 2
- * 24-31: sibling 3
- */
- __multiorder_tag_test(0, 5);
- __multiorder_tag_test(29, 5);
-
- /* same test, but with indices 32-63 */
- __multiorder_tag_test(32, 5);
- __multiorder_tag_test(44, 5);
-
- /*
- * Our order 8 entry covers indices 0-255 in a tree with height=3.
- * This is broken up as follows:
- * 0-63: canonical entry
- * 64-127: sibling 1
- * 128-191: sibling 2
- * 192-255: sibling 3
- */
- __multiorder_tag_test(0, 8);
- __multiorder_tag_test(190, 8);
-
- /* same test, but with indices 256-511 */
- __multiorder_tag_test(256, 8);
- __multiorder_tag_test(300, 8);
-
- __multiorder_tag_test(0x12345678UL, 8);
-
- for (i = 1; i < 10; i++)
- for (j = 0; j < (10 << i); j++)
- __multiorder_tag_test2(i, j);
-}
-
-static void multiorder_check(unsigned long index, int order)
-{
- unsigned long i;
- unsigned long min = index & ~((1UL << order) - 1);
- unsigned long max = min + (1UL << order);
- void **slot;
- struct item *item2 = item_create(min, order);
- RADIX_TREE(tree, GFP_KERNEL);
-
- printv(2, "Multiorder index %ld, order %d\n", index, order);
-
- assert(item_insert_order(&tree, index, order) == 0);
-
- for (i = min; i < max; i++) {
- struct item *item = item_lookup(&tree, i);
- assert(item != 0);
- assert(item->index == index);
- }
- for (i = 0; i < min; i++)
- item_check_absent(&tree, i);
- for (i = max; i < 2*max; i++)
- item_check_absent(&tree, i);
- for (i = min; i < max; i++)
- assert(radix_tree_insert(&tree, i, item2) == -EEXIST);
-
- slot = radix_tree_lookup_slot(&tree, index);
- free(*slot);
- radix_tree_replace_slot(&tree, slot, item2);
- for (i = min; i < max; i++) {
- struct item *item = item_lookup(&tree, i);
- assert(item != 0);
- assert(item->index == min);
- }
-
- assert(item_delete(&tree, min) != 0);
-
- for (i = 0; i < 2*max; i++)
- item_check_absent(&tree, i);
-}
-
-static void multiorder_shrink(unsigned long index, int order)
-{
- unsigned long i;
- unsigned long max = 1 << order;
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_node *node;
-
- printv(2, "Multiorder shrink index %ld, order %d\n", index, order);
+ XA_STATE_ORDER(xas, xa, index, order);
+ struct item *item = item_create(index, order);
- assert(item_insert_order(&tree, 0, order) == 0);
-
- node = tree.rnode;
-
- assert(item_insert(&tree, index) == 0);
- assert(node != tree.rnode);
-
- assert(item_delete(&tree, index) != 0);
- assert(node == tree.rnode);
-
- for (i = 0; i < max; i++) {
- struct item *item = item_lookup(&tree, i);
- assert(item != 0);
- assert(item->index == 0);
- }
- for (i = max; i < 2*max; i++)
- item_check_absent(&tree, i);
-
- if (!item_delete(&tree, 0)) {
- printv(2, "failed to delete index %ld (order %d)\n", index, order);
- abort();
- }
-
- for (i = 0; i < 2*max; i++)
- item_check_absent(&tree, i);
-}
-
-static void multiorder_insert_bug(void)
-{
- RADIX_TREE(tree, GFP_KERNEL);
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, item);
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
- item_insert(&tree, 0);
- radix_tree_tag_set(&tree, 0, 0);
- item_insert_order(&tree, 3 << 6, 6);
+ if (!xas_error(&xas))
+ return 0;
- item_kill_tree(&tree);
+ free(item);
+ return xas_error(&xas);
}
-void multiorder_iteration(void)
+void multiorder_iteration(struct xarray *xa)
{
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, xa, 0);
+ struct item *item;
int i, j, err;
- printv(1, "Multiorder iteration test\n");
-
#define NUM_ENTRIES 11
int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128};
int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7};
+ printv(1, "Multiorder iteration test\n");
+
for (i = 0; i < NUM_ENTRIES; i++) {
- err = item_insert_order(&tree, index[i], order[i]);
+ err = item_insert_order(xa, index[i], order[i]);
assert(!err);
}
@@ -252,14 +61,14 @@ void multiorder_iteration(void)
if (j <= (index[i] | ((1 << order[i]) - 1)))
break;
- radix_tree_for_each_slot(slot, &tree, &iter, j) {
- int height = order[i] / RADIX_TREE_MAP_SHIFT;
- int shift = height * RADIX_TREE_MAP_SHIFT;
+ xas_set(&xas, j);
+ xas_for_each(&xas, item, ULONG_MAX) {
+ int height = order[i] / XA_CHUNK_SHIFT;
+ int shift = height * XA_CHUNK_SHIFT;
unsigned long mask = (1UL << order[i]) - 1;
- struct item *item = *slot;
- assert((iter.index | mask) == (index[i] | mask));
- assert(iter.shift == shift);
+ assert((xas.xa_index | mask) == (index[i] | mask));
+ assert(xas.xa_node->shift == shift);
assert(!radix_tree_is_internal_node(item));
assert((item->index | mask) == (index[i] | mask));
assert(item->order == order[i]);
@@ -267,18 +76,15 @@ void multiorder_iteration(void)
}
}
- item_kill_tree(&tree);
+ item_kill_tree(xa);
}
-void multiorder_tagged_iteration(void)
+void multiorder_tagged_iteration(struct xarray *xa)
{
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, xa, 0);
+ struct item *item;
int i, j;
- printv(1, "Multiorder tagged iteration test\n");
-
#define MT_NUM_ENTRIES 9
int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128};
int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7};
@@ -286,13 +92,15 @@ void multiorder_tagged_iteration(void)
#define TAG_ENTRIES 7
int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128};
+ printv(1, "Multiorder tagged iteration test\n");
+
for (i = 0; i < MT_NUM_ENTRIES; i++)
- assert(!item_insert_order(&tree, index[i], order[i]));
+ assert(!item_insert_order(xa, index[i], order[i]));
- assert(!radix_tree_tagged(&tree, 1));
+ assert(!xa_marked(xa, XA_MARK_1));
for (i = 0; i < TAG_ENTRIES; i++)
- assert(radix_tree_tag_set(&tree, tag_index[i], 1));
+ xa_set_mark(xa, tag_index[i], XA_MARK_1);
for (j = 0; j < 256; j++) {
int k;
@@ -304,23 +112,23 @@ void multiorder_tagged_iteration(void)
break;
}
- radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
+ xas_set(&xas, j);
+ xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_1) {
unsigned long mask;
- struct item *item = *slot;
for (k = i; index[k] < tag_index[i]; k++)
;
mask = (1UL << order[k]) - 1;
- assert((iter.index | mask) == (tag_index[i] | mask));
- assert(!radix_tree_is_internal_node(item));
+ assert((xas.xa_index | mask) == (tag_index[i] | mask));
+ assert(!xa_is_internal(item));
assert((item->index | mask) == (tag_index[i] | mask));
assert(item->order == order[k]);
i++;
}
}
- assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) ==
- TAG_ENTRIES);
+ assert(tag_tagged_items(xa, 0, ULONG_MAX, TAG_ENTRIES, XA_MARK_1,
+ XA_MARK_2) == TAG_ENTRIES);
for (j = 0; j < 256; j++) {
int mask, k;
@@ -332,297 +140,31 @@ void multiorder_tagged_iteration(void)
break;
}
- radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
- struct item *item = *slot;
+ xas_set(&xas, j);
+ xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_2) {
for (k = i; index[k] < tag_index[i]; k++)
;
mask = (1 << order[k]) - 1;
- assert((iter.index | mask) == (tag_index[i] | mask));
- assert(!radix_tree_is_internal_node(item));
+ assert((xas.xa_index | mask) == (tag_index[i] | mask));
+ assert(!xa_is_internal(item));
assert((item->index | mask) == (tag_index[i] | mask));
assert(item->order == order[k]);
i++;
}
}
- assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0)
- == TAG_ENTRIES);
+ assert(tag_tagged_items(xa, 1, ULONG_MAX, MT_NUM_ENTRIES * 2, XA_MARK_1,
+ XA_MARK_0) == TAG_ENTRIES);
i = 0;
- radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) {
- assert(iter.index == tag_index[i]);
+ xas_set(&xas, 0);
+ xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_0) {
+ assert(xas.xa_index == tag_index[i]);
i++;
}
+ assert(i == TAG_ENTRIES);
- item_kill_tree(&tree);
-}
-
-/*
- * Basic join checks: make sure we can't find an entry in the tree after
- * a larger entry has replaced it
- */
-static void multiorder_join1(unsigned long index,
- unsigned order1, unsigned order2)
-{
- unsigned long loc;
- void *item, *item2 = item_create(index + 1, order1);
- RADIX_TREE(tree, GFP_KERNEL);
-
- item_insert_order(&tree, index, order2);
- item = radix_tree_lookup(&tree, index);
- radix_tree_join(&tree, index + 1, order1, item2);
- loc = find_item(&tree, item);
- if (loc == -1)
- free(item);
- item = radix_tree_lookup(&tree, index + 1);
- assert(item == item2);
- item_kill_tree(&tree);
-}
-
-/*
- * Check that the accounting of exceptional entries is handled correctly
- * by joining an exceptional entry to a normal pointer.
- */
-static void multiorder_join2(unsigned order1, unsigned order2)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_node *node;
- void *item1 = item_create(0, order1);
- void *item2;
-
- item_insert_order(&tree, 0, order2);
- radix_tree_insert(&tree, 1 << order2, (void *)0x12UL);
- item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
- assert(item2 == (void *)0x12UL);
- assert(node->exceptional == 1);
-
- item2 = radix_tree_lookup(&tree, 0);
- free(item2);
-
- radix_tree_join(&tree, 0, order1, item1);
- item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
- assert(item2 == item1);
- assert(node->exceptional == 0);
- item_kill_tree(&tree);
-}
-
-/*
- * This test revealed an accounting bug for exceptional entries at one point.
- * Nodes were being freed back into the pool with an elevated exception count
- * by radix_tree_join() and then radix_tree_split() was failing to zero the
- * count of exceptional entries.
- */
-static void multiorder_join3(unsigned int order)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_node *node;
- void **slot;
- struct radix_tree_iter iter;
- unsigned long i;
-
- for (i = 0; i < (1 << order); i++) {
- radix_tree_insert(&tree, i, (void *)0x12UL);
- }
-
- radix_tree_join(&tree, 0, order, (void *)0x16UL);
- rcu_barrier();
-
- radix_tree_split(&tree, 0, 0);
-
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL);
- }
-
- __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(node->exceptional == node->count);
-
- item_kill_tree(&tree);
-}
-
-static void multiorder_join(void)
-{
- int i, j, idx;
-
- for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
- for (i = 1; i < 15; i++) {
- for (j = 0; j < i; j++) {
- multiorder_join1(idx, i, j);
- }
- }
- }
-
- for (i = 1; i < 15; i++) {
- for (j = 0; j < i; j++) {
- multiorder_join2(i, j);
- }
- }
-
- for (i = 3; i < 10; i++) {
- multiorder_join3(i);
- }
-}
-
-static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
-{
- struct radix_tree_preload *rtp = &radix_tree_preloads;
- if (rtp->nr != 0)
- printv(2, "split(%u %u) remaining %u\n", old_order, new_order,
- rtp->nr);
- /*
- * Can't check for equality here as some nodes may have been
- * RCU-freed while we ran. But we should never finish with more
- * nodes allocated since they should have all been preloaded.
- */
- if (nr_allocated > alloc)
- printv(2, "split(%u %u) allocated %u %u\n", old_order, new_order,
- alloc, nr_allocated);
-}
-
-static void __multiorder_split(int old_order, int new_order)
-{
- RADIX_TREE(tree, GFP_ATOMIC);
- void **slot;
- struct radix_tree_iter iter;
- unsigned alloc;
- struct item *item;
-
- radix_tree_preload(GFP_KERNEL);
- assert(item_insert_order(&tree, 0, old_order) == 0);
- radix_tree_preload_end();
-
- /* Wipe out the preloaded cache or it'll confuse check_mem() */
- radix_tree_cpu_dead(0);
-
- item = radix_tree_tag_set(&tree, 0, 2);
-
- radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
- alloc = nr_allocated;
- radix_tree_split(&tree, 0, new_order);
- check_mem(old_order, new_order, alloc);
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- radix_tree_iter_replace(&tree, &iter, slot,
- item_create(iter.index, new_order));
- }
- radix_tree_preload_end();
-
- item_kill_tree(&tree);
- free(item);
-}
-
-static void __multiorder_split2(int old_order, int new_order)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- void **slot;
- struct radix_tree_iter iter;
- struct radix_tree_node *node;
- void *item;
-
- __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
-
- item = __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(item == (void *)0x12);
- assert(node->exceptional > 0);
-
- radix_tree_split(&tree, 0, new_order);
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- radix_tree_iter_replace(&tree, &iter, slot,
- item_create(iter.index, new_order));
- }
-
- item = __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(item != (void *)0x12);
- assert(node->exceptional == 0);
-
- item_kill_tree(&tree);
-}
-
-static void __multiorder_split3(int old_order, int new_order)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- void **slot;
- struct radix_tree_iter iter;
- struct radix_tree_node *node;
- void *item;
-
- __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
-
- item = __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(item == (void *)0x12);
- assert(node->exceptional > 0);
-
- radix_tree_split(&tree, 0, new_order);
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16);
- }
-
- item = __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(item == (void *)0x16);
- assert(node->exceptional > 0);
-
- item_kill_tree(&tree);
-
- __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
-
- item = __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(item == (void *)0x12);
- assert(node->exceptional > 0);
-
- radix_tree_split(&tree, 0, new_order);
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- if (iter.index == (1 << new_order))
- radix_tree_iter_replace(&tree, &iter, slot,
- (void *)0x16);
- else
- radix_tree_iter_replace(&tree, &iter, slot, NULL);
- }
-
- item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL);
- assert(item == (void *)0x16);
- assert(node->count == node->exceptional);
- do {
- node = node->parent;
- if (!node)
- break;
- assert(node->count == 1);
- assert(node->exceptional == 0);
- } while (1);
-
- item_kill_tree(&tree);
-}
-
-static void multiorder_split(void)
-{
- int i, j;
-
- for (i = 3; i < 11; i++)
- for (j = 0; j < i; j++) {
- __multiorder_split(i, j);
- __multiorder_split2(i, j);
- __multiorder_split3(i, j);
- }
-}
-
-static void multiorder_account(void)
-{
- RADIX_TREE(tree, GFP_KERNEL);
- struct radix_tree_node *node;
- void **slot;
-
- item_insert_order(&tree, 0, 5);
-
- __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
- __radix_tree_lookup(&tree, 0, &node, NULL);
- assert(node->count == node->exceptional * 2);
- radix_tree_delete(&tree, 1 << 5);
- assert(node->exceptional == 0);
-
- __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
- __radix_tree_lookup(&tree, 1 << 5, &node, &slot);
- assert(node->count == node->exceptional * 2);
- __radix_tree_replace(&tree, node, slot, NULL, NULL);
- assert(node->exceptional == 0);
-
- item_kill_tree(&tree);
+ item_kill_tree(xa);
}
bool stop_iteration = false;
@@ -645,68 +187,45 @@ static void *creator_func(void *ptr)
static void *iterator_func(void *ptr)
{
- struct radix_tree_root *tree = ptr;
- struct radix_tree_iter iter;
+ XA_STATE(xas, ptr, 0);
struct item *item;
- void **slot;
while (!stop_iteration) {
rcu_read_lock();
- radix_tree_for_each_slot(slot, tree, &iter, 0) {
- item = radix_tree_deref_slot(slot);
-
- if (!item)
+ xas_for_each(&xas, item, ULONG_MAX) {
+ if (xas_retry(&xas, item))
continue;
- if (radix_tree_deref_retry(item)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- item_sanity(item, iter.index);
+ item_sanity(item, xas.xa_index);
}
rcu_read_unlock();
}
return NULL;
}
-static void multiorder_iteration_race(void)
+static void multiorder_iteration_race(struct xarray *xa)
{
const int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t worker_thread[num_threads];
- RADIX_TREE(tree, GFP_KERNEL);
int i;
- pthread_create(&worker_thread[0], NULL, &creator_func, &tree);
+ pthread_create(&worker_thread[0], NULL, &creator_func, xa);
for (i = 1; i < num_threads; i++)
- pthread_create(&worker_thread[i], NULL, &iterator_func, &tree);
+ pthread_create(&worker_thread[i], NULL, &iterator_func, xa);
for (i = 0; i < num_threads; i++)
pthread_join(worker_thread[i], NULL);
- item_kill_tree(&tree);
+ item_kill_tree(xa);
}
+static DEFINE_XARRAY(array);
+
void multiorder_checks(void)
{
- int i;
-
- for (i = 0; i < 20; i++) {
- multiorder_check(200, i);
- multiorder_check(0, i);
- multiorder_check((1UL << i) + 1, i);
- }
-
- for (i = 0; i < 15; i++)
- multiorder_shrink((1UL << (i + RADIX_TREE_MAP_SHIFT)), i);
-
- multiorder_insert_bug();
- multiorder_tag_tests();
- multiorder_iteration();
- multiorder_tagged_iteration();
- multiorder_join();
- multiorder_split();
- multiorder_account();
- multiorder_iteration_race();
+ multiorder_iteration(&array);
+ multiorder_tagged_iteration(&array);
+ multiorder_iteration_race(&array);
radix_tree_cpu_dead(0);
}
diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c
index 0aece092f40e..a61c7bcbc72d 100644
--- a/tools/testing/radix-tree/regression1.c
+++ b/tools/testing/radix-tree/regression1.c
@@ -44,7 +44,6 @@
#include "regression.h"
static RADIX_TREE(mt_tree, GFP_KERNEL);
-static pthread_mutex_t mt_lock = PTHREAD_MUTEX_INITIALIZER;
struct page {
pthread_mutex_t lock;
@@ -53,12 +52,12 @@ struct page {
unsigned long index;
};
-static struct page *page_alloc(void)
+static struct page *page_alloc(int index)
{
struct page *p;
p = malloc(sizeof(struct page));
p->count = 1;
- p->index = 1;
+ p->index = index;
pthread_mutex_init(&p->lock, NULL);
return p;
@@ -80,53 +79,33 @@ static void page_free(struct page *p)
static unsigned find_get_pages(unsigned long start,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
- unsigned int nr_found;
+ XA_STATE(xas, &mt_tree, start);
+ struct page *page;
+ unsigned int ret = 0;
rcu_read_lock();
-restart:
- nr_found = radix_tree_gang_lookup_slot(&mt_tree,
- (void ***)pages, NULL, start, nr_pages);
- ret = 0;
- for (i = 0; i < nr_found; i++) {
- struct page *page;
-repeat:
- page = radix_tree_deref_slot((void **)pages[i]);
- if (unlikely(!page))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- assert((start | i) == 0);
- goto restart;
- }
- /*
- * No exceptional entries are inserted in this test.
- */
- assert(0);
- }
-
pthread_mutex_lock(&page->lock);
- if (!page->count) {
- pthread_mutex_unlock(&page->lock);
- goto repeat;
- }
+ if (!page->count)
+ goto unlock;
+
/* don't actually update page refcount */
pthread_mutex_unlock(&page->lock);
/* Has the page moved? */
- if (unlikely(page != *((void **)pages[i]))) {
- goto repeat;
- }
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
pages[ret] = page;
ret++;
+ continue;
+unlock:
+ pthread_mutex_unlock(&page->lock);
+put_page:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -145,30 +124,30 @@ static void *regression1_fn(void *arg)
for (j = 0; j < 1000000; j++) {
struct page *p;
- p = page_alloc();
- pthread_mutex_lock(&mt_lock);
+ p = page_alloc(0);
+ xa_lock(&mt_tree);
radix_tree_insert(&mt_tree, 0, p);
- pthread_mutex_unlock(&mt_lock);
+ xa_unlock(&mt_tree);
- p = page_alloc();
- pthread_mutex_lock(&mt_lock);
+ p = page_alloc(1);
+ xa_lock(&mt_tree);
radix_tree_insert(&mt_tree, 1, p);
- pthread_mutex_unlock(&mt_lock);
+ xa_unlock(&mt_tree);
- pthread_mutex_lock(&mt_lock);
+ xa_lock(&mt_tree);
p = radix_tree_delete(&mt_tree, 1);
pthread_mutex_lock(&p->lock);
p->count--;
pthread_mutex_unlock(&p->lock);
- pthread_mutex_unlock(&mt_lock);
+ xa_unlock(&mt_tree);
page_free(p);
- pthread_mutex_lock(&mt_lock);
+ xa_lock(&mt_tree);
p = radix_tree_delete(&mt_tree, 0);
pthread_mutex_lock(&p->lock);
p->count--;
pthread_mutex_unlock(&p->lock);
- pthread_mutex_unlock(&mt_lock);
+ xa_unlock(&mt_tree);
page_free(p);
}
} else {
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 424b91c77831..f2c7e640a919 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -53,9 +53,9 @@
#include "regression.h"
#include "test.h"
-#define PAGECACHE_TAG_DIRTY 0
-#define PAGECACHE_TAG_WRITEBACK 1
-#define PAGECACHE_TAG_TOWRITE 2
+#define PAGECACHE_TAG_DIRTY XA_MARK_0
+#define PAGECACHE_TAG_WRITEBACK XA_MARK_1
+#define PAGECACHE_TAG_TOWRITE XA_MARK_2
static RADIX_TREE(mt_tree, GFP_KERNEL);
unsigned long page_count = 0;
@@ -92,7 +92,7 @@ void regression2_test(void)
/* 1. */
start = 0;
end = max_slots - 2;
- tag_tagged_items(&mt_tree, NULL, start, end, 1,
+ tag_tagged_items(&mt_tree, start, end, 1,
PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
/* 2. */
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index ace2543c3eda..9f9a3b280f56 100644
--- a/tools/testing/radix-tree/regression3.c
+++ b/tools/testing/radix-tree/regression3.c
@@ -69,21 +69,6 @@ void regression3_test(void)
continue;
}
}
- radix_tree_delete(&root, 1);
-
- first = true;
- radix_tree_for_each_contig(slot, &root, &iter, 0) {
- printv(2, "contig %ld %p\n", iter.index, *slot);
- if (first) {
- radix_tree_insert(&root, 1, ptr);
- first = false;
- }
- if (radix_tree_deref_retry(*slot)) {
- printv(2, "retry at %ld\n", iter.index);
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- }
radix_tree_for_each_slot(slot, &root, &iter, 0) {
printv(2, "slot %ld %p\n", iter.index, *slot);
@@ -93,14 +78,6 @@ void regression3_test(void)
}
}
- radix_tree_for_each_contig(slot, &root, &iter, 0) {
- printv(2, "contig %ld %p\n", iter.index, *slot);
- if (!iter.index) {
- printv(2, "next at %ld\n", iter.index);
- slot = radix_tree_iter_resume(slot, &iter);
- }
- }
-
radix_tree_tag_set(&root, 0, 0);
radix_tree_tag_set(&root, 1, 0);
radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) {
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index 543181e4847b..f898957b1a19 100644
--- a/tools/testing/radix-tree/tag_check.c
+++ b/tools/testing/radix-tree/tag_check.c
@@ -24,7 +24,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
item_tag_set(tree, index, tag);
ret = item_tag_get(tree, index, tag);
assert(ret != 0);
- ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag);
+ ret = tag_tagged_items(tree, first, ~0UL, 10, tag, !tag);
assert(ret == 1);
ret = item_tag_get(tree, index, !tag);
assert(ret != 0);
@@ -321,7 +321,7 @@ static void single_check(void)
assert(ret == 0);
verify_tag_consistency(&tree, 0);
verify_tag_consistency(&tree, 1);
- ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1);
+ ret = tag_tagged_items(&tree, first, 10, 10, XA_MARK_0, XA_MARK_1);
assert(ret == 1);
ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1);
assert(ret == 1);
@@ -331,34 +331,6 @@ static void single_check(void)
item_kill_tree(&tree);
}
-void radix_tree_clear_tags_test(void)
-{
- unsigned long index;
- struct radix_tree_node *node;
- struct radix_tree_iter iter;
- void **slot;
-
- RADIX_TREE(tree, GFP_KERNEL);
-
- item_insert(&tree, 0);
- item_tag_set(&tree, 0, 0);
- __radix_tree_lookup(&tree, 0, &node, &slot);
- radix_tree_clear_tags(&tree, node, slot);
- assert(item_tag_get(&tree, 0, 0) == 0);
-
- for (index = 0; index < 1000; index++) {
- item_insert(&tree, index);
- item_tag_set(&tree, index, 0);
- }
-
- radix_tree_for_each_slot(slot, &tree, &iter, 0) {
- radix_tree_clear_tags(&tree, iter.node, slot);
- assert(item_tag_get(&tree, iter.index, 0) == 0);
- }
-
- item_kill_tree(&tree);
-}
-
void tag_check(void)
{
single_check();
@@ -376,5 +348,4 @@ void tag_check(void)
thrash_tags();
rcu_barrier();
printv(2, "after thrash_tags: %d allocated\n", nr_allocated);
- radix_tree_clear_tags_test();
}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index def6015570b2..a15d0512e633 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -25,11 +25,6 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
return radix_tree_tag_get(root, index, tag);
}
-int __item_insert(struct radix_tree_root *root, struct item *item)
-{
- return __radix_tree_insert(root, item->index, item->order, item);
-}
-
struct item *item_create(unsigned long index, unsigned int order)
{
struct item *ret = malloc(sizeof(*ret));
@@ -39,21 +34,15 @@ struct item *item_create(unsigned long index, unsigned int order)
return ret;
}
-int item_insert_order(struct radix_tree_root *root, unsigned long index,
- unsigned order)
+int item_insert(struct radix_tree_root *root, unsigned long index)
{
- struct item *item = item_create(index, order);
- int err = __item_insert(root, item);
+ struct item *item = item_create(index, 0);
+ int err = radix_tree_insert(root, item->index, item);
if (err)
free(item);
return err;
}
-int item_insert(struct radix_tree_root *root, unsigned long index)
-{
- return item_insert_order(root, index, 0);
-}
-
void item_sanity(struct item *item, unsigned long index)
{
unsigned long mask;
@@ -63,16 +52,21 @@ void item_sanity(struct item *item, unsigned long index)
assert((item->index | mask) == (index | mask));
}
+void item_free(struct item *item, unsigned long index)
+{
+ item_sanity(item, index);
+ free(item);
+}
+
int item_delete(struct radix_tree_root *root, unsigned long index)
{
struct item *item = radix_tree_delete(root, index);
- if (item) {
- item_sanity(item, index);
- free(item);
- return 1;
- }
- return 0;
+ if (!item)
+ return 0;
+
+ item_free(item, index);
+ return 1;
}
static void item_free_rcu(struct rcu_head *head)
@@ -82,9 +76,9 @@ static void item_free_rcu(struct rcu_head *head)
free(item);
}
-int item_delete_rcu(struct radix_tree_root *root, unsigned long index)
+int item_delete_rcu(struct xarray *xa, unsigned long index)
{
- struct item *item = radix_tree_delete(root, index);
+ struct item *item = xa_erase(xa, index);
if (item) {
item_sanity(item, index);
@@ -176,59 +170,30 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
}
/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */
-int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock,
- unsigned long start, unsigned long end, unsigned batch,
- unsigned iftag, unsigned thentag)
+int tag_tagged_items(struct xarray *xa, unsigned long start, unsigned long end,
+ unsigned batch, xa_mark_t iftag, xa_mark_t thentag)
{
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, xa, start);
+ unsigned int tagged = 0;
+ struct item *item;
if (batch == 0)
batch = 1;
- if (lock)
- pthread_mutex_lock(lock);
- radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(root, &iter, thentag);
- tagged++;
- if ((tagged % batch) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, item, end, iftag) {
+ xas_set_mark(&xas, thentag);
+ if (++tagged % batch)
continue;
- slot = radix_tree_iter_resume(slot, &iter);
- if (lock) {
- pthread_mutex_unlock(lock);
- rcu_barrier();
- pthread_mutex_lock(lock);
- }
- }
- if (lock)
- pthread_mutex_unlock(lock);
-
- return tagged;
-}
-/* Use the same pattern as find_swap_entry() in mm/shmem.c */
-unsigned long find_item(struct radix_tree_root *root, void *item)
-{
- struct radix_tree_iter iter;
- void **slot;
- unsigned long found = -1;
- unsigned long checked = 0;
-
- radix_tree_for_each_slot(slot, root, &iter, 0) {
- if (*slot == item) {
- found = iter.index;
- break;
- }
- checked++;
- if ((checked % 4) != 0)
- continue;
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ rcu_barrier();
+ xas_lock_irq(&xas);
}
+ xas_unlock_irq(&xas);
- return found;
+ return tagged;
}
static int verify_node(struct radix_tree_node *slot, unsigned int tag,
@@ -281,43 +246,31 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag,
void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
{
- struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *node = root->xa_head;
if (!radix_tree_is_internal_node(node))
return;
verify_node(node, tag, !!root_tag_get(root, tag));
}
-void item_kill_tree(struct radix_tree_root *root)
+void item_kill_tree(struct xarray *xa)
{
- struct radix_tree_iter iter;
- void **slot;
- struct item *items[32];
- int nfound;
-
- radix_tree_for_each_slot(slot, root, &iter, 0) {
- if (radix_tree_exceptional_entry(*slot))
- radix_tree_delete(root, iter.index);
- }
+ XA_STATE(xas, xa, 0);
+ void *entry;
- while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) {
- int i;
-
- for (i = 0; i < nfound; i++) {
- void *ret;
-
- ret = radix_tree_delete(root, items[i]->index);
- assert(ret == items[i]);
- free(items[i]);
+ xas_for_each(&xas, entry, ULONG_MAX) {
+ if (!xa_is_value(entry)) {
+ item_free(entry, xas.xa_index);
}
+ xas_store(&xas, NULL);
}
- assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0);
- assert(root->rnode == NULL);
+
+ assert(xa_empty(xa));
}
void tree_verify_min_height(struct radix_tree_root *root, int maxindex)
{
unsigned shift;
- struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *node = root->xa_head;
if (!radix_tree_is_internal_node(node)) {
assert(maxindex == 0);
return;
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 92d901eacf49..1ee4b2c0ad10 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -11,13 +11,11 @@ struct item {
};
struct item *item_create(unsigned long index, unsigned int order);
-int __item_insert(struct radix_tree_root *root, struct item *item);
int item_insert(struct radix_tree_root *root, unsigned long index);
void item_sanity(struct item *item, unsigned long index);
-int item_insert_order(struct radix_tree_root *root, unsigned long index,
- unsigned order);
+void item_free(struct item *item, unsigned long index);
int item_delete(struct radix_tree_root *root, unsigned long index);
-int item_delete_rcu(struct radix_tree_root *root, unsigned long index);
+int item_delete_rcu(struct xarray *xa, unsigned long index);
struct item *item_lookup(struct radix_tree_root *root, unsigned long index);
void item_check_present(struct radix_tree_root *root, unsigned long index);
@@ -29,11 +27,10 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
unsigned long nr, int chunk);
void item_kill_tree(struct radix_tree_root *root);
-int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *,
- unsigned long start, unsigned long end, unsigned batch,
- unsigned iftag, unsigned thentag);
-unsigned long find_item(struct radix_tree_root *, void *item);
+int tag_tagged_items(struct xarray *, unsigned long start, unsigned long end,
+ unsigned batch, xa_mark_t iftag, xa_mark_t thentag);
+void xarray_tests(void);
void tag_check(void);
void multiorder_checks(void);
void iteration_test(unsigned order, unsigned duration);
diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c
new file mode 100644
index 000000000000..e61e43efe463
--- /dev/null
+++ b/tools/testing/radix-tree/xarray.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * xarray.c: Userspace shim for XArray test-suite
+ * Copyright (c) 2018 Matthew Wilcox <willy@infradead.org>
+ */
+
+#define XA_DEBUG
+#include "test.h"
+
+#define module_init(x)
+#define module_exit(x)
+#define MODULE_AUTHOR(x)
+#define MODULE_LICENSE(x)
+#define dump_stack() assert(0)
+
+#include "../../../lib/xarray.c"
+#undef XA_DEBUG
+#include "../../../lib/test_xarray.c"
+
+void xarray_tests(void)
+{
+ xarray_checks();
+ xarray_exit();
+}
+
+int __weak main(void)
+{
+ radix_tree_init();
+ xarray_tests();
+ radix_tree_cpu_dead(1);
+ rcu_barrier();
+ if (nr_allocated)
+ printf("nr_allocated = %d\n", nr_allocated);
+ return 0;
+}