-rw-r--r--  Documentation/device-mapper/cache-policies.txt | 67
-rw-r--r--  Documentation/device-mapper/cache.txt | 9
-rw-r--r--  Documentation/device-mapper/dm-raid.txt | 2
-rw-r--r--  Documentation/device-mapper/statistics.txt | 41
-rw-r--r--  drivers/md/Kconfig | 12
-rw-r--r--  drivers/md/Makefile | 2
-rw-r--r--  drivers/md/dm-bio-prison.c | 26
-rw-r--r--  drivers/md/dm-bio-prison.h | 13
-rw-r--r--  drivers/md/dm-cache-metadata.c | 133
-rw-r--r--  drivers/md/dm-cache-metadata.h | 10
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c | 6
-rw-r--r--  drivers/md/dm-cache-policy-internal.h | 52
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 93
-rw-r--r--  drivers/md/dm-cache-policy-smq.c | 1791
-rw-r--r--  drivers/md/dm-cache-policy.h | 30
-rw-r--r--  drivers/md/dm-cache-target.c | 832
-rw-r--r--  drivers/md/dm-crypt.c | 30
-rw-r--r--  drivers/md/dm-log-writes.c | 4
-rw-r--r--  drivers/md/dm-raid.c | 225
-rw-r--r--  drivers/md/dm-raid1.c | 75
-rw-r--r--  drivers/md/dm-stats.c | 341
-rw-r--r--  drivers/md/dm-stats.h | 4
-rw-r--r--  drivers/md/dm-stripe.c | 4
-rw-r--r--  drivers/md/dm-table.c | 4
-rw-r--r--  drivers/md/dm-thin-metadata.c | 124
-rw-r--r--  drivers/md/dm-thin-metadata.h | 11
-rw-r--r--  drivers/md/dm-thin.c | 612
-rw-r--r--  drivers/md/dm.c | 190
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 6
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.h | 1
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 127
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 9
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 50
33 files changed, 4213 insertions, 723 deletions
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt
index 0d124a971801..d9246a32e673 100644
--- a/Documentation/device-mapper/cache-policies.txt
+++ b/Documentation/device-mapper/cache-policies.txt
@@ -25,10 +25,10 @@ trying to see when the io scheduler has let the ios run.
Overview of supplied cache replacement policies
===============================================
-multiqueue
-----------
+multiqueue (mq)
+---------------
-This policy is the default.
+This policy has been deprecated in favor of the smq policy (see below).
The multiqueue policy has three sets of 16 queues: one set for entries
waiting for the cache and another two for those in the cache (a set for
@@ -73,6 +73,67 @@ If you're trying to quickly warm a new cache device you may wish to
reduce these to encourage promotion. Remember to switch them back to
their defaults after the cache fills though.
+Stochastic multiqueue (smq)
+---------------------------
+
+This policy is the default.
+
+The stochastic multi-queue (smq) policy addresses some of the problems
+with the multiqueue (mq) policy.
+
+The smq policy (vs mq) offers the promise of less memory utilization,
+improved performance and increased adaptability in the face of changing
+workloads. SMQ also does not have any cumbersome tuning knobs.
+
+Users may switch from "mq" to "smq" simply by appropriately reloading a
+DM table that is using the cache target. Doing so will cause all of the
+mq policy's hints to be dropped. Also, performance of the cache may
+degrade slightly until smq recalculates the origin device's hotspots
+that should be cached.
+
+Memory usage:
+The mq policy uses a lot of memory; 88 bytes per cache block on a
+64-bit machine.
+
+SMQ uses 28-bit indexes to implement its data structures rather than
+pointers. It avoids storing an explicit hit count for each block. It
+has a 'hotspot' queue, rather than a pre-cache, which uses a quarter of
+the entries (each hotspot block covers a larger area than a single
+cache block).
+
+All of this means smq uses ~25 bytes per cache block. Still a lot of
+memory, but a substantial improvement nonetheless.
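+As a rough, illustrative accounting (not part of the patch text): the
+smq 'struct entry' added later in this diff packs three 28-bit indexes
+plus a 7-bit level and three flag bits into about 16 bytes of bitfield
+words, followed by an 8-byte origin block number, i.e. roughly 24 bytes
+per entry; the chained hash table adds roughly one 4-byte bucket index
+per four entries. That is where the ~25 bytes per cache block figure
+comes from, though the exact size depends on the compiler's bitfield
+layout.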
+
+Level balancing:
+MQ places entries in different levels of the multiqueue structures
+based on their hit count (~ln(hit count)). This means the bottom
+levels generally have the most entries, and the top ones have very
+few. Having unbalanced levels like this reduces the efficacy of the
+multiqueue.
+
+SMQ does not maintain a hit count; instead it swaps hit entries with
+the least recently used entry from the level above. The overall
+ordering is a side effect of this stochastic process. With this
+scheme we can decide how many entries occupy each multiqueue level,
+resulting in better promotion/demotion decisions.
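+
+A minimal sketch of the swap idea follows (illustrative only; it is a
+simplification of the q_requeue_before() code added later in this patch,
+and lru_of_level()/push_to_level() are hypothetical helpers):
+
+  /*
+   * Illustrative sketch: on a hit, promote the entry one level and demote
+   * the least recently used entry from the level above into its place.
+   * Assumes 'e' has already been unlinked from its current level and that
+   * lru_of_level() pops the LRU entry of the given level.
+   */
+  static void requeue_on_hit_sketch(struct queue *q, struct entry *e)
+  {
+          unsigned above = e->level + 1u;
+
+          if (above < q->nr_levels) {
+                  struct entry *victim = lru_of_level(q, above); /* hypothetical */
+
+                  if (victim) {
+                          victim->level = e->level;  /* demote the LRU entry... */
+                          push_to_level(q, victim);  /* ...into the hit entry's old level */
+                  }
+                  e->level = above;                  /* promote the hit entry */
+          }
+          push_to_level(q, e);                       /* hypothetical */
+  }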
+
+Adaptability:
+The MQ policy maintains a hit count for each cache block. For a
+different block to get promoted to the cache its hit count has to
+exceed the lowest count currently in the cache. This means it can take a
+long time for the cache to adapt between varying IO patterns.
+Periodically degrading the hit counts could help with this, but I
+haven't found a nice general solution.
+
+SMQ doesn't maintain hit counts, so a lot of this problem just goes
+away. In addition it tracks performance of the hotspot queue, which
+is used to decide which blocks to promote. If the hotspot queue is
+performing badly then it starts moving entries more quickly between
+levels. This lets it adapt to new IO patterns very quickly.
+
+Performance:
+Testing SMQ shows substantially better performance than MQ.
+
cleaner
-------
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 68c0f517c60e..82960cffbad3 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -221,6 +221,7 @@ Status
<#read hits> <#read misses> <#write hits> <#write misses>
<#demotions> <#promotions> <#dirty> <#features> <features>*
<#core args> <core args>* <policy name> <#policy args> <policy args>*
+<cache metadata mode>
metadata block size : Fixed block size for each metadata block in
sectors
@@ -251,8 +252,12 @@ core args : Key/value pairs for tuning the core
e.g. migration_threshold
policy name : Name of the policy
#policy args : Number of policy arguments to follow (must be even)
-policy args : Key/value pairs
- e.g. sequential_threshold
+policy args : Key/value pairs e.g. sequential_threshold
+cache metadata mode : ro if read-only, rw if read-write
+ In serious cases where even a read-only mode is deemed unsafe,
+ no further I/O will be permitted and the status will just
+ contain the string 'Fail'. The userspace recovery tools
+ should then be used.
Messages
--------
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index ef8ba9fa58c4..cb12af3b51c2 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -224,3 +224,5 @@ Version History
New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
1.5.1 Add ability to restore transiently failed devices on resume.
1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check".
+1.6.0 Add discard support (and devices_handle_discard_safely module param).
+1.7.0 Add support for MD RAID0 mappings.
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index 2a1673adc200..4919b2dfd1b3 100644
--- a/Documentation/device-mapper/statistics.txt
+++ b/Documentation/device-mapper/statistics.txt
@@ -13,9 +13,14 @@ the range specified.
The I/O statistics counters for each step-sized area of a region are
in the same format as /sys/block/*/stat or /proc/diskstats (see:
Documentation/iostats.txt). But two extra counters (12 and 13) are
-provided: total time spent reading and writing in milliseconds. All
-these counters may be accessed by sending the @stats_print message to
-the appropriate DM device via dmsetup.
+provided: total time spent reading and writing. When the histogram
+argument is used, a 14th parameter is also reported, representing a
+histogram of latencies. All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel tick (jiffies). When the option precise_timestamps is used, the
+reported times are in nanoseconds.
Each region has a corresponding unique identifier, which we call a
region_id, that is assigned when the region is created. The region_id
@@ -33,7 +38,9 @@ memory is used by reading
Messages
========
- @stats_create <range> <step> [<program_id> [<aux_data>]]
+ @stats_create <range> <step>
+ [<number_of_optional_arguments> <optional_arguments>...]
+ [<program_id> [<aux_data>]]
Create a new region and return the region_id.
@@ -48,6 +55,29 @@ Messages
"/<number_of_areas>" - the range is subdivided into the specified
number of areas.
+ <number_of_optional_arguments>
+ The number of optional arguments
+
+ <optional_arguments>
+ The following optional arguments are supported
+ precise_timestamps - use precise timer with nanosecond resolution
+ instead of the "jiffies" variable. When this argument is
+ used, the resulting times are in nanoseconds instead of
+ milliseconds. Precise timestamps are a little bit slower
+ to obtain than jiffies-based timestamps.
+ histogram:n1,n2,n3,n4,... - collect histogram of latencies. The
+ numbers n1, n2, etc are times that represent the boundaries
+ of the histogram. If precise_timestamps is not used, the
+ times are in milliseconds, otherwise they are in
+ nanoseconds. For each range, the kernel will report the
+ number of requests that completed within this range. For
+ example, if we use "histogram:10,20,30", the kernel will
+ report four numbers a:b:c:d. a is the number of requests
+ that took 0-10 ms to complete, b is the number of requests
+ that took 10-20 ms to complete, c is the number of requests
+ that took 20-30 ms to complete and d is the number of
+ requests that took more than 30 ms to complete.
+
<program_id>
An optional parameter. A name that uniquely identifies
the userspace owner of the range. This groups ranges together
@@ -55,6 +85,9 @@ Messages
created and ignore those created by others.
The kernel returns this string back in the output of
@stats_list message, but it doesn't use it for anything else.
+ If we omit the number of optional arguments, program id must not
+ be a number, otherwise it would be interpreted as the number of
+ optional arguments.
<aux_data>
An optional parameter. A word that provides auxiliary data
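+
+For example (illustrative only; the device name 'mydev' and program id
+'myprog' are hypothetical), a region covering the whole device, divided
+into 100 areas, with a latency histogram requested as a single optional
+argument, could be created with:
+
+  dmsetup message mydev 0 "@stats_create - /100 1 histogram:10,20,30 myprog"
+
+The leading '1' is the number of optional arguments; supplying it keeps
+'myprog' from being misread as an optional-argument count.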
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edcf4ab66e00..b59727309072 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -304,6 +304,18 @@ config DM_CACHE_MQ
This is meant to be a general purpose policy. It prioritises
reads over writes.
+config DM_CACHE_SMQ
+ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+ depends on DM_CACHE
+ default y
+ ---help---
+ A cache policy that uses a multiqueue ordered by recent hits
+ to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes. This SMQ policy (vs MQ) offers the promise
+ of less memory utilization, improved performance and increased
+ adaptability in the face of changing workloads.
+
config DM_CACHE_CLEANER
tristate "Cleaner Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dba4db5985fb..462f443a4f85 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,6 +13,7 @@ dm-log-userspace-y \
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
dm-cache-mq-y += dm-cache-policy-mq.o
+dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index be065300e93c..cd6d1d21e057 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+static int __promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ if (bio_list_empty(&cell->bios)) {
+ rb_erase(&cell->node, &prison->cells);
+ return 1;
+ }
+
+ cell->holder = bio_list_pop(&cell->bios);
+ return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __promote_or_release(prison, cell);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 74cf01144b1f..54352f009bfd 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void (*visit_fn)(void *, struct dm_bio_prison_cell *),
void *context, struct dm_bio_prison_cell *cell);
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder. There is a race here
+ * though between releasing an empty cell, and other threads adding new
+ * inmates. So this function makes the decision with its lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell);
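+
+A hedged sketch of how a client might drain a cell with this call (not
+code from this patch; process_bio() is a hypothetical helper and the
+caller is assumed to already hold the cell with a valid holder bio):
+
+  static void drain_cell_sketch(struct dm_bio_prison *prison,
+                                struct dm_bio_prison_cell *cell)
+  {
+          do {
+                  process_bio(cell->holder);      /* hypothetical helper */
+          } while (!dm_cell_promote_or_release(prison, cell));
+
+          /*
+           * Return value 1: the cell was empty and has been released;
+           * the caller may now free the cell structure it allocated.
+           */
+  }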
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index c1c010498a21..20cc36b01b77 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -39,6 +39,8 @@
enum superblock_flag_bits {
/* for spotting crashes that would invalidate the dirty bitset */
CLEAN_SHUTDOWN,
+ /* metadata must be checked using the tools */
+ NEEDS_CHECK,
};
/*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
struct dm_disk_bitset discard_info;
struct rw_semaphore root_lock;
+ unsigned long flags;
dm_block_t root;
dm_block_t hint_root;
dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
* buffer before the superblock is locked and updated.
*/
__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ /*
+ * Set if a transaction has to be aborted but the attempt to roll
+ * back to the previous (good) transaction failed. The only
+ * metadata operation permissible in this state is the closing of
+ * the device.
+ */
+ bool fail_io:1;
};
/*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
static void read_superblock_fields(struct dm_cache_metadata *cmd,
struct cache_disk_superblock *disk_super)
{
+ cmd->flags = le32_to_cpu(disk_super->flags);
cmd->root = le64_to_cpu(disk_super->mapping_root);
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
if (mutator)
update_flags(disk_super, mutator);
+ disk_super->flags = cpu_to_le32(cmd->flags);
disk_super->mapping_root = cpu_to_le64(cmd->root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
cmd->cache_blocks = 0;
cmd->policy_hint_size = policy_hint_size;
cmd->changed = true;
+ cmd->fail_io = false;
r = __create_persistent_data_objects(cmd, may_format_device);
if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
list_del(&cmd->list);
mutex_unlock(&table_lock);
- __destroy_persistent_data_objects(cmd);
+ if (!cmd->fail_io)
+ __destroy_persistent_data_objects(cmd);
kfree(cmd);
}
}
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
return 0;
}
+#define WRITE_LOCK(cmd) \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+ return -EINVAL; \
+ down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+ return; \
+ down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+ up_write(&cmd->root_lock)
+
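+For reference, a hedged illustration of what WRITE_LOCK(cmd) expands to
+at the top of a caller such as dm_cache_resize() below; the point is the
+early return, which refuses further updates once the metadata has failed
+or been switched to read-only:
+
+  /* Expansion of WRITE_LOCK(cmd), shown for illustration only. */
+  if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+          return -EINVAL;
+  down_write(&cmd->root_lock);
+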
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
{
int r;
bool clean;
__le64 null_mapping = pack_value(0, 0);
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
__dm_bless_for_disk(&null_mapping);
if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
cmd->changed = true;
out:
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = dm_bitset_resize(&cmd->discard_info,
cmd->discard_root,
from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
}
cmd->changed = true;
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __discard(cmd, dblock, discard);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __remove(cmd, cblock);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __insert(cmd, cblock, oblock);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __dirty(cmd, cblock, dirty);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats)
{
- down_write(&cmd->root_lock);
+ WRITE_LOCK_VOID(cmd);
cmd->stats = *stats;
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
}
int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
clear_clean_shutdown);
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = __commit_transaction(cmd, mutator);
if (r)
goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
r = __begin_transaction(cmd);
out:
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
{
int r;
- down_write(&cmd->root_lock);
+ WRITE_LOCK(cmd);
r = write_hints(cmd, policy);
- up_write(&cmd->root_lock);
+ WRITE_UNLOCK(cmd);
return r;
}
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
{
return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
}
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+ WRITE_LOCK_VOID(cmd);
+ dm_bm_set_read_only(cmd->bm);
+ WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+ WRITE_LOCK_VOID(cmd);
+ dm_bm_set_read_write(cmd->bm);
+ WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct cache_disk_superblock *disk_super;
+
+ /*
+ * We ignore fail_io for this function.
+ */
+ down_write(&cmd->root_lock);
+ set_bit(NEEDS_CHECK, &cmd->flags);
+
+ r = superblock_lock(cmd, &sblock);
+ if (r) {
+ DMERR("couldn't read superblock");
+ goto out;
+ }
+
+ disk_super = dm_block_data(sblock);
+ disk_super->flags = cpu_to_le32(cmd->flags);
+
+ dm_bm_unlock(sblock);
+
+out:
+ up_write(&cmd->root_lock);
+ return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+ bool needs_check;
+
+ down_read(&cmd->root_lock);
+ needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+ up_read(&cmd->root_lock);
+
+ return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+ int r;
+
+ WRITE_LOCK(cmd);
+ __destroy_persistent_data_objects(cmd);
+ r = __create_persistent_data_objects(cmd, false);
+ if (r)
+ cmd->fail_io = true;
+ WRITE_UNLOCK(cmd);
+
+ return r;
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4ecc403be283..2ffee21f318d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -102,6 +102,10 @@ struct dm_cache_statistics {
void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats);
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
*/
int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
/*----------------------------------------------------------------*/
#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index b04d1f904d07..240c9f0e85e7 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -171,7 +171,8 @@ static void remove_cache_hash_entry(struct wb_cache_entry *e)
/* Public interface (see dm-cache-policy.h */
static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_result *result)
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
{
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
@@ -358,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
static int wb_writeback_work(struct dm_cache_policy *pe,
dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock,
+ bool critical_only)
{
int r = -ENOENT;
struct policy *p = to_policy(pe);
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 2256a1f24f73..2816018faa7f 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -7,6 +7,7 @@
#ifndef DM_CACHE_POLICY_INTERNAL_H
#define DM_CACHE_POLICY_INTERNAL_H
+#include <linux/vmalloc.h>
#include "dm-cache-policy.h"
/*----------------------------------------------------------------*/
@@ -16,9 +17,10 @@
*/
static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_result *result)
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
{
- return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+ return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
}
static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
@@ -54,9 +56,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
static inline int policy_writeback_work(struct dm_cache_policy *p,
dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock,
+ bool critical_only)
{
- return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+ return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
}
static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
@@ -80,19 +83,21 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
return p->residency(p);
}
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
{
if (p->tick)
- return p->tick(p);
+ return p->tick(p, can_block);
}
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
{
- ssize_t sz = 0;
+ ssize_t sz = *sz_ptr;
if (p->emit_config_values)
- return p->emit_config_values(p, result, maxlen);
+ return p->emit_config_values(p, result, maxlen, sz_ptr);
- DMEMIT("0");
+ DMEMIT("0 ");
+ *sz_ptr = sz;
return 0;
}
@@ -105,6 +110,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
/*----------------------------------------------------------------*/
/*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+ return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+ size_t s = bitset_size_in_bytes(nr_entries);
+ memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+ vfree(bits);
+}
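+
+A minimal usage sketch of these helpers (illustrative only; the block
+count and bit number are made up):
+
+  static int bitset_usage_sketch(void)
+  {
+          unsigned nr_blocks = 1024;              /* hypothetical size */
+          unsigned long *hit_bits = alloc_bitset(nr_blocks);
+
+          if (!hit_bits)
+                  return -ENOMEM;
+
+          set_bit(37, hit_bits);                  /* record a hit on block 37 */
+          clear_bitset(hit_bits, nr_blocks);      /* wipe at the end of a period */
+          free_bitset(hit_bits);
+          return 0;
+  }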
+
+/*----------------------------------------------------------------*/
+
+/*
* Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
*/
struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3ddd1162334d..32814371b8d3 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -693,9 +693,10 @@ static void requeue(struct mq_policy *mq, struct entry *e)
* - set the hit count to a hard coded value other than 1, eg, is it better
* if it goes in at level 2?
*/
-static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq,
+ struct policy_locker *locker, dm_oblock_t *oblock)
{
- struct entry *demoted = pop(mq, &mq->cache_clean);
+ struct entry *demoted = peek(&mq->cache_clean);
if (!demoted)
/*
@@ -707,6 +708,13 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
*/
return -ENOSPC;
+ if (locker->fn(locker, demoted->oblock))
+ /*
+ * We couldn't lock the demoted block.
+ */
+ return -EBUSY;
+
+ del(mq, demoted);
*oblock = demoted->oblock;
free_entry(&mq->cache_pool, demoted);
@@ -795,6 +803,7 @@ static int cache_entry_found(struct mq_policy *mq,
* finding which cache block to use.
*/
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+ struct policy_locker *locker,
struct policy_result *result)
{
int r;
@@ -803,11 +812,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
/* Ensure there's a free cblock in the cache */
if (epool_empty(&mq->cache_pool)) {
result->op = POLICY_REPLACE;
- r = demote_cblock(mq, &result->old_oblock);
+ r = demote_cblock(mq, locker, &result->old_oblock);
if (r) {
result->op = POLICY_MISS;
return 0;
}
+
} else
result->op = POLICY_NEW;
@@ -829,7 +839,8 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_result *result)
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
{
int r = 0;
@@ -842,7 +853,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
else {
requeue(mq, e);
- r = pre_cache_to_cache(mq, e, result);
+ r = pre_cache_to_cache(mq, e, locker, result);
}
return r;
@@ -872,6 +883,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
}
static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+ struct policy_locker *locker,
struct policy_result *result)
{
int r;
@@ -879,7 +891,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
if (epool_empty(&mq->cache_pool)) {
result->op = POLICY_REPLACE;
- r = demote_cblock(mq, &result->old_oblock);
+ r = demote_cblock(mq, locker, &result->old_oblock);
if (unlikely(r)) {
result->op = POLICY_MISS;
insert_in_pre_cache(mq, oblock);
@@ -907,11 +919,12 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_result *result)
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
{
if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
if (can_migrate)
- insert_in_cache(mq, oblock, result);
+ insert_in_cache(mq, oblock, locker, result);
else
return -EWOULDBLOCK;
} else {
@@ -928,7 +941,8 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
*/
static int map(struct mq_policy *mq, dm_oblock_t oblock,
bool can_migrate, bool discarded_oblock,
- int data_dir, struct policy_result *result)
+ int data_dir, struct policy_locker *locker,
+ struct policy_result *result)
{
int r = 0;
struct entry *e = hash_lookup(mq, oblock);
@@ -942,11 +956,11 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
else if (e)
r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
- data_dir, result);
+ data_dir, locker, result);
else
r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
- data_dir, result);
+ data_dir, locker, result);
if (r == -EWOULDBLOCK)
result->op = POLICY_MISS;
@@ -1012,7 +1026,8 @@ static void copy_tick(struct mq_policy *mq)
static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_result *result)
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
{
int r;
struct mq_policy *mq = to_mq_policy(p);
@@ -1028,7 +1043,7 @@ static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
iot_examine_bio(&mq->tracker, bio);
r = map(mq, oblock, can_migrate, discarded_oblock,
- bio_data_dir(bio), result);
+ bio_data_dir(bio), locker, result);
mutex_unlock(&mq->lock);
@@ -1221,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
}
static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
- dm_cblock_t *cblock)
+ dm_cblock_t *cblock, bool critical_only)
{
int r;
struct mq_policy *mq = to_mq_policy(p);
@@ -1268,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
return r;
}
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
{
struct mq_policy *mq = to_mq_policy(p);
unsigned long flags;
@@ -1276,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
spin_lock_irqsave(&mq->tick_lock, flags);
mq->tick_protected++;
spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ copy_tick(mq);
+ mutex_unlock(&mq->lock);
+ }
}
static int mq_set_config_value(struct dm_cache_policy *p,
@@ -1308,22 +1329,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
return 0;
}
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr)
{
- ssize_t sz = 0;
+ ssize_t sz = *sz_ptr;
struct mq_policy *mq = to_mq_policy(p);
DMEMIT("10 random_threshold %u "
"sequential_threshold %u "
"discard_promote_adjustment %u "
"read_promote_adjustment %u "
- "write_promote_adjustment %u",
+ "write_promote_adjustment %u ",
mq->tracker.thresholds[PATTERN_RANDOM],
mq->tracker.thresholds[PATTERN_SEQUENTIAL],
mq->discard_promote_adjustment,
mq->read_promote_adjustment,
mq->write_promote_adjustment);
+ *sz_ptr = sz;
return 0;
}
@@ -1408,21 +1431,12 @@ bad_pre_cache_init:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
};
-static struct dm_cache_policy_type default_policy_type = {
- .name = "default",
- .version = {1, 3, 0},
- .hint_size = 4,
- .owner = THIS_MODULE,
- .create = mq_create,
- .real = &mq_policy_type
-};
-
static int __init mq_init(void)
{
int r;
@@ -1432,36 +1446,21 @@ static int __init mq_init(void)
__alignof__(struct entry),
0, NULL);
if (!mq_entry_cache)
- goto bad;
+ return -ENOMEM;
r = dm_cache_policy_register(&mq_policy_type);
if (r) {
DMERR("register failed %d", r);
- goto bad_register_mq;
- }
-
- r = dm_cache_policy_register(&default_policy_type);
- if (!r) {
- DMINFO("version %u.%u.%u loaded",
- mq_policy_type.version[0],
- mq_policy_type.version[1],
- mq_policy_type.version[2]);
- return 0;
+ kmem_cache_destroy(mq_entry_cache);
+ return -ENOMEM;
}
- DMERR("register failed (as default) %d", r);
-
- dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
- kmem_cache_destroy(mq_entry_cache);
-bad:
- return -ENOMEM;
+ return 0;
}
static void __exit mq_exit(void)
{
dm_cache_policy_unregister(&mq_policy_type);
- dm_cache_policy_unregister(&default_policy_type);
kmem_cache_destroy(mq_entry_cache);
}
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
new file mode 100644
index 000000000000..80f02d3330e2
--- /dev/null
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -0,0 +1,1791 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+ return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+ return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+ unsigned hash_next:28;
+ unsigned prev:28;
+ unsigned next:28;
+ unsigned level:7;
+ bool dirty:1;
+ bool allocated:1;
+ bool sentinel:1;
+
+ dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+ struct entry *begin;
+ struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+ if (!nr_entries) {
+ es->begin = es->end = NULL;
+ return 0;
+ }
+
+ es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+ if (!es->begin)
+ return -ENOMEM;
+
+ es->end = es->begin + nr_entries;
+ return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+ vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+ struct entry *e;
+
+ e = es->begin + block;
+ BUG_ON(e >= es->end);
+
+ return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+ BUG_ON(e < es->begin || e >= es->end);
+ return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+ if (block == INDEXER_NULL)
+ return NULL;
+
+ return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+ unsigned nr_elts; /* excluding sentinel entries */
+ unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+ l->nr_elts = 0;
+ l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+ return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+ return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+ return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+ return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+ return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *head = l_head(es, l);
+
+ e->next = l->head;
+ e->prev = INDEXER_NULL;
+
+ if (head)
+ head->prev = l->head = to_index(es, e);
+ else
+ l->head = l->tail = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *tail = l_tail(es, l);
+
+ e->next = INDEXER_NULL;
+ e->prev = l->tail;
+
+ if (tail)
+ tail->next = l->tail = to_index(es, e);
+ else
+ l->head = l->tail = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+ struct entry *old, struct entry *e)
+{
+ struct entry *prev = l_prev(es, old);
+
+ if (!prev)
+ l_add_head(es, l, e);
+
+ else {
+ e->prev = old->prev;
+ e->next = to_index(es, old);
+ prev->next = old->prev = to_index(es, e);
+
+ if (!e->sentinel)
+ l->nr_elts++;
+ }
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+ struct entry *prev = l_prev(es, e);
+ struct entry *next = l_next(es, e);
+
+ if (prev)
+ prev->next = e->next;
+ else
+ l->head = e->next;
+
+ if (next)
+ next->prev = e->prev;
+ else
+ l->tail = e->prev;
+
+ if (!e->sentinel)
+ l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+ struct entry *e;
+
+ for (e = l_tail(es, l); e; e = l_prev(es, e))
+ if (!e->sentinel) {
+ l_del(es, l, e);
+ return e;
+ }
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom. This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+ struct entry_space *es;
+
+ unsigned nr_elts;
+ unsigned nr_levels;
+ struct ilist qs[MAX_LEVELS];
+
+ /*
+ * We maintain a count of the number of entries we would like in each
+ * level.
+ */
+ unsigned last_target_nr_elts;
+ unsigned nr_top_levels;
+ unsigned nr_in_top_levels;
+ unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+ unsigned i;
+
+ q->es = es;
+ q->nr_elts = 0;
+ q->nr_levels = nr_levels;
+
+ for (i = 0; i < q->nr_levels; i++) {
+ l_init(q->qs + i);
+ q->target_count[i] = 0u;
+ }
+
+ q->last_target_nr_elts = 0u;
+ q->nr_top_levels = 0u;
+ q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+ return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+ if (!e->sentinel)
+ q->nr_elts++;
+
+ l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+ if (!e->sentinel)
+ q->nr_elts++;
+
+ l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+ l_del(q->es, q->qs + e->level, e);
+ if (!e->sentinel)
+ q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+ unsigned level;
+ struct entry *e;
+
+ max_level = min(max_level, q->nr_levels);
+
+ for (level = 0; level < max_level; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+ if (e->sentinel) {
+ if (can_cross_sentinel)
+ continue;
+ else
+ break;
+ }
+
+ return e;
+ }
+
+ return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+ struct entry *e = q_peek(q, q->nr_levels, true);
+
+ if (e)
+ q_del(q, e);
+
+ return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+ struct entry *e = q_peek(q, max_level, false);
+
+ if (e)
+ q_del(q, e);
+
+ return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop. It's only
+ * used by redistribute, so we know this is true. It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+ struct entry *e;
+
+ for (; level < q->nr_levels; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+ if (!e->sentinel) {
+ l_del(q->es, q->qs + e->level, e);
+ return e;
+ }
+
+ return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+ unsigned level, nr_levels, entries_per_level, remainder;
+
+ BUG_ON(lbegin > lend);
+ BUG_ON(lend > q->nr_levels);
+ nr_levels = lend - lbegin;
+ entries_per_level = safe_div(nr_elts, nr_levels);
+ remainder = safe_mod(nr_elts, nr_levels);
+
+ for (level = lbegin; level < lend; level++)
+ q->target_count[level] =
+ (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
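+
+/*
+ * A worked example (illustrative): with nr_elts = 10 spread over levels
+ * 0 to 2 (lbegin = 0, lend = 3), entries_per_level = 3 and remainder = 1,
+ * so the targets become 4, 3 and 3; the first 'remainder' levels each
+ * take one extra entry.
+ */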
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+ if (q->last_target_nr_elts == q->nr_elts)
+ return;
+
+ q->last_target_nr_elts = q->nr_elts;
+
+ if (q->nr_top_levels > q->nr_levels)
+ q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+ else {
+ q_set_targets_subrange_(q, q->nr_in_top_levels,
+ q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+ if (q->nr_in_top_levels < q->nr_elts)
+ q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+ 0, q->nr_levels - q->nr_top_levels);
+ else
+ q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+ }
+}
+
+static void q_redistribute(struct queue *q)
+{
+ unsigned target, level;
+ struct ilist *l, *l_above;
+ struct entry *e;
+
+ q_set_targets(q);
+
+ for (level = 0u; level < q->nr_levels - 1u; level++) {
+ l = q->qs + level;
+ target = q->target_count[level];
+
+ /*
+ * Pull down some entries from the level above.
+ */
+ while (l->nr_elts < target) {
+ e = __redist_pop_from(q, level + 1u);
+ if (!e) {
+ /* bug in nr_elts */
+ break;
+ }
+
+ e->level = level;
+ l_add_tail(q->es, l, e);
+ }
+
+ /*
+ * Push some entries up.
+ */
+ l_above = q->qs + level + 1u;
+ while (l->nr_elts > target) {
+ e = l_pop_tail(q->es, l);
+
+ if (!e)
+ /* bug in nr_elts */
+ break;
+
+ e->level = level + 1u;
+ l_add_head(q->es, l_above, e);
+ }
+ }
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+ struct entry *de;
+ unsigned new_level;
+
+ q_del(q, e);
+
+ if (extra_levels && (e->level < q->nr_levels - 1u)) {
+ new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+ for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+ if (de->sentinel)
+ continue;
+
+ q_del(q, de);
+ de->level = e->level;
+
+ if (dest)
+ q_push_before(q, dest, de);
+ else
+ q_push(q, de);
+ break;
+ }
+
+ e->level = new_level;
+ }
+
+ q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+ q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+ unsigned hit_threshold;
+ unsigned hits;
+ unsigned misses;
+};
+
+enum performance {
+ Q_POOR,
+ Q_FAIR,
+ Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+ s->hit_threshold = (nr_levels * 3u) / 4u;
+ s->hits = 0u;
+ s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+ s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+ if (level >= s->hit_threshold)
+ s->hits++;
+ else
+ s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+ s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes. We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+ unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+ if (confidence < SIXTEENTH)
+ return Q_POOR;
+
+ else if (confidence < EIGHTH)
+ return Q_FAIR;
+
+ else
+ return Q_WELL;
+}
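+
+/*
+ * A worked example (illustrative): with FP_SHIFT = 8 the confidence is
+ * hits * 256 / (hits + misses), so SIXTEENTH (16) and EIGHTH (32)
+ * correspond to hit fractions of 1/16 and 1/8. For instance 5 hits and
+ * 95 misses gives 12 (Q_POOR), 10 hits and 90 misses gives 25 (Q_FAIR),
+ * and 20 hits and 80 misses gives 51 (Q_WELL).
+ */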
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+ struct entry_space *es;
+ unsigned long long hash_bits;
+ unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table. To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+ unsigned i, nr_buckets;
+
+ ht->es = es;
+ nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+ ht->hash_bits = ffs(nr_buckets) - 1;
+
+ ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+ if (!ht->buckets)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_buckets; i++)
+ ht->buckets[i] = INDEXER_NULL;
+
+ return 0;
+}
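+
+/*
+ * A worked example (illustrative): with nr_entries = 1000 this gives
+ * nr_buckets = roundup_pow_of_two(max(250, 16)) = 256 and
+ * hash_bits = ffs(256) - 1 = 8, i.e. roughly four entries per bucket.
+ */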
+
+static void h_exit(struct hash_table *ht)
+{
+ vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+ return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+ return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+ e->hash_next = ht->buckets[bucket];
+ ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+ __h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+ struct entry **prev)
+{
+ struct entry *e;
+
+ *prev = NULL;
+ for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+ if (e->oblock == oblock)
+ return e;
+
+ *prev = e;
+ }
+
+ return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+ struct entry *e, struct entry *prev)
+{
+ if (prev)
+ prev->hash_next = e->hash_next;
+ else
+ ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves each entry to the front of the bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+ struct entry *e, *prev;
+ unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+ e = __h_lookup(ht, h, oblock, &prev);
+ if (e && prev) {
+ /*
+ * Move to the front because this entry is likely
+ * to be hit again.
+ */
+ __h_unlink(ht, h, e, prev);
+ __h_insert(ht, h, e);
+ }
+
+ return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+ unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+ struct entry *prev;
+
+ /*
+	 * The downside of using a singly linked list is that we have to
+	 * iterate the bucket to remove an item.
+ */
+ e = __h_lookup(ht, h, e->oblock, &prev);
+ if (e)
+ __h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+ struct entry_space *es;
+ unsigned begin;
+
+ unsigned nr_allocated;
+ struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+ unsigned begin, unsigned end)
+{
+ unsigned i;
+
+ ea->es = es;
+ ea->nr_allocated = 0u;
+ ea->begin = begin;
+
+ l_init(&ea->free);
+ for (i = begin; i != end; i++)
+ l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+ /*
+ * We can't memset because that would clear the hotspot and
+ * sentinel bits which remain constant.
+ */
+ e->hash_next = INDEXER_NULL;
+ e->next = INDEXER_NULL;
+ e->prev = INDEXER_NULL;
+ e->level = 0u;
+ e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+ struct entry *e;
+
+ if (l_empty(&ea->free))
+ return NULL;
+
+ e = l_pop_tail(ea->es, &ea->free);
+ init_entry(e);
+ ea->nr_allocated++;
+
+ return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+ struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+ BUG_ON(e->allocated);
+
+ l_del(ea->es, &ea->free, e);
+ init_entry(e);
+ ea->nr_allocated++;
+
+ return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+ BUG_ON(!ea->nr_allocated);
+ BUG_ON(!e->allocated);
+
+ ea->nr_allocated--;
+ e->allocated = false;
+ l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+ return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+ return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+ return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+ struct dm_cache_policy policy;
+
+ /* protects everything */
+ struct mutex lock;
+ dm_cblock_t cache_size;
+ sector_t cache_block_size;
+
+ sector_t hotspot_block_size;
+ unsigned nr_hotspot_blocks;
+ unsigned cache_blocks_per_hotspot_block;
+ unsigned hotspot_level_jump;
+
+ struct entry_space es;
+ struct entry_alloc writeback_sentinel_alloc;
+ struct entry_alloc demote_sentinel_alloc;
+ struct entry_alloc hotspot_alloc;
+ struct entry_alloc cache_alloc;
+
+ unsigned long *hotspot_hit_bits;
+ unsigned long *cache_hit_bits;
+
+ /*
+ * We maintain three queues of entries. The cache proper,
+ * consisting of a clean and dirty queue, containing the currently
+ * active mappings. The hotspot queue uses a larger block size to
+ * track blocks that are being hit frequently and potential
+ * candidates for promotion to the cache.
+ */
+ struct queue hotspot;
+ struct queue clean;
+ struct queue dirty;
+
+ struct stats hotspot_stats;
+ struct stats cache_stats;
+
+ /*
+ * Keeps track of time, incremented by the core. We use this to
+ * avoid attributing multiple hits within the same tick.
+ *
+ * Access to tick_protected should be done with the spin lock held.
+ * It's copied to tick at the start of the map function (within the
+ * mutex).
+ */
+ spinlock_t tick_lock;
+ unsigned tick_protected;
+ unsigned tick;
+
+ /*
+	 * The hash tables allow us to quickly find an entry by origin
+ * block.
+ */
+ struct hash_table table;
+ struct hash_table hotspot_table;
+
+ bool current_writeback_sentinels;
+ unsigned long next_writeback_period;
+
+ bool current_demote_sentinels;
+ unsigned long next_demote_period;
+
+ unsigned write_promote_level;
+ unsigned read_promote_level;
+
+ unsigned long next_hotspot_period;
+ unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+ return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+ return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+ return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+ unsigned level;
+ struct queue *q = &mq->dirty;
+ struct entry *sentinel;
+
+ for (level = 0; level < q->nr_levels; level++) {
+ sentinel = writeback_sentinel(mq, level);
+ q_del(q, sentinel);
+ q_push(q, sentinel);
+ }
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+ unsigned level;
+ struct queue *q = &mq->clean;
+ struct entry *sentinel;
+
+ for (level = 0; level < q->nr_levels; level++) {
+ sentinel = demote_sentinel(mq, level);
+ q_del(q, sentinel);
+ q_push(q, sentinel);
+ }
+}
+
+static void update_sentinels(struct smq_policy *mq)
+{
+ if (time_after(jiffies, mq->next_writeback_period)) {
+ __update_writeback_sentinels(mq);
+ mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+ mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ }
+
+ if (time_after(jiffies, mq->next_demote_period)) {
+ __update_demote_sentinels(mq);
+ mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+ mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ }
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+ unsigned level;
+ struct entry *sentinel;
+
+ for (level = 0; level < NR_CACHE_LEVELS; level++) {
+ sentinel = writeback_sentinel(mq, level);
+ sentinel->level = level;
+ q_push(&mq->dirty, sentinel);
+
+ sentinel = demote_sentinel(mq, level);
+ sentinel->level = level;
+ q_push(&mq->clean, sentinel);
+ }
+}
+
+static void sentinels_init(struct smq_policy *mq)
+{
+ mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+ mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+ mq->current_writeback_sentinels = false;
+ mq->current_demote_sentinels = false;
+ __sentinels_init(mq);
+
+ mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+ mq->current_demote_sentinels = !mq->current_demote_sentinels;
+ __sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+ struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+ h_insert(&mq->table, e);
+ q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+ struct entry *sentinel;
+
+ h_insert(&mq->table, e);
+
+ /*
+ * Punch this into the queue just in front of the sentinel, to
+ * ensure it's cleaned straight away.
+ */
+ if (e->dirty) {
+ sentinel = writeback_sentinel(mq, e->level);
+ q_push_before(&mq->dirty, sentinel, e);
+ } else {
+ sentinel = demote_sentinel(mq, e->level);
+ q_push_before(&mq->clean, sentinel, e);
+ }
+}
+
+/*
+ * Removes an entry from cache. Removes from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+ q_del(q, e);
+ h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+ __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+ struct entry *e = q_pop_old(q, max_level);
+ if (e)
+ h_remove(&mq->table, e);
+ return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+ return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+ struct entry *sentinel;
+
+ if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+ if (e->dirty) {
+ sentinel = writeback_sentinel(mq, e->level);
+ q_requeue_before(&mq->dirty, sentinel, e, 1u);
+ } else {
+ sentinel = demote_sentinel(mq, e->level);
+ q_requeue_before(&mq->clean, sentinel, e, 1u);
+ }
+ }
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+ /*
+ * The promote level depends on the current performance of the
+ * cache.
+ *
+ * If the cache is performing badly, then we can't afford
+ * to promote much without causing performance to drop below that
+ * of the origin device.
+ *
+ * If the cache is performing well, then we don't need to promote
+ * much. If it isn't broken, don't fix it.
+ *
+ * If the cache is middling then we promote more.
+ *
+ * This scheme reminds me of a graph of entropy vs probability of a
+ * binary variable.
+ */
+ static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+ unsigned hits = mq->cache_stats.hits;
+ unsigned misses = mq->cache_stats.misses;
+ unsigned index = safe_div(hits << 4u, hits + misses);
+ return table[index];
+}
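
For intuition, here is a minimal stand-alone sketch of the table lookup above. The promote_level() helper, the sample hit/miss counts, and the assumption that safe_div() returns 0 for a zero divisor are illustrative only, not part of the patch.

#include <stdio.h>

/* Hypothetical user-space restatement of default_promote_level(). */
static unsigned promote_level(unsigned hits, unsigned misses)
{
	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
	unsigned denom = hits + misses;
	unsigned index = denom ? (hits << 4u) / denom : 0;	/* always in 0..16 */

	return table[index];
}

int main(void)
{
	/* 300 hits, 100 misses -> index 12 -> promote level 3 */
	printf("%u\n", promote_level(300, 100));
	return 0;
}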
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+ /*
+ * If there are unused cache entries then we want to be really
+ * eager to promote.
+ */
+ unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+ default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+ /*
+ * If the hotspot queue is performing badly then we have little
+ * confidence that we know which blocks to promote. So we cut down
+ * the amount of promotions.
+ */
+ switch (stats_assess(&mq->hotspot_stats)) {
+ case Q_POOR:
+ threshold_level /= 4u;
+ break;
+
+ case Q_FAIR:
+ threshold_level /= 2u;
+ break;
+
+ case Q_WELL:
+ break;
+ }
+
+ mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+ mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try and move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+ switch (stats_assess(&mq->hotspot_stats)) {
+ case Q_POOR:
+ mq->hotspot_level_jump = 4u;
+ break;
+
+ case Q_FAIR:
+ mq->hotspot_level_jump = 2u;
+ break;
+
+ case Q_WELL:
+ mq->hotspot_level_jump = 1u;
+ break;
+ }
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+ clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+ update_promote_levels(mq);
+
+ if (time_after(jiffies, mq->next_hotspot_period)) {
+ update_level_jump(mq);
+ q_redistribute(&mq->hotspot);
+ stats_reset(&mq->hotspot_stats);
+ mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+ }
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+ if (time_after(jiffies, mq->next_cache_period)) {
+ clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+ q_redistribute(&mq->dirty);
+ q_redistribute(&mq->clean);
+ stats_reset(&mq->cache_stats);
+
+ mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+ }
+}
+
+static int demote_cblock(struct smq_policy *mq,
+ struct policy_locker *locker,
+ dm_oblock_t *oblock)
+{
+ struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+ if (!demoted)
+ /*
+ * We could get a block from mq->dirty, but that
+ * would add extra latency to the triggering bio as it
+ * waits for the writeback. Better to not promote this
+ * time and hope there's a clean block next time this block
+ * is hit.
+ */
+ return -ENOSPC;
+
+ if (locker->fn(locker, demoted->oblock))
+ /*
+ * We couldn't lock this block.
+ */
+ return -EBUSY;
+
+ del(mq, demoted);
+ *oblock = demoted->oblock;
+ free_entry(&mq->cache_alloc, demoted);
+
+ return 0;
+}
+
+enum promote_result {
+ PROMOTE_NOT,
+ PROMOTE_TEMPORARY,
+ PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+ return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+ bool fast_promote)
+{
+ if (bio_data_dir(bio) == WRITE) {
+ if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+ return PROMOTE_TEMPORARY;
+
+ else
+ return maybe_promote(hs_e->level >= mq->write_promote_level);
+ } else
+ return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+ struct policy_locker *locker,
+ struct policy_result *result, enum promote_result pr)
+{
+ int r;
+ struct entry *e;
+
+ if (allocator_empty(&mq->cache_alloc)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, locker, &result->old_oblock);
+ if (r) {
+ result->op = POLICY_MISS;
+ return;
+ }
+
+ } else
+ result->op = POLICY_NEW;
+
+ e = alloc_entry(&mq->cache_alloc);
+ BUG_ON(!e);
+ e->oblock = oblock;
+
+ if (pr == PROMOTE_TEMPORARY)
+ push(mq, e);
+ else
+ push_new(mq, e);
+
+ result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+ sector_t r = from_oblock(b);
+ (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+ return to_oblock(r);
+}
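
For example (the ratio is purely illustrative), with cache_blocks_per_hotspot_block == 16 the division above groups sixteen consecutive origin blocks into one hotspot block:

/*
 *   oblock  0..15 -> hblock 0
 *   oblock 16..31 -> hblock 1
 *   oblock 37     -> hblock 2   (37 / 16 == 2)
 */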
+
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+ unsigned hi;
+ dm_oblock_t hb = to_hblock(mq, b);
+ struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+ if (e) {
+ stats_level_accessed(&mq->hotspot_stats, e->level);
+
+ hi = get_index(&mq->hotspot_alloc, e);
+ q_requeue(&mq->hotspot, e,
+ test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+ 0u : mq->hotspot_level_jump);
+
+ } else {
+ stats_miss(&mq->hotspot_stats);
+
+ e = alloc_entry(&mq->hotspot_alloc);
+ if (!e) {
+ e = q_pop(&mq->hotspot);
+ if (e) {
+ h_remove(&mq->hotspot_table, e);
+ hi = get_index(&mq->hotspot_alloc, e);
+ clear_bit(hi, mq->hotspot_hit_bits);
+ }
+
+ }
+
+ if (e) {
+ e->oblock = hb;
+ q_push(&mq->hotspot, e);
+ h_insert(&mq->hotspot_table, e);
+ }
+ }
+
+ return e;
+}
+
+/*
+ * Looks the oblock up in the hash table, then decides whether it's a hit,
+ * a plain miss, or a miss that should trigger a promotion to the cache.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+ bool can_migrate, bool fast_promote,
+ struct policy_locker *locker, struct policy_result *result)
+{
+ struct entry *e, *hs_e;
+ enum promote_result pr;
+
+ hs_e = update_hotspot_queue(mq, oblock, bio);
+
+ e = h_lookup(&mq->table, oblock);
+ if (e) {
+ stats_level_accessed(&mq->cache_stats, e->level);
+
+ requeue(mq, e);
+ result->op = POLICY_HIT;
+ result->cblock = infer_cblock(mq, e);
+
+ } else {
+ stats_miss(&mq->cache_stats);
+
+ pr = should_promote(mq, hs_e, bio, fast_promote);
+ if (pr == PROMOTE_NOT)
+ result->op = POLICY_MISS;
+
+ else {
+ if (!can_migrate) {
+ result->op = POLICY_MISS;
+ return -EWOULDBLOCK;
+ }
+
+ insert_in_cache(mq, oblock, locker, result, pr);
+ }
+ }
+
+ return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct. See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+ return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ h_exit(&mq->hotspot_table);
+ h_exit(&mq->table);
+ free_bitset(mq->hotspot_hit_bits);
+ free_bitset(mq->cache_hit_bits);
+ space_exit(&mq->es);
+ kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+ unsigned long flags, tick;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ tick = mq->tick_protected;
+ if (tick != mq->tick) {
+ update_sentinels(mq);
+ end_hotspot_period(mq);
+ end_cache_period(mq);
+ mq->tick = tick;
+ }
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ return true;
+ } else
+ return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+ bool can_block, bool can_migrate, bool fast_promote,
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ result->op = POLICY_MISS;
+
+ if (!maybe_lock(mq, can_block))
+ return -EWOULDBLOCK;
+
+ copy_tick(mq);
+ r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
+
+ if (!mutex_trylock(&mq->lock))
+ return -EWOULDBLOCK;
+
+ e = h_lookup(&mq->table, oblock);
+ if (e) {
+ *cblock = infer_cblock(mq, e);
+ r = 0;
+ } else
+ r = -ENOENT;
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+ struct entry *e;
+
+ e = h_lookup(&mq->table, oblock);
+ BUG_ON(!e);
+
+ del(mq, e);
+ e->dirty = set;
+ push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __smq_set_clear_dirty(mq, oblock, true);
+ mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __smq_set_clear_dirty(mq, oblock, false);
+ mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ uint32_t hint, bool hint_valid)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ struct entry *e;
+
+ e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+ e->oblock = oblock;
+ e->dirty = false; /* this gets corrected in a minute */
+ e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+ push(mq, e);
+
+ return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+ policy_walk_fn fn, void *context)
+{
+ int r;
+ unsigned level;
+ struct entry *e;
+
+ for (level = 0; level < q->nr_levels; level++)
+ for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+ if (!e->sentinel) {
+ r = fn(context, infer_cblock(mq, e),
+ e->oblock, e->level);
+ if (r)
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+ void *context)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ int r = 0;
+
+ mutex_lock(&mq->lock);
+
+ r = smq_save_hints(mq, &mq->clean, fn, context);
+ if (!r)
+ r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e;
+
+ e = h_lookup(&mq->table, oblock);
+ BUG_ON(!e);
+
+ del(mq, e);
+ free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+ struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+ if (!e || !e->allocated)
+ return -ENODATA;
+
+ del(mq, e);
+ free_entry(&mq->cache_alloc, e);
+
+ return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __remove_cblock(mq, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+ if (critical) {
+ /*
+ * Cache entries may not be populated. So we cannot rely on the
+ * size of the clean queue.
+ */
+ unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+ unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+ return nr_clean >= target;
+ } else
+ return !q_size(&mq->dirty);
+}
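
A rough stand-alone sketch of the critical branch above; the helper name and the example sizes are made up, and unallocated cache entries are counted as clean, mirroring the comment in the code.

/* Hypothetical user-space restatement of the critical check. */
static int critical_clean_target_met(unsigned cache_size, unsigned nr_dirty)
{
	unsigned nr_clean = cache_size - nr_dirty;	/* unallocated counts as clean */
	unsigned target = cache_size * 5u / 100u;	/* CLEAN_TARGET_CRITICAL */

	return nr_clean >= target;
}

/* e.g. critical_clean_target_met(10000, 9700) == 0: only 300 clean vs a target of 500 */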
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+ dm_cblock_t *cblock, bool critical_only)
+{
+ struct entry *e = NULL;
+ bool target_met = clean_target_met(mq, critical_only);
+
+ if (critical_only)
+ /*
+ * Always try and keep the bottom level clean.
+ */
+ e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+ else
+ e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+ if (!e)
+ return -ENODATA;
+
+ *oblock = e->oblock;
+ *cblock = infer_cblock(mq, e);
+ e->dirty = false;
+ push_new(mq, e);
+
+ return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+ dm_cblock_t *cblock, bool critical_only)
+{
+ int r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = h_lookup(&mq->table, current_oblock);
+
+ if (e) {
+ del(mq, e);
+ e->oblock = new_oblock;
+ e->dirty = true;
+ push(mq, e);
+ }
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __force_mapping(mq, current_oblock, new_oblock);
+ mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+ dm_cblock_t r;
+ struct smq_policy *mq = to_smq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = to_cblock(mq->cache_alloc.nr_allocated);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
+{
+ struct smq_policy *mq = to_smq_policy(p);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mq->tick_lock, flags);
+ mq->tick_protected++;
+ spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+ if (can_block) {
+ mutex_lock(&mq->lock);
+ copy_tick(mq);
+ mutex_unlock(&mq->lock);
+ }
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+ mq->policy.destroy = smq_destroy;
+ mq->policy.map = smq_map;
+ mq->policy.lookup = smq_lookup;
+ mq->policy.set_dirty = smq_set_dirty;
+ mq->policy.clear_dirty = smq_clear_dirty;
+ mq->policy.load_mapping = smq_load_mapping;
+ mq->policy.walk_mappings = smq_walk_mappings;
+ mq->policy.remove_mapping = smq_remove_mapping;
+ mq->policy.remove_cblock = smq_remove_cblock;
+ mq->policy.writeback_work = smq_writeback_work;
+ mq->policy.force_mapping = smq_force_mapping;
+ mq->policy.residency = smq_residency;
+ mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+ sector_t hotspot_block_size,
+ unsigned nr_hotspot_blocks)
+{
+ return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
+static void calc_hotspot_params(sector_t origin_size,
+ sector_t cache_block_size,
+ unsigned nr_cache_blocks,
+ sector_t *hotspot_block_size,
+ unsigned *nr_hotspot_blocks)
+{
+ *hotspot_block_size = cache_block_size * 16u;
+ *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+ while ((*hotspot_block_size > cache_block_size) &&
+ too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+ *hotspot_block_size /= 2u;
+}
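
A worked example of the sizing loop above, using made-up device sizes:

/*
 * origin_size      = 4,000,000 sectors (~1.9 GiB with 512-byte sectors)
 * cache_block_size = 512 sectors, nr_cache_blocks = 2000
 *
 * nr_hotspot_blocks  = max(2000 / 4, 1024) = 1024
 * hotspot_block_size = 512 * 16            = 8192 sectors
 *   8192 * 1024 = 8,388,608 > origin_size  -> halve to 4096
 *   4096 * 1024 = 4,194,304 > origin_size  -> halve to 2048
 *   2048 * 1024 = 2,097,152 <= origin_size -> stop
 *
 * i.e. each hotspot block ends up covering four cache blocks.
 */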
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+ sector_t origin_size,
+ sector_t cache_block_size)
+{
+ unsigned i;
+ unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+ unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+ struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+ if (!mq)
+ return NULL;
+
+ init_policy_functions(mq);
+ mq->cache_size = cache_size;
+ mq->cache_block_size = cache_block_size;
+
+ calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+ &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+ mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+ mq->hotspot_level_jump = 1u;
+ if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+ DMERR("couldn't initialize entry space");
+ goto bad_pool_init;
+ }
+
+ init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+ for (i = 0; i < nr_sentinels_per_queue; i++)
+ get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+ init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+ for (i = 0; i < nr_sentinels_per_queue; i++)
+ get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+ init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+ total_sentinels + mq->nr_hotspot_blocks);
+
+ init_allocator(&mq->cache_alloc, &mq->es,
+ total_sentinels + mq->nr_hotspot_blocks,
+ total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+ mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+ if (!mq->hotspot_hit_bits) {
+ DMERR("couldn't allocate hotspot hit bitset");
+ goto bad_hotspot_hit_bits;
+ }
+ clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+
+ if (from_cblock(cache_size)) {
+ mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+ if (!mq->cache_hit_bits) {
+ DMERR("couldn't allocate cache hit bitset");
+ goto bad_cache_hit_bits;
+ }
+ clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+ } else
+ mq->cache_hit_bits = NULL;
+
+ mq->tick_protected = 0;
+ mq->tick = 0;
+ mutex_init(&mq->lock);
+ spin_lock_init(&mq->tick_lock);
+
+ q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+ mq->hotspot.nr_top_levels = 8;
+ mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+ from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+ q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+ q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+ stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+ stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+ if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+ goto bad_alloc_table;
+
+ if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+ goto bad_alloc_hotspot_table;
+
+ sentinels_init(mq);
+ mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+ mq->next_hotspot_period = jiffies;
+ mq->next_cache_period = jiffies;
+
+ return &mq->policy;
+
+bad_alloc_hotspot_table:
+ h_exit(&mq->table);
+bad_alloc_table:
+ free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+ free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+ space_exit(&mq->es);
+bad_pool_init:
+ kfree(mq);
+
+ return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+ .name = "smq",
+ .version = {1, 0, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = smq_create
+};
+
+static struct dm_cache_policy_type default_policy_type = {
+ .name = "default",
+ .version = {1, 0, 0},
+ .hint_size = 4,
+ .owner = THIS_MODULE,
+ .create = smq_create,
+ .real = &smq_policy_type
+};
+
+static int __init smq_init(void)
+{
+ int r;
+
+ r = dm_cache_policy_register(&smq_policy_type);
+ if (r) {
+ DMERR("register failed %d", r);
+ return -ENOMEM;
+ }
+
+ r = dm_cache_policy_register(&default_policy_type);
+ if (r) {
+ DMERR("register failed (as default) %d", r);
+ dm_cache_policy_unregister(&smq_policy_type);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit smq_exit(void)
+{
+ dm_cache_policy_unregister(&smq_policy_type);
+ dm_cache_policy_unregister(&default_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f50fe360c546..05db56eedb6a 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -70,6 +70,18 @@ enum policy_operation {
};
/*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted. This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+ policy_lock_fn fn;
+};
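
For illustration only, a trivial locker could look like the sketch below; the names are hypothetical. Returning 0 tells the policy the block was locked and the demotion may proceed, while a non-zero return means the block is busy.

/* Hypothetical locker whose callback always claims the lock. */
static int example_never_busy(struct policy_locker *l, dm_oblock_t oblock)
{
	return 0;	/* 0: locked, demotion may go ahead; non-zero: busy */
}

static struct policy_locker example_locker = {
	.fn = example_never_busy,
};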
+
+/*
* This is the instruction passed back to the core target.
*/
struct policy_result {
@@ -122,7 +134,8 @@ struct dm_cache_policy {
*/
int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
- struct bio *bio, struct policy_result *result);
+ struct bio *bio, struct policy_locker *locker,
+ struct policy_result *result);
/*
* Sometimes we want to see if a block is in the cache, without
@@ -165,7 +178,9 @@ struct dm_cache_policy {
int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
- * Provide a dirty block to be written back by the core target.
+ * Provide a dirty block to be written back by the core target. If
+ * critical_only is set then the policy should only provide work if
+ * it urgently needs doing.
*
* Returns:
*
@@ -173,7 +188,8 @@ struct dm_cache_policy {
*
* -ENODATA: no dirty blocks available
*/
- int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+ bool critical_only);
/*
* How full is the cache?
@@ -184,16 +200,16 @@ struct dm_cache_policy {
* Because of where we sit in the block layer, we can be asked to
* map a lot of little bios that are all in the same block (no
* queue merging has occurred). To stop the policy being fooled by
- * these the core target sends regular tick() calls to the policy.
+ * these, the core target sends regular tick() calls to the policy.
* The policy should only count an entry as hit once per tick.
*/
- void (*tick)(struct dm_cache_policy *p);
+ void (*tick)(struct dm_cache_policy *p, bool can_block);
/*
* Configuration.
*/
- int (*emit_config_values)(struct dm_cache_policy *p,
- char *result, unsigned maxlen);
+ int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+ unsigned maxlen, ssize_t *sz_ptr);
int (*set_config_value)(struct dm_cache_policy *p,
const char *key, const char *value);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 41b2594a80c6..1b4e1756b169 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -25,44 +25,93 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
/*----------------------------------------------------------------*/
-/*
- * Glossary:
- *
- * oblock: index of an origin block
- * cblock: index of a cache block
- * promotion: movement of a block from origin to cache
- * demotion: movement of a block from cache to origin
- * migration: movement of a block between the origin and cache device,
- * either direction
- */
+#define IOT_RESOLUTION 4
-/*----------------------------------------------------------------*/
+struct io_tracker {
+ spinlock_t lock;
-static size_t bitset_size_in_bytes(unsigned nr_entries)
+ /*
+ * Sectors of in-flight IO.
+ */
+ sector_t in_flight;
+
+ /*
+ * The time, in jiffies, when this device became idle (if it is
+ * indeed idle).
+ */
+ unsigned long idle_time;
+ unsigned long last_update_time;
+};
+
+static void iot_init(struct io_tracker *iot)
{
- return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+ spin_lock_init(&iot->lock);
+ iot->in_flight = 0ul;
+ iot->idle_time = 0ul;
+ iot->last_update_time = jiffies;
}
-static unsigned long *alloc_bitset(unsigned nr_entries)
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
- size_t s = bitset_size_in_bytes(nr_entries);
- return vzalloc(s);
+ if (iot->in_flight)
+ return false;
+
+ return time_after(jiffies, iot->idle_time + jifs);
}
-static void clear_bitset(void *bitset, unsigned nr_entries)
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
- size_t s = bitset_size_in_bytes(nr_entries);
- memset(bitset, 0, s);
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ r = __iot_idle_for(iot, jifs);
+ spin_unlock_irqrestore(&iot->lock, flags);
+
+ return r;
}
-static void free_bitset(unsigned long *bits)
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
- vfree(bits);
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ iot->in_flight += len;
+ spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
+{
+ iot->in_flight -= len;
+ if (!iot->in_flight)
+ iot->idle_time = jiffies;
+}
+
+static void iot_io_end(struct io_tracker *iot, sector_t len)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iot->lock, flags);
+ __iot_io_end(iot, len);
+ spin_unlock_irqrestore(&iot->lock, flags);
}
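
A brief usage sketch of the tracker above (function names are hypothetical): account the bio's sectors for the lifetime of the IO, then use the idle test to decide whether background work may run. Later in this patch the core target passes the inverse of this test as the busy/critical_only flag for writeback.

/* Hypothetical callers, assuming the io_tracker helpers defined above. */
static void example_account_io(struct io_tracker *iot, sector_t len)
{
	iot_io_begin(iot, len);		/* when the bio is submitted */
	/* ... the IO completes ... */
	iot_io_end(iot, len);		/* when the bio's endio runs */
}

static bool example_origin_is_idle(struct io_tracker *iot)
{
	return iot_idle_for(iot, HZ);	/* no in-flight IO for at least a second */
}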
/*----------------------------------------------------------------*/
/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ * either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+/*
* There are a couple of places where we let a bio run, but want to do some
* work before calling its endio function. We do this by temporarily
* changing the endio fn.
@@ -101,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
-/*
- * FIXME: the cache is read/write for the time being.
- */
enum cache_metadata_mode {
CM_WRITE, /* metadata may be changed */
CM_READ_ONLY, /* metadata may not be changed */
+ CM_FAIL
};
enum cache_io_mode {
@@ -208,6 +255,7 @@ struct cache {
int sectors_per_block_shift;
spinlock_t lock;
+ struct list_head deferred_cells;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct bio_list deferred_writethrough_bios;
@@ -282,6 +330,8 @@ struct cache {
*/
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
+
+ struct io_tracker origin_tracker;
};
struct per_bio_data {
@@ -289,6 +339,7 @@ struct per_bio_data {
unsigned req_nr:2;
struct dm_deferred_entry *all_io_entry;
struct dm_hook_info hook_info;
+ sector_t len;
/*
* writethrough fields. These MUST remain at the end of this
@@ -332,6 +383,8 @@ struct prealloc {
struct dm_bio_prison_cell *cell2;
};
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
static void wake_worker(struct cache *cache)
{
queue_work(cache->wq, &cache->worker);
@@ -365,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
static void free_migration(struct dm_cache_migration *mg)
{
- if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
- wake_up(&mg->cache->migration_wait);
+ struct cache *cache = mg->cache;
+
+ if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+ wake_up(&cache->migration_wait);
- mempool_free(mg, mg->cache->migration_pool);
+ mempool_free(mg, cache->migration_pool);
+ wake_worker(cache);
}
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@@ -643,6 +699,9 @@ static void save_stats(struct cache *cache)
{
struct dm_cache_statistics stats;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
stats.read_hits = atomic_read(&cache->stats.read_hit);
stats.read_misses = atomic_read(&cache->stats.read_miss);
stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -695,6 +754,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
pb->tick = false;
pb->req_nr = dm_bio_get_target_bio_nr(bio);
pb->all_io_entry = NULL;
+ pb->len = 0;
return pb;
}
@@ -792,12 +852,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+ return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+ !(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ if (accountable_bio(cache, bio)) {
+ pb->len = bio_sectors(bio);
+ iot_io_begin(&cache->origin_tracker, pb->len);
+ }
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+ accounted_begin(cache, bio);
+ generic_make_request(bio);
+}
+
static void issue(struct cache *cache, struct bio *bio)
{
unsigned long flags;
if (!bio_triggers_commit(cache, bio)) {
- generic_make_request(bio);
+ accounted_request(cache, bio);
return;
}
@@ -870,6 +961,94 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
}
/*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+ return cache->features.mode;
+}
+
+static const char *cache_device_name(struct cache *cache)
+{
+ return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+ const char *descs[] = {
+ "write",
+ "read-only",
+ "fail"
+ };
+
+ dm_table_event(cache->ti->table);
+ DMINFO("%s: switching cache to %s mode",
+ cache_device_name(cache), descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+ bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+ enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+ if (new_mode == CM_WRITE && needs_check) {
+ DMERR("%s: unable to switch cache to write mode until repaired.",
+ cache_device_name(cache));
+ if (old_mode != new_mode)
+ new_mode = old_mode;
+ else
+ new_mode = CM_READ_ONLY;
+ }
+
+ /* Never move out of fail mode */
+ if (old_mode == CM_FAIL)
+ new_mode = CM_FAIL;
+
+ switch (new_mode) {
+ case CM_FAIL:
+ case CM_READ_ONLY:
+ dm_cache_metadata_set_read_only(cache->cmd);
+ break;
+
+ case CM_WRITE:
+ dm_cache_metadata_set_read_write(cache->cmd);
+ break;
+ }
+
+ cache->features.mode = new_mode;
+
+ if (new_mode != old_mode)
+ notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+ const char *dev_name = cache_device_name(cache);
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return;
+
+ if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_cache_mode(cache, CM_FAIL);
+ }
+
+ DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+ if (dm_cache_metadata_abort(cache->cmd)) {
+ DMERR("%s: failed to abort metadata transaction", dev_name);
+ set_cache_mode(cache, CM_FAIL);
+ }
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+ DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+ cache_device_name(cache), op, r);
+ abort_transaction(cache);
+ set_cache_mode(cache, CM_READ_ONLY);
+}
+
+/*----------------------------------------------------------------
* Migration processing
*
* Migration covers moving data from the origin device to the cache, or
@@ -885,26 +1064,63 @@ static void dec_io_migrations(struct cache *cache)
atomic_dec(&cache->nr_io_migrations);
}
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
- bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+ bool holder, struct bio_list *bios)
{
(holder ? dm_cell_release : dm_cell_release_no_holder)
- (cache->prison, cell, &cache->deferred_bios);
+ (cache->prison, cell, bios);
free_prison_cell(cache, cell);
}
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
- bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+ return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ if (discard_or_flush(cell->holder))
+ /*
+ * We have to handle these bios
+ * individually.
+ */
+ __cell_release(cache, cell, true, &cache->deferred_bios);
+
+ else
+ list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
{
unsigned long flags;
+ if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+ /*
+ * There was no prisoner to promote to holder, the
+ * cell has been released.
+ */
+ free_prison_cell(cache, cell);
+ return;
+ }
+
spin_lock_irqsave(&cache->lock, flags);
- __cell_defer(cache, cell, holder);
+ __cell_defer(cache, cell);
spin_unlock_irqrestore(&cache->lock, flags);
wake_worker(cache);
}
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+ dm_cell_error(cache->prison, cell, err);
+ dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
static void free_io_migration(struct dm_cache_migration *mg)
{
dec_io_migrations(mg->cache);
@@ -914,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
static void migration_failure(struct dm_cache_migration *mg)
{
struct cache *cache = mg->cache;
+ const char *dev_name = cache_device_name(cache);
if (mg->writeback) {
- DMWARN_LIMIT("writeback failed; couldn't copy block");
+ DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
set_dirty(cache, mg->old_oblock, mg->cblock);
cell_defer(cache, mg->old_ocell, false);
} else if (mg->demote) {
- DMWARN_LIMIT("demotion failed; couldn't copy block");
+ DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote)
cell_defer(cache, mg->new_ocell, true);
} else {
- DMWARN_LIMIT("promotion failed; couldn't copy block");
+ DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
policy_remove_mapping(cache->policy, mg->new_oblock);
cell_defer(cache, mg->new_ocell, true);
}
@@ -938,6 +1155,7 @@ static void migration_failure(struct dm_cache_migration *mg)
static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
+ int r;
unsigned long flags;
struct cache *cache = mg->cache;
@@ -948,8 +1166,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
return;
} else if (mg->demote) {
- if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
- DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+ r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+ if (r) {
+ DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
policy_force_mapping(cache->policy, mg->new_oblock,
mg->old_oblock);
if (mg->promote)
@@ -958,8 +1179,11 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
return;
}
} else {
- if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
- DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+ r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+ if (r) {
+ DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+ cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
policy_remove_mapping(cache->policy, mg->new_oblock);
free_io_migration(mg);
return;
@@ -978,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
struct cache *cache = mg->cache;
if (mg->writeback) {
- DMWARN("writeback unexpectedly triggered commit");
+ DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+ cache_device_name(cache));
return;
} else if (mg->demote) {
@@ -1054,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
}
if (r < 0) {
- DMERR_LIMIT("issuing migration failed");
+ DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
migration_failure(mg);
}
}
@@ -1093,7 +1318,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
* No need to inc_ds() here, since the cell will be held for the
* duration of the io.
*/
- generic_make_request(bio);
+ accounted_request(mg->cache, bio);
}
static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1439,32 +1664,154 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
&cache->stats.read_miss : &cache->stats.write_miss);
}
-static void process_bio(struct cache *cache, struct prealloc *structs,
- struct bio *bio)
+/*----------------------------------------------------------------*/
+
+struct inc_detail {
+ struct cache *cache;
+ struct bio_list bios_for_issue;
+ struct bio_list unhandled_bios;
+ bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+ struct bio *bio;
+ struct inc_detail *detail = context;
+ struct cache *cache = detail->cache;
+
+ inc_ds(cache, cell->holder, cell);
+ if (bio_data_dir(cell->holder) == WRITE)
+ detail->any_writes = true;
+
+ while ((bio = bio_list_pop(&cell->bios))) {
+ if (discard_or_flush(bio)) {
+ bio_list_add(&detail->unhandled_bios, bio);
+ continue;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ detail->any_writes = true;
+
+ bio_list_add(&detail->bios_for_issue, bio);
+ inc_ds(cache, bio, cell);
+ }
+}
+
+// FIXME: refactor these two
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+ struct dm_bio_prison_cell *cell,
+ dm_oblock_t oblock, bool issue_holder)
+{
+ struct bio *bio;
+ unsigned long flags;
+ struct inc_detail detail;
+
+ detail.cache = cache;
+ bio_list_init(&detail.bios_for_issue);
+ bio_list_init(&detail.unhandled_bios);
+ detail.any_writes = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+ bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ remap_to_origin(cache, cell->holder);
+ if (issue_holder)
+ issue(cache, cell->holder);
+ else
+ accounted_begin(cache, cell->holder);
+
+ if (detail.any_writes)
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+ while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+ remap_to_origin(cache, bio);
+ issue(cache, bio);
+ }
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+ dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+ struct bio *bio;
+ unsigned long flags;
+ struct inc_detail detail;
+
+ detail.cache = cache;
+ bio_list_init(&detail.bios_for_issue);
+ bio_list_init(&detail.unhandled_bios);
+ detail.any_writes = false;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+ bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ remap_to_cache(cache, cell->holder, cblock);
+ if (issue_holder)
+ issue(cache, cell->holder);
+ else
+ accounted_begin(cache, cell->holder);
+
+ if (detail.any_writes) {
+ set_dirty(cache, oblock, cblock);
+ clear_discard(cache, oblock_to_dblock(cache, oblock));
+ }
+
+ while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+ remap_to_cache(cache, bio, cblock);
+ issue(cache, bio);
+ }
+}
+
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+ struct policy_locker locker;
+ struct cache *cache;
+ struct prealloc *structs;
+ struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+ /* This should never be called */
+ BUG();
+ return 0;
+}
+
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+ struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+ struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+ return bio_detain(l->cache, b, NULL, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ l->structs, &l->cell);
+}
+
+static void process_cell(struct cache *cache, struct prealloc *structs,
+ struct dm_bio_prison_cell *new_ocell)
{
int r;
bool release_cell = true;
+ struct bio *bio = new_ocell->holder;
dm_oblock_t block = get_bio_block(cache, bio);
- struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
bool passthrough = passthrough_mode(&cache->features);
- bool discarded_block, can_migrate;
-
- /*
- * Check to see if that block is currently migrating.
- */
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, block, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &new_ocell);
- if (r > 0)
- return;
+ bool fast_promotion, can_migrate;
+ struct old_oblock_lock ool;
- discarded_block = is_discarded_oblock(cache, block);
- can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+ fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+ can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
- r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
- bio, &lookup_result);
+ ool.locker.fn = cell_locker;
+ ool.cache = cache;
+ ool.structs = structs;
+ ool.cell = NULL;
+ r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
+ bio, &ool.locker, &lookup_result);
if (r == -EWOULDBLOCK)
/* migration has been denied */
@@ -1500,9 +1847,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
inc_and_issue(cache, bio, new_ocell);
- } else {
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
- inc_and_issue(cache, bio, new_ocell);
+ } else {
+ remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+ release_cell = false;
}
}
@@ -1510,8 +1857,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
case POLICY_MISS:
inc_miss_counter(cache, bio);
- remap_to_origin_clear_discard(cache, bio, block);
- inc_and_issue(cache, bio, new_ocell);
+ remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+ release_cell = false;
break;
case POLICY_NEW:
@@ -1521,32 +1868,17 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
break;
case POLICY_REPLACE:
- cell_prealloc = prealloc_get_cell(structs);
- r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
- (cell_free_fn) prealloc_put_cell,
- structs, &old_ocell);
- if (r > 0) {
- /*
- * We have to be careful to avoid lock inversion of
- * the cells. So we back off, and wait for the
- * old_ocell to become free.
- */
- policy_force_mapping(cache->policy, block,
- lookup_result.old_oblock);
- atomic_inc(&cache->stats.cache_cell_clash);
- break;
- }
atomic_inc(&cache->stats.demotion);
atomic_inc(&cache->stats.promotion);
-
demote_then_promote(cache, structs, lookup_result.old_oblock,
block, lookup_result.cblock,
- old_ocell, new_ocell);
+ ool.cell, new_ocell);
release_cell = false;
break;
default:
- DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+ DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+ cache_device_name(cache), __func__,
(unsigned) lookup_result.op);
bio_io_error(bio);
}
@@ -1555,10 +1887,48 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
cell_defer(cache, new_ocell, false);
}
+static void process_bio(struct cache *cache, struct prealloc *structs,
+ struct bio *bio)
+{
+ int r;
+ dm_oblock_t block = get_bio_block(cache, bio);
+ struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+ /*
+ * Check to see if that block is currently migrating.
+ */
+ cell_prealloc = prealloc_get_cell(structs);
+ r = bio_detain(cache, block, bio, cell_prealloc,
+ (cell_free_fn) prealloc_put_cell,
+ structs, &new_ocell);
+ if (r > 0)
+ return;
+
+ process_cell(cache, structs, new_ocell);
+}
+
static int need_commit_due_to_time(struct cache *cache)
{
- return !time_in_range(jiffies, cache->last_commit_jiffies,
- cache->last_commit_jiffies + COMMIT_PERIOD);
+ return jiffies < cache->last_commit_jiffies ||
+ jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+ int r;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
+ atomic_inc(&cache->stats.commit_count);
+ r = dm_cache_commit(cache->cmd, clean_shutdown);
+ if (r)
+ metadata_operation_failed(cache, "dm_cache_commit", r);
+
+ return r;
}
static int commit_if_needed(struct cache *cache)
@@ -1567,9 +1937,8 @@ static int commit_if_needed(struct cache *cache)
if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
dm_cache_changed_this_transaction(cache->cmd)) {
- atomic_inc(&cache->stats.commit_count);
+ r = commit(cache, false);
cache->commit_requested = false;
- r = dm_cache_commit(cache->cmd, false);
cache->last_commit_jiffies = jiffies;
}
@@ -1617,6 +1986,40 @@ static void process_deferred_bios(struct cache *cache)
prealloc_free_structs(cache, &structs);
}
+static void process_deferred_cells(struct cache *cache)
+{
+ unsigned long flags;
+ struct dm_bio_prison_cell *cell, *tmp;
+ struct list_head cells;
+ struct prealloc structs;
+
+ memset(&structs, 0, sizeof(structs));
+
+ INIT_LIST_HEAD(&cells);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(&cache->deferred_cells, &cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+ /*
+ * If we've got no free migration structs, and processing
+ * this bio might require one, we pause until there are some
+ * prepared mappings to process.
+ */
+ if (prealloc_data_structs(cache, &structs)) {
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice(&cells, &cache->deferred_cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+ break;
+ }
+
+ process_cell(cache, &structs, cell);
+ }
+
+ prealloc_free_structs(cache, &structs);
+}
+
static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
{
unsigned long flags;
@@ -1634,7 +2037,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
- submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+ submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
}
static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1654,7 +2057,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
* These bios have already been through inc_ds()
*/
while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ accounted_request(cache, bio);
}
static void writeback_some_dirty_blocks(struct cache *cache)
@@ -1664,6 +2067,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
dm_cblock_t cblock;
struct prealloc structs;
struct dm_bio_prison_cell *old_ocell;
+ bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
memset(&structs, 0, sizeof(structs));
@@ -1671,7 +2075,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
if (prealloc_data_structs(cache, &structs))
break;
- r = policy_writeback_work(cache->policy, &oblock, &cblock);
+ r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
if (r)
break;
@@ -1702,15 +2106,17 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
r = policy_remove_cblock(cache->policy, to_cblock(begin));
if (!r) {
r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
break;
+ }
} else if (r == -ENODATA) {
/* harmless, already unmapped */
r = 0;
} else {
- DMERR("policy_remove_cblock failed");
+ DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
break;
}
@@ -1783,7 +2189,22 @@ static void stop_worker(struct cache *cache)
flush_workqueue(cache->wq);
}
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+ unsigned long flags;
+ struct list_head cells;
+ struct dm_bio_prison_cell *cell, *tmp;
+
+ INIT_LIST_HEAD(&cells);
+ spin_lock_irqsave(&cache->lock, flags);
+ list_splice_init(&cache->deferred_cells, &cells);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ list_for_each_entry_safe(cell, tmp, &cells, user_list)
+ cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
{
struct bio *bio;
struct bio_list bios;
@@ -1804,6 +2225,7 @@ static int more_work(struct cache *cache)
!list_empty(&cache->need_commit_migrations);
else
return !bio_list_empty(&cache->deferred_bios) ||
+ !list_empty(&cache->deferred_cells) ||
!bio_list_empty(&cache->deferred_flush_bios) ||
!bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
@@ -1821,6 +2243,7 @@ static void do_worker(struct work_struct *ws)
writeback_some_dirty_blocks(cache);
process_deferred_writethrough_bios(cache);
process_deferred_bios(cache);
+ process_deferred_cells(cache);
process_invalidation_requests(cache);
}
@@ -1830,11 +2253,6 @@ static void do_worker(struct work_struct *ws)
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
- /*
- * FIXME: rollback metadata or just go into a
- * failure mode and error everything
- */
} else {
process_deferred_flush_bios(cache, true);
process_migrations(cache, &cache->need_commit_migrations,
@@ -1853,7 +2271,7 @@ static void do_worker(struct work_struct *ws)
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
- policy_tick(cache->policy);
+ policy_tick(cache->policy, true);
wake_worker(cache);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
@@ -2407,6 +2825,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
goto bad;
}
cache->cmd = cmd;
+ set_cache_mode(cache, CM_WRITE);
+ if (get_cache_mode(cache) != CM_WRITE) {
+ *error = "Unable to get write access to metadata, please check/repair metadata.";
+ r = -EINVAL;
+ goto bad;
+ }
if (passthrough_mode(&cache->features)) {
bool all_clean;
@@ -2425,6 +2849,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->deferred_cells);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
bio_list_init(&cache->deferred_writethrough_bios);
@@ -2514,6 +2939,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
spin_lock_init(&cache->invalidation_lock);
INIT_LIST_HEAD(&cache->invalidation_requests);
+ iot_init(&cache->origin_tracker);
+
*result = cache;
return 0;
@@ -2580,15 +3007,23 @@ out:
return r;
}
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
{
+ struct cache *cache = ti->private;
+
int r;
+ struct dm_bio_prison_cell *cell = NULL;
dm_oblock_t block = get_bio_block(cache, bio);
size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
- bool discarded_block;
+ bool fast_promotion;
struct policy_result lookup_result;
struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+ struct old_oblock_lock ool;
+
+ ool.locker.fn = null_locker;
if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
/*
@@ -2597,10 +3032,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* Just remap to the origin and carry on.
*/
remap_to_origin(cache, bio);
+ accounted_begin(cache, bio);
return DM_MAPIO_REMAPPED;
}
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+ if (discard_or_flush(bio)) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2608,15 +3044,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
/*
* Check to see if that block is currently migrating.
*/
- *cell = alloc_prison_cell(cache);
- if (!*cell) {
+ cell = alloc_prison_cell(cache);
+ if (!cell) {
defer_bio(cache, bio);
return DM_MAPIO_SUBMITTED;
}
- r = bio_detain(cache, block, bio, *cell,
+ r = bio_detain(cache, block, bio, cell,
(cell_free_fn) free_prison_cell,
- cache, cell);
+ cache, &cell);
if (r) {
if (r < 0)
defer_bio(cache, bio);
@@ -2624,17 +3060,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
return DM_MAPIO_SUBMITTED;
}
- discarded_block = is_discarded_oblock(cache, block);
+ fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
- r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
- bio, &lookup_result);
+ r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
+ bio, &ool.locker, &lookup_result);
if (r == -EWOULDBLOCK) {
- cell_defer(cache, *cell, true);
+ cell_defer(cache, cell, true);
return DM_MAPIO_SUBMITTED;
} else if (r) {
- DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
- cell_defer(cache, *cell, false);
+ DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+ cache_device_name(cache), r);
+ cell_defer(cache, cell, false);
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2648,21 +3085,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* We need to invalidate this block, so
* defer for the worker thread.
*/
- cell_defer(cache, *cell, true);
+ cell_defer(cache, cell, true);
r = DM_MAPIO_SUBMITTED;
} else {
inc_miss_counter(cache, bio);
remap_to_origin_clear_discard(cache, bio, block);
+ accounted_begin(cache, bio);
+ inc_ds(cache, bio, cell);
+ // FIXME: we want to remap hits or misses straight
+ // away rather than passing over to the worker.
+ cell_defer(cache, cell, false);
}
} else {
inc_hit_counter(cache, bio);
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
- !is_dirty(cache, lookup_result.cblock))
+ !is_dirty(cache, lookup_result.cblock)) {
remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ accounted_begin(cache, bio);
+ inc_ds(cache, bio, cell);
+ cell_defer(cache, cell, false);
+
+ } else
+ remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
}
break;
@@ -2674,18 +3120,19 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
* longer needed because the block has been demoted.
*/
bio_endio(bio, 0);
- cell_defer(cache, *cell, false);
+ // FIXME: remap everything as a miss
+ cell_defer(cache, cell, false);
r = DM_MAPIO_SUBMITTED;
} else
- remap_to_origin_clear_discard(cache, bio, block);
-
+ remap_cell_to_origin_clear_discard(cache, cell, block, false);
break;
default:
- DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+ DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+ cache_device_name(cache), __func__,
(unsigned) lookup_result.op);
- cell_defer(cache, *cell, false);
+ cell_defer(cache, cell, false);
bio_io_error(bio);
r = DM_MAPIO_SUBMITTED;
}
@@ -2693,21 +3140,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
return r;
}
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
- int r;
- struct dm_bio_prison_cell *cell = NULL;
- struct cache *cache = ti->private;
-
- r = __cache_map(cache, bio, &cell);
- if (r == DM_MAPIO_REMAPPED && cell) {
- inc_ds(cache, bio, cell);
- cell_defer(cache, cell, false);
- }
-
- return r;
-}
-
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
@@ -2716,7 +3148,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
if (pb->tick) {
- policy_tick(cache->policy);
+ policy_tick(cache->policy, false);
spin_lock_irqsave(&cache->lock, flags);
cache->need_tick_bio = true;
@@ -2724,6 +3156,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
}
check_for_quiesced_migrations(cache, pb);
+ accounted_complete(cache, bio);
return 0;
}
@@ -2732,11 +3165,16 @@ static int write_dirty_bitset(struct cache *cache)
{
unsigned i, r;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
for (i = 0; i < from_cblock(cache->cache_size); i++) {
r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
is_dirty(cache, to_cblock(i)));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_set_dirty", r);
return r;
+ }
}
return 0;
@@ -2746,18 +3184,40 @@ static int write_discard_bitset(struct cache *cache)
{
unsigned i, r;
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
cache->discard_nr_blocks);
if (r) {
- DMERR("could not resize on-disk discard bitset");
+ DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
return r;
}
for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
r = dm_cache_set_discard(cache->cmd, to_dblock(i),
is_discarded(cache, to_dblock(i)));
- if (r)
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_set_discard", r);
return r;
+ }
+ }
+
+ return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+ int r;
+
+ if (get_cache_mode(cache) >= CM_READ_ONLY)
+ return -EINVAL;
+
+ r = dm_cache_write_hints(cache->cmd, cache->policy);
+ if (r) {
+ metadata_operation_failed(cache, "dm_cache_write_hints", r);
+ return r;
}
return 0;
@@ -2772,26 +3232,26 @@ static bool sync_metadata(struct cache *cache)
r1 = write_dirty_bitset(cache);
if (r1)
- DMERR("could not write dirty bitset");
+ DMERR("%s: could not write dirty bitset", cache_device_name(cache));
r2 = write_discard_bitset(cache);
if (r2)
- DMERR("could not write discard bitset");
+ DMERR("%s: could not write discard bitset", cache_device_name(cache));
save_stats(cache);
- r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+ r3 = write_hints(cache);
if (r3)
- DMERR("could not write hints");
+ DMERR("%s: could not write hints", cache_device_name(cache));
/*
* If writing the above metadata failed, we still commit, but don't
* set the clean shutdown flag. This will effectively force every
* dirty bit to be set on reload.
*/
- r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+ r4 = commit(cache, !r1 && !r2 && !r3);
if (r4)
- DMERR("could not write cache metadata. Data loss may occur.");
+ DMERR("%s: could not write cache metadata", cache_device_name(cache));
return !r1 && !r2 && !r3 && !r4;
}
@@ -2803,10 +3263,12 @@ static void cache_postsuspend(struct dm_target *ti)
start_quiescing(cache);
wait_for_migrations(cache);
stop_worker(cache);
- requeue_deferred_io(cache);
+ requeue_deferred_bios(cache);
+ requeue_deferred_cells(cache);
stop_quiescing(cache);
- (void) sync_metadata(cache);
+ if (get_cache_mode(cache) == CM_WRITE)
+ (void) sync_metadata(cache);
}
static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -2929,7 +3391,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
new_size = to_cblock(from_cblock(new_size) + 1);
if (is_dirty(cache, new_size)) {
- DMERR("unable to shrink cache; cache block %llu is dirty",
+ DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+ cache_device_name(cache),
(unsigned long long) from_cblock(new_size));
return false;
}
@@ -2944,7 +3407,8 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
r = dm_cache_resize(cache->cmd, new_size);
if (r) {
- DMERR("could not resize cache metadata");
+ DMERR("%s: could not resize cache metadata", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_resize", r);
return r;
}
@@ -2982,7 +3446,8 @@ static int cache_preresume(struct dm_target *ti)
r = dm_cache_load_mappings(cache->cmd, cache->policy,
load_mapping, cache);
if (r) {
- DMERR("could not load cache mappings");
+ DMERR("%s: could not load cache mappings", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_load_mappings", r);
return r;
}
@@ -3002,7 +3467,8 @@ static int cache_preresume(struct dm_target *ti)
discard_load_info_init(cache, &li);
r = dm_cache_load_discards(cache->cmd, load_discard, &li);
if (r) {
- DMERR("could not load origin discards");
+ DMERR("%s: could not load origin discards", cache_device_name(cache));
+ metadata_operation_failed(cache, "dm_cache_load_discards", r);
return r;
}
set_discard_range(&li);
@@ -3030,7 +3496,7 @@ static void cache_resume(struct dm_target *ti)
* <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode>
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3046,23 +3512,26 @@ static void cache_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
- /* Commit to ensure statistics aren't out-of-date */
- if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
- r = dm_cache_commit(cache->cmd, false);
- if (r)
- DMERR("could not commit metadata for accurate status");
+ if (get_cache_mode(cache) == CM_FAIL) {
+ DMEMIT("Fail");
+ break;
}
- r = dm_cache_get_free_metadata_block_count(cache->cmd,
- &nr_free_blocks_metadata);
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+ (void) commit(cache, false);
+
+ r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
if (r) {
- DMERR("could not get metadata free block count");
+ DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+ cache_device_name(cache), r);
goto err;
}
r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
if (r) {
- DMERR("could not get metadata device size");
+ DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+ cache_device_name(cache), r);
goto err;
}
@@ -3093,7 +3562,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
DMEMIT("1 writeback ");
else {
- DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+ DMERR("%s: internal error: unknown io mode: %d",
+ cache_device_name(cache), (int) cache->features.io_mode);
goto err;
}
@@ -3101,11 +3571,17 @@ static void cache_status(struct dm_target *ti, status_type_t type,
DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
if (sz < maxlen) {
- r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+ r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
if (r)
- DMERR("policy_emit_config_values returned %d", r);
+ DMERR("%s: policy_emit_config_values returned %d",
+ cache_device_name(cache), r);
}
+ if (get_cache_mode(cache) == CM_READ_ONLY)
+ DMEMIT("ro ");
+ else
+ DMEMIT("rw ");
+
break;
case STATUSTYPE_TABLE:
@@ -3167,7 +3643,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
return 0;
}
- DMERR("invalid cblock range '%s'", str);
+ DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
return -EINVAL;
}
@@ -3178,17 +3654,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
uint64_t n = from_cblock(cache->cache_size);
if (b >= n) {
- DMERR("begin cblock out of range: %llu >= %llu", b, n);
+ DMERR("%s: begin cblock out of range: %llu >= %llu",
+ cache_device_name(cache), b, n);
return -EINVAL;
}
if (e > n) {
- DMERR("end cblock out of range: %llu > %llu", e, n);
+ DMERR("%s: end cblock out of range: %llu > %llu",
+ cache_device_name(cache), e, n);
return -EINVAL;
}
if (b >= e) {
- DMERR("invalid cblock range: %llu >= %llu", b, e);
+ DMERR("%s: invalid cblock range: %llu >= %llu",
+ cache_device_name(cache), b, e);
return -EINVAL;
}
@@ -3222,7 +3701,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
struct cblock_range range;
if (!passthrough_mode(&cache->features)) {
- DMERR("cache has to be in passthrough mode for invalidation");
+ DMERR("%s: cache has to be in passthrough mode for invalidation",
+ cache_device_name(cache));
return -EPERM;
}
@@ -3261,6 +3741,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
if (!argc)
return -EINVAL;
+ if (get_cache_mode(cache) >= CM_READ_ONLY) {
+ DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+ cache_device_name(cache));
+ return -EOPNOTSUPP;
+ }
+
if (!strcasecmp(argv[0], "invalidate_cblocks"))
return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
@@ -3334,7 +3820,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 6, 0},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5503e43e5f28..0f48fed44a17 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+ /*
+ * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+ * requests if driver request queue is full.
+ */
ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
- /* async */
+ /*
+ * The request was queued by a crypto driver
+ * but the driver request queue is full, let's wait.
+ */
case -EBUSY:
wait_for_completion(&ctx->restart);
reinit_completion(&ctx->restart);
- /* fall through*/
+ /* fall through */
+ /*
+ * The request is queued and processed asynchronously,
+ * completion function kcryptd_async_done() will be called.
+ */
case -EINPROGRESS:
ctx->req = NULL;
ctx->cc_sector++;
continue;
-
- /* sync */
+ /*
+ * The request was already processed (synchronously).
+ */
case 0:
atomic_dec(&ctx->cc_pending);
ctx->cc_sector++;
cond_resched();
continue;
- /* error */
+ /* There was an error while processing the request. */
default:
atomic_dec(&ctx->cc_pending);
return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->cc;
+ /*
+ * A request from crypto driver backlog is going to be processed now,
+ * finish the completion and continue in crypt_convert().
+ * (Callback will be called for the second time for this request.)
+ */
if (error == -EINPROGRESS) {
complete(&ctx->restart);
return;
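The reworked comments above spell out the crypto completion contract dm-crypt now depends on: -EBUSY means the request was backlogged (wait on the restart completion), -EINPROGRESS means it is in flight and kcryptd_async_done() will be called. A rough caller-side sketch of that pattern, using the ablkcipher API dm-crypt uses here (the helper name is hypothetical):

static int submit_waiting_on_backlog(struct ablkcipher_request *req,
				     struct completion *restart)
{
	int r = crypto_ablkcipher_encrypt(req);

	if (r == -EBUSY) {
		/* Driver queue full: request was backlogged, wait for it. */
		wait_for_completion(restart);
		reinit_completion(restart);
		r = -EINPROGRESS;	/* now being processed asynchronously */
	}
	return r;	/* -EINPROGRESS, 0 (done synchronously) or an error */
}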
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 93e08446a87d..ad1b049ae2ab 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -55,8 +55,8 @@
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
/*
* The disk format for this is braindead simple.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88e4c7f24986..2daa67793511 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@@ -17,6 +17,7 @@
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "raid"
+#define MAX_RAID_DEVICES 253 /* raid4/5/6 limit */
static bool devices_handle_discard_safely = false;
@@ -45,25 +46,25 @@ struct raid_dev {
};
/*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
*/
-#define DMPF_SYNC 0x1
-#define DMPF_NOSYNC 0x2
-#define DMPF_REBUILD 0x4
-#define DMPF_DAEMON_SLEEP 0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND 0x40
-#define DMPF_STRIPE_CACHE 0x80
-#define DMPF_REGION_SIZE 0x100
-#define DMPF_RAID10_COPIES 0x200
-#define DMPF_RAID10_FORMAT 0x400
+#define CTR_FLAG_SYNC 0x1
+#define CTR_FLAG_NOSYNC 0x2
+#define CTR_FLAG_REBUILD 0x4
+#define CTR_FLAG_DAEMON_SLEEP 0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND 0x40
+#define CTR_FLAG_STRIPE_CACHE 0x80
+#define CTR_FLAG_REGION_SIZE 0x100
+#define CTR_FLAG_RAID10_COPIES 0x200
+#define CTR_FLAG_RAID10_FORMAT 0x400
struct raid_set {
struct dm_target *ti;
uint32_t bitmap_loaded;
- uint32_t print_flags;
+ uint32_t ctr_flags;
struct mddev md;
struct raid_type *raid_type;
@@ -81,6 +82,7 @@ static struct raid_type {
const unsigned level; /* RAID level. */
const unsigned algorithm; /* RAID algorithm. */
} raid_types[] = {
+ {"raid0", "RAID0 (striping)", 0, 2, 0, 0 /* NONE */},
{"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
{"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
{"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
@@ -119,15 +121,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
{
unsigned n = 1, f = 1;
- if (!strcmp("near", format))
+ if (!strcasecmp("near", format))
n = copies;
else
f = copies;
- if (!strcmp("offset", format))
+ if (!strcasecmp("offset", format))
return 0x30000 | (f << 8) | n;
- if (!strcmp("far", format))
+ if (!strcasecmp("far", format))
return 0x20000 | (f << 8) | n;
return (f << 8) | n;
@@ -477,8 +479,6 @@ too_many:
* will form the "stripe"
* [[no]sync] Force or prevent recovery of the
* entire array
- * [devices_handle_discard_safely] Allow discards on RAID4/5/6; useful if RAID
- * member device(s) properly support TRIM/UNMAP
* [rebuild <idx>] Rebuild the drive indicated by the index
* [daemon_sleep <ms>] Time between bitmap daemon work to
* clear bits
@@ -555,12 +555,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
for (i = 0; i < num_raid_params; i++) {
if (!strcasecmp(argv[i], "nosync")) {
rs->md.recovery_cp = MaxSector;
- rs->print_flags |= DMPF_NOSYNC;
+ rs->ctr_flags |= CTR_FLAG_NOSYNC;
continue;
}
if (!strcasecmp(argv[i], "sync")) {
rs->md.recovery_cp = 0;
- rs->print_flags |= DMPF_SYNC;
+ rs->ctr_flags |= CTR_FLAG_SYNC;
continue;
}
@@ -585,7 +585,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
raid10_format = argv[i];
- rs->print_flags |= DMPF_RAID10_FORMAT;
+ rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
continue;
}
@@ -602,7 +602,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
clear_bit(In_sync, &rs->dev[value].rdev.flags);
rs->dev[value].rdev.recovery_offset = 0;
- rs->print_flags |= DMPF_REBUILD;
+ rs->ctr_flags |= CTR_FLAG_REBUILD;
} else if (!strcasecmp(key, "write_mostly")) {
if (rs->raid_type->level != 1) {
rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -618,7 +618,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "max_write_behind option is only valid for RAID1";
return -EINVAL;
}
- rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+ rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
/*
* In device-mapper, we specify things in sectors, but
@@ -631,14 +631,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
}
rs->md.bitmap_info.max_write_behind = value;
} else if (!strcasecmp(key, "daemon_sleep")) {
- rs->print_flags |= DMPF_DAEMON_SLEEP;
+ rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
rs->ti->error = "daemon sleep period out of range";
return -EINVAL;
}
rs->md.bitmap_info.daemon_sleep = value;
} else if (!strcasecmp(key, "stripe_cache")) {
- rs->print_flags |= DMPF_STRIPE_CACHE;
+ rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
/*
* In device-mapper, we specify things in sectors, but
@@ -656,21 +656,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
} else if (!strcasecmp(key, "min_recovery_rate")) {
- rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+ rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "min_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_min = (int)value;
} else if (!strcasecmp(key, "max_recovery_rate")) {
- rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+ rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
if (value > INT_MAX) {
rs->ti->error = "max_recovery_rate out of range";
return -EINVAL;
}
rs->md.sync_speed_max = (int)value;
} else if (!strcasecmp(key, "region_size")) {
- rs->print_flags |= DMPF_REGION_SIZE;
+ rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
region_size = value;
} else if (!strcasecmp(key, "raid10_copies") &&
(rs->raid_type->level == 10)) {
@@ -678,7 +678,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "Bad value for 'raid10_copies'";
return -EINVAL;
}
- rs->print_flags |= DMPF_RAID10_COPIES;
+ rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
raid10_copies = value;
} else {
DMERR("Unable to parse RAID parameter: %s", key);
@@ -720,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->md.layout = raid10_format_to_md_layout(raid10_format,
raid10_copies);
rs->md.new_layout = rs->md.layout;
- } else if ((rs->raid_type->level > 1) &&
+ } else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
sector_div(sectors_per_dev,
(rs->md.raid_disks - rs->raid_type->parity_devs))) {
rs->ti->error = "Target length not divisible by number of data devices";
@@ -947,7 +947,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
return -EINVAL;
}
- if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+ if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
/*
@@ -1026,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
return 0;
}
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
{
+ struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb = page_address(rdev->sb_page);
/*
@@ -1037,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!mddev->events && super_init_validation(mddev, rdev))
return -EINVAL;
- mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
- rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+ /* Enable bitmap creation for RAID levels != 0 */
+ mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+ rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
if (!test_bit(FirstUse, &rdev->flags)) {
rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
if (rdev->recovery_offset != MaxSector)
@@ -1073,7 +1076,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
freshest = NULL;
rdev_for_each_safe(rdev, tmp, mddev) {
/*
- * Skipping super_load due to DMPF_SYNC will cause
+ * Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
* though it were new. This is the intended effect
* of the "sync" directive.
@@ -1082,7 +1085,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
* that the "sync" directive is disallowed during the
* reshape.
*/
- if (rs->print_flags & DMPF_SYNC)
+ rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
+ if (rs->ctr_flags & CTR_FLAG_SYNC)
continue;
if (!rdev->meta_bdev)
@@ -1140,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
* validation for the remaining devices.
*/
ti->error = "Unable to assemble array: Invalid superblocks";
- if (super_validate(mddev, freshest))
+ if (super_validate(rs, freshest))
return -EINVAL;
rdev_for_each(rdev, mddev)
- if ((rdev != freshest) && super_validate(mddev, rdev))
+ if ((rdev != freshest) && super_validate(rs, rdev))
return -EINVAL;
return 0;
@@ -1243,7 +1248,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
- (num_raid_devs >= INT_MAX)) {
+ (num_raid_devs > MAX_RAID_DEVICES)) {
ti->error = "Cannot understand number of raid devices";
return -EINVAL;
}
@@ -1282,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
*/
configure_discard_support(ti, rs);
- mutex_lock(&rs->md.reconfig_mutex);
+ /* Has to be held on running the array */
+ mddev_lock_nointr(&rs->md);
ret = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
- mutex_unlock(&rs->md.reconfig_mutex);
+ mddev_unlock(&rs->md);
if (ret) {
ti->error = "Fail to run raid array";
@@ -1368,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_INFO:
DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
- if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
- sync = rs->md.curr_resync_completed;
- else
- sync = rs->md.recovery_cp;
-
- if (sync >= rs->md.resync_max_sectors) {
- /*
- * Sync complete.
- */
+ if (rs->raid_type->level) {
+ if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+ sync = rs->md.curr_resync_completed;
+ else
+ sync = rs->md.recovery_cp;
+
+ if (sync >= rs->md.resync_max_sectors) {
+ /*
+ * Sync complete.
+ */
+ array_in_sync = 1;
+ sync = rs->md.resync_max_sectors;
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+ /*
+ * If "check" or "repair" is occurring, the array has
+ * undergone an initial sync and the health characters
+ * should not be 'a' anymore.
+ */
+ array_in_sync = 1;
+ } else {
+ /*
+ * The array may be doing an initial sync, or it may
+ * be rebuilding individual components. If all the
+ * devices are In_sync, then it is the array that is
+ * being initialized.
+ */
+ for (i = 0; i < rs->md.raid_disks; i++)
+ if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+ array_in_sync = 1;
+ }
+ } else {
+ /* RAID0 */
array_in_sync = 1;
sync = rs->md.resync_max_sectors;
- } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
- /*
- * If "check" or "repair" is occurring, the array has
- * undergone and initial sync and the health characters
- * should not be 'a' anymore.
- */
- array_in_sync = 1;
- } else {
- /*
- * The array may be doing an initial sync, or it may
- * be rebuilding individual components. If all the
- * devices are In_sync, then it is the array that is
- * being initialized.
- */
- for (i = 0; i < rs->md.raid_disks; i++)
- if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
- array_in_sync = 1;
}
/*
@@ -1446,7 +1458,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* The string you would use to construct this array */
for (i = 0; i < rs->md.raid_disks; i++) {
- if ((rs->print_flags & DMPF_REBUILD) &&
+ if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
raid_param_cnt += 2; /* for rebuilds */
@@ -1455,33 +1467,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += 2;
}
- raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
- if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+ raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+ if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
raid_param_cnt--;
DMEMIT("%s %u %u", rs->raid_type->name,
raid_param_cnt, rs->md.chunk_sectors);
- if ((rs->print_flags & DMPF_SYNC) &&
+ if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
(rs->md.recovery_cp == MaxSector))
DMEMIT(" sync");
- if (rs->print_flags & DMPF_NOSYNC)
+ if (rs->ctr_flags & CTR_FLAG_NOSYNC)
DMEMIT(" nosync");
for (i = 0; i < rs->md.raid_disks; i++)
- if ((rs->print_flags & DMPF_REBUILD) &&
+ if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
rs->dev[i].data_dev &&
!test_bit(In_sync, &rs->dev[i].rdev.flags))
DMEMIT(" rebuild %u", i);
- if (rs->print_flags & DMPF_DAEMON_SLEEP)
+ if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
DMEMIT(" daemon_sleep %lu",
rs->md.bitmap_info.daemon_sleep);
- if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+ if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
- if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+ if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
for (i = 0; i < rs->md.raid_disks; i++)
@@ -1489,11 +1501,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" write_mostly %u", i);
- if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+ if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
DMEMIT(" max_write_behind %lu",
rs->md.bitmap_info.max_write_behind);
- if (rs->print_flags & DMPF_STRIPE_CACHE) {
+ if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
struct r5conf *conf = rs->md.private;
/* convert from kiB to sectors */
@@ -1501,15 +1513,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
conf ? conf->max_nr_stripes * 2 : 0);
}
- if (rs->print_flags & DMPF_REGION_SIZE)
+ if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
DMEMIT(" region_size %lu",
rs->md.bitmap_info.chunksize >> 9);
- if (rs->print_flags & DMPF_RAID10_COPIES)
+ if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
DMEMIT(" raid10_copies %u",
raid10_md_layout_to_copies(rs->md.layout));
- if (rs->print_flags & DMPF_RAID10_FORMAT)
+ if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
DMEMIT(" raid10_format %s",
raid10_md_layout_to_format(rs->md.layout));
@@ -1684,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
- set_bit(MD_CHANGE_DEVS, &rs->md.flags);
- if (!rs->bitmap_loaded) {
- bitmap_load(&rs->md);
- rs->bitmap_loaded = 1;
- } else {
- /*
- * A secondary resume while the device is active.
- * Take this opportunity to check whether any failed
- * devices are reachable again.
- */
- attempt_restore_of_faulty_devices(rs);
+ if (rs->raid_type->level) {
+ set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+ if (!rs->bitmap_loaded) {
+ bitmap_load(&rs->md);
+ rs->bitmap_loaded = 1;
+ } else {
+ /*
+ * A secondary resume while the device is active.
+ * Take this opportunity to check whether any failed
+ * devices are reachable again.
+ */
+ attempt_restore_of_faulty_devices(rs);
+ }
+
+ clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
}
- clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
mddev_resume(&rs->md);
}
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct raid_set *rs = ti->private;
+ struct md_personality *pers = rs->md.pers;
+
+ if (pers && pers->mergeable_bvec)
+ return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+ /*
+ * In case we can't request the personality because
+ * the raid set is not running yet
+ *
+ * -> return safe minimum
+ */
+ return rs->md.chunk_sectors;
+}
+
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 6, 0},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
@@ -1715,6 +1749,7 @@ static struct target_type raid_target = {
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
.resume = raid_resume,
+ .merge = raid_merge,
};
static int __init dm_raid_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 743fa9bbae9e..d83696bf403b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -23,8 +23,10 @@
#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_KEEP_LOG 0x02
#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p) ((p)->features & DM_RAID1_KEEP_LOG)
static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
if (m != get_default_mirror(ms))
goto out;
- if (!ms->in_sync) {
+ if (!ms->in_sync && !keep_log(ms)) {
/*
* Better to issue requests to same failing device
* than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
return r;
}
+static void reset_ms_flags(struct mirror_set *ms)
+{
+ unsigned int m;
+
+ ms->leg_failure = 0;
+ for (m = 0; m < ms->nr_mirrors; m++) {
+ atomic_set(&(ms->mirror[m].error_count), 0);
+ ms->mirror[m].error_type = 0;
+ }
+}
+
static void do_recovery(struct mirror_set *ms)
{
struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
/* the sync is complete */
dm_table_event(ms->ti->table);
ms->in_sync = 1;
+ reset_ms_flags(ms);
}
}
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
dm_rh_delay(ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) {
- if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+ if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
spin_lock_irq(&ms->lock);
bio_list_add(&ms->failures, bio);
spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
/*
* If all the legs are dead, fail the I/O.
- * If we have been told to handle errors, hold the bio
- * and wait for userspace to deal with the problem.
+ * If the log device has failed and keep_log is enabled,
+ * fail the I/O.
+ *
+ * If we have been told to handle errors, and keep_log
+ * isn't enabled, hold the bio and wait for userspace to
+ * deal with the problem.
+ *
* Otherwise pretend that the I/O succeeded. (This would
* be wrong if the failed leg returned after reboot and
* got replicated back to the good legs.)
*/
- if (!get_valid_mirror(ms))
+
+ if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
bio_endio(bio, -EIO);
- else if (errors_handled(ms))
+ else if (errors_handled(ms) && !keep_log(ms))
hold_bio(ms, bio);
else
bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
unsigned num_features;
struct dm_target *ti = ms->ti;
char dummy;
+ int i;
*args_used = 0;
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
return -EINVAL;
}
- if (!strcmp("handle_errors", argv[0]))
- ms->features |= DM_RAID1_HANDLE_ERRORS;
- else {
- ti->error = "Unrecognised feature requested";
+ for (i = 0; i < num_features; i++) {
+ if (!strcmp("handle_errors", argv[0]))
+ ms->features |= DM_RAID1_HANDLE_ERRORS;
+ else if (!strcmp("keep_log", argv[0]))
+ ms->features |= DM_RAID1_KEEP_LOG;
+ else {
+ ti->error = "Unrecognised feature requested";
+ return -EINVAL;
+ }
+
+ argc--;
+ argv++;
+ (*args_used)++;
+ }
+ if (!errors_handled(ms) && keep_log(ms)) {
+ ti->error = "keep_log feature requires the handle_errors feature";
return -EINVAL;
}
- (*args_used)++;
-
return 0;
}
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
* log_type is "core" or "disk"
* #log_params is between 1 and 3
*
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
*/
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
@@ -1363,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
unsigned int m, sz = 0;
+ int num_feature_args = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
char buffer[ms->nr_mirrors + 1];
@@ -1392,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset);
- if (ms->features & DM_RAID1_HANDLE_ERRORS)
- DMEMIT(" 1 handle_errors");
+ num_feature_args += !!errors_handled(ms);
+ num_feature_args += !!keep_log(ms);
+ if (num_feature_args) {
+ DMEMIT(" %d", num_feature_args);
+ if (errors_handled(ms))
+ DMEMIT(" handle_errors");
+ if (keep_log(ms))
+ DMEMIT(" keep_log");
+ }
+
+ break;
}
}
@@ -1413,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
static struct target_type mirror_target = {
.name = "mirror",
- .version = {1, 13, 2},
+ .version = {1, 14, 0},
.module = THIS_MODULE,
.ctr = mirror_ctr,
.dtr = mirror_dtr,
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index f478a4c96d2f..8a8b48fa901a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -29,30 +29,37 @@ struct dm_stat_percpu {
unsigned long long io_ticks[2];
unsigned long long io_ticks_total;
unsigned long long time_in_queue;
+ unsigned long long *histogram;
};
struct dm_stat_shared {
atomic_t in_flight[2];
- unsigned long stamp;
+ unsigned long long stamp;
struct dm_stat_percpu tmp;
};
struct dm_stat {
struct list_head list_entry;
int id;
+ unsigned stat_flags;
size_t n_entries;
sector_t start;
sector_t end;
sector_t step;
+ unsigned n_histogram_entries;
+ unsigned long long *histogram_boundaries;
const char *program_id;
const char *aux_data;
struct rcu_head rcu_head;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
struct dm_stat_shared stat_shared[0];
};
+#define STAT_PRECISE_TIMESTAMPS 1
+
struct dm_stats_last_position {
sector_t last_sector;
unsigned last_rw;
@@ -160,10 +167,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
free_shared_memory(alloc_size);
- if (is_vmalloc_addr(ptr))
- vfree(ptr);
- else
- kfree(ptr);
+ kvfree(ptr);
}
static void dm_stat_free(struct rcu_head *head)
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
kfree(s->program_id);
kfree(s->aux_data);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+ }
+ dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
dm_kvfree(s, s->shared_alloc_size);
}
@@ -227,7 +234,10 @@ void dm_stats_cleanup(struct dm_stats *stats)
}
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
- sector_t step, const char *program_id, const char *aux_data,
+ sector_t step, unsigned stat_flags,
+ unsigned n_histogram_entries,
+ unsigned long long *histogram_boundaries,
+ const char *program_id, const char *aux_data,
void (*suspend_callback)(struct mapped_device *),
void (*resume_callback)(struct mapped_device *),
struct mapped_device *md)
@@ -238,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
size_t ni;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *p;
int cpu;
int ret_id;
@@ -261,19 +272,34 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
return -EOVERFLOW;
- if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+ histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+ if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+ return -EOVERFLOW;
+
+ if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+ num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
return -ENOMEM;
s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
if (!s)
return -ENOMEM;
+ s->stat_flags = stat_flags;
s->n_entries = n_entries;
s->start = start;
s->end = end;
s->step = step;
s->shared_alloc_size = shared_alloc_size;
s->percpu_alloc_size = percpu_alloc_size;
+ s->histogram_alloc_size = histogram_alloc_size;
+
+ s->n_histogram_entries = n_histogram_entries;
+ s->histogram_boundaries = kmemdup(histogram_boundaries,
+ s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!s->histogram_boundaries) {
+ r = -ENOMEM;
+ goto out;
+ }
s->program_id = kstrdup(program_id, GFP_KERNEL);
if (!s->program_id) {
@@ -291,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
}
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ s->stat_shared[ni].tmp.histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
+
for_each_possible_cpu(cpu) {
p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
if (!p) {
@@ -298,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
goto out;
}
s->stat_percpu[cpu] = p;
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ p[ni].histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
}
/*
@@ -375,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
* vfree can't be called from RCU callback
*/
for_each_possible_cpu(cpu)
- if (is_vmalloc_addr(s->stat_percpu))
+ if (is_vmalloc_addr(s->stat_percpu) ||
+ is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
goto do_sync_free;
- if (is_vmalloc_addr(s)) {
+ if (is_vmalloc_addr(s) ||
+ is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
synchronize_rcu_expedited();
dm_stat_free(&s->rcu_head);
@@ -417,18 +470,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
return 1;
}
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+ struct dm_stat_percpu *p)
{
/*
* This is racy, but so is part_round_stats_single.
*/
- unsigned long now = jiffies;
- unsigned in_flight_read;
- unsigned in_flight_write;
- unsigned long difference = now - shared->stamp;
+ unsigned long long now, difference;
+ unsigned in_flight_read, in_flight_write;
+
+ if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+ now = jiffies;
+ else
+ now = ktime_to_ns(ktime_get());
+ difference = now - shared->stamp;
if (!difference)
return;
+
in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
if (in_flight_read)
@@ -443,8 +502,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
}
static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
- unsigned long bi_rw, sector_t len, bool merged,
- bool end, unsigned long duration)
+ unsigned long bi_rw, sector_t len,
+ struct dm_stats_aux *stats_aux, bool end,
+ unsigned long duration_jiffies)
{
unsigned long idx = bi_rw & REQ_WRITE;
struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -474,15 +534,35 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
p = &s->stat_percpu[smp_processor_id()][entry];
if (!end) {
- dm_stat_round(shared, p);
+ dm_stat_round(s, shared, p);
atomic_inc(&shared->in_flight[idx]);
} else {
- dm_stat_round(shared, p);
+ unsigned long long duration;
+ dm_stat_round(s, shared, p);
atomic_dec(&shared->in_flight[idx]);
p->sectors[idx] += len;
p->ios[idx] += 1;
- p->merges[idx] += merged;
- p->ticks[idx] += duration;
+ p->merges[idx] += stats_aux->merged;
+ if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
+ p->ticks[idx] += duration_jiffies;
+ duration = jiffies_to_msecs(duration_jiffies);
+ } else {
+ p->ticks[idx] += stats_aux->duration_ns;
+ duration = stats_aux->duration_ns;
+ }
+ if (s->n_histogram_entries) {
+ unsigned lo = 0, hi = s->n_histogram_entries + 1;
+ while (lo + 1 < hi) {
+ unsigned mid = (lo + hi) / 2;
+ if (s->histogram_boundaries[mid - 1] > duration) {
+ hi = mid;
+ } else {
+ lo = mid;
+ }
+
+ }
+ p->histogram[lo]++;
+ }
}
#if BITS_PER_LONG == 32
@@ -494,7 +574,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
sector_t bi_sector, sector_t end_sector,
- bool end, unsigned long duration,
+ bool end, unsigned long duration_jiffies,
struct dm_stats_aux *stats_aux)
{
sector_t rel_sector, offset, todo, fragment_len;
@@ -523,7 +603,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
if (fragment_len > s->step - offset)
fragment_len = s->step - offset;
dm_stat_for_entry(s, entry, bi_rw, fragment_len,
- stats_aux->merged, end, duration);
+ stats_aux, end, duration_jiffies);
todo -= fragment_len;
entry++;
offset = 0;
@@ -532,11 +612,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
- unsigned long duration, struct dm_stats_aux *stats_aux)
+ unsigned long duration_jiffies,
+ struct dm_stats_aux *stats_aux)
{
struct dm_stat *s;
sector_t end_sector;
struct dm_stats_last_position *last;
+ bool got_precise_time;
if (unlikely(!bi_sectors))
return;
@@ -560,8 +642,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
rcu_read_lock();
- list_for_each_entry_rcu(s, &stats->list, list_entry)
- __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+ got_precise_time = false;
+ list_for_each_entry_rcu(s, &stats->list, list_entry) {
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+ if (!end)
+ stats_aux->duration_ns = ktime_to_ns(ktime_get());
+ else
+ stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+ got_precise_time = true;
+ }
+ __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+ }
rcu_read_unlock();
}
@@ -574,10 +665,25 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
local_irq_disable();
p = &s->stat_percpu[smp_processor_id()][x];
- dm_stat_round(shared, p);
+ dm_stat_round(s, shared, p);
local_irq_enable();
- memset(&shared->tmp, 0, sizeof(shared->tmp));
+ shared->tmp.sectors[READ] = 0;
+ shared->tmp.sectors[WRITE] = 0;
+ shared->tmp.ios[READ] = 0;
+ shared->tmp.ios[WRITE] = 0;
+ shared->tmp.merges[READ] = 0;
+ shared->tmp.merges[WRITE] = 0;
+ shared->tmp.ticks[READ] = 0;
+ shared->tmp.ticks[WRITE] = 0;
+ shared->tmp.io_ticks[READ] = 0;
+ shared->tmp.io_ticks[WRITE] = 0;
+ shared->tmp.io_ticks_total = 0;
+ shared->tmp.time_in_queue = 0;
+
+ if (s->n_histogram_entries)
+ memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
for_each_possible_cpu(cpu) {
p = &s->stat_percpu[cpu][x];
shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -592,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++)
+ shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+ }
}
}
@@ -621,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
p->io_ticks_total -= shared->tmp.io_ticks_total;
p->time_in_queue -= shared->tmp.time_in_queue;
local_irq_enable();
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ local_irq_disable();
+ p = &s->stat_percpu[smp_processor_id()][x];
+ p->histogram[i] -= shared->tmp.histogram[i];
+ local_irq_enable();
+ }
+ }
}
}
@@ -646,11 +766,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
/*
* This is like jiffies_to_msec, but works for 64-bit values.
*/
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
{
- unsigned long long result = 0;
+ unsigned long long result;
unsigned mult;
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+ return j;
+
+ result = 0;
if (j)
result = jiffies_to_msecs(j & 0x3fffff);
if (j >= 1 << 22) {
@@ -706,22 +830,29 @@ static int dm_stats_print(struct dm_stats *stats, int id,
__dm_stat_init_temporary_percpu_totals(shared, s, x);
- DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+ DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
(unsigned long long)start,
(unsigned long long)step,
shared->tmp.ios[READ],
shared->tmp.merges[READ],
shared->tmp.sectors[READ],
- dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+ dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
shared->tmp.ios[WRITE],
shared->tmp.merges[WRITE],
shared->tmp.sectors[WRITE],
- dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+ dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
dm_stat_in_flight(shared),
- dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
- dm_jiffies_to_msec64(shared->tmp.time_in_queue),
- dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
- dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+ dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+ dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+ }
+ }
+ DMEMIT("\n");
if (unlikely(sz + 1 >= maxlen))
goto buffer_overflow;
@@ -763,55 +894,134 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
return 0;
}
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+ unsigned long long **histogram_boundaries)
+{
+ const char *q;
+ unsigned n;
+ unsigned long long last;
+
+ *n_histogram_entries = 1;
+ for (q = h; *q; q++)
+ if (*q == ',')
+ (*n_histogram_entries)++;
+
+ *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!*histogram_boundaries)
+ return -ENOMEM;
+
+ n = 0;
+ last = 0;
+ while (1) {
+ unsigned long long hi;
+ int s;
+ char ch;
+ s = sscanf(h, "%llu%c", &hi, &ch);
+ if (!s || (s == 2 && ch != ','))
+ return -EINVAL;
+ if (hi <= last)
+ return -EINVAL;
+ last = hi;
+ (*histogram_boundaries)[n] = hi;
+ if (s == 1)
+ return 0;
+ h = strchr(h, ',') + 1;
+ n++;
+ }
+}
+
static int message_stats_create(struct mapped_device *md,
unsigned argc, char **argv,
char *result, unsigned maxlen)
{
+ int r;
int id;
char dummy;
unsigned long long start, end, len, step;
unsigned divisor;
const char *program_id, *aux_data;
+ unsigned stat_flags = 0;
+
+ unsigned n_histogram_entries = 0;
+ unsigned long long *histogram_boundaries = NULL;
+
+ struct dm_arg_set as, as_backup;
+ const char *a;
+ unsigned feature_args;
/*
* Input format:
- * <range> <step> [<program_id> [<aux_data>]]
+ * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
*/
- if (argc < 3 || argc > 5)
- return -EINVAL;
+ if (argc < 3)
+ goto ret_einval;
- if (!strcmp(argv[1], "-")) {
+ as.argc = argc;
+ as.argv = argv;
+ dm_consume_args(&as, 1);
+
+ a = dm_shift_arg(&as);
+ if (!strcmp(a, "-")) {
start = 0;
len = dm_get_size(md);
if (!len)
len = 1;
- } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+ } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
start != (sector_t)start || len != (sector_t)len)
- return -EINVAL;
+ goto ret_einval;
end = start + len;
if (start >= end)
- return -EINVAL;
+ goto ret_einval;
- if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+ a = dm_shift_arg(&as);
+ if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
+ if (!divisor)
+ return -EINVAL;
step = end - start;
if (do_div(step, divisor))
step++;
if (!step)
step = 1;
- } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+ } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
step != (sector_t)step || !step)
- return -EINVAL;
+ goto ret_einval;
+
+ as_backup = as;
+ a = dm_shift_arg(&as);
+ if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+ while (feature_args--) {
+ a = dm_shift_arg(&as);
+ if (!a)
+ goto ret_einval;
+ if (!strcasecmp(a, "precise_timestamps"))
+ stat_flags |= STAT_PRECISE_TIMESTAMPS;
+ else if (!strncasecmp(a, "histogram:", 10)) {
+ if (n_histogram_entries)
+ goto ret_einval;
+ if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+ goto ret;
+ } else
+ goto ret_einval;
+ }
+ } else {
+ as = as_backup;
+ }
program_id = "-";
aux_data = "-";
- if (argc > 3)
- program_id = argv[3];
+ a = dm_shift_arg(&as);
+ if (a)
+ program_id = a;
- if (argc > 4)
- aux_data = argv[4];
+ a = dm_shift_arg(&as);
+ if (a)
+ aux_data = a;
+
+ if (as.argc)
+ goto ret_einval;
/*
* If a buffer overflow happens after we created the region,
@@ -820,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
* leaked). So we must detect buffer overflow in advance.
*/
snprintf(result, maxlen, "%d", INT_MAX);
- if (dm_message_test_buffer_overflow(result, maxlen))
- return 1;
+ if (dm_message_test_buffer_overflow(result, maxlen)) {
+ r = 1;
+ goto ret;
+ }
- id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+ id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+ n_histogram_entries, histogram_boundaries, program_id, aux_data,
dm_internal_suspend_fast, dm_internal_resume_fast, md);
- if (id < 0)
- return id;
+ if (id < 0) {
+ r = id;
+ goto ret;
+ }
snprintf(result, maxlen, "%d", id);
- return 1;
+ r = 1;
+ goto ret;
+
+ret_einval:
+ r = -EINVAL;
+ret:
+ kfree(histogram_boundaries);
+ return r;
}
static int message_stats_delete(struct mapped_device *md,
@@ -933,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
{
int r;
- if (dm_request_based(md)) {
- DMWARN("Statistics are only supported for bio-based devices");
- return -EOPNOTSUPP;
- }
-
/* All messages here must start with '@' */
if (!strcasecmp(argv[0], "@stats_create"))
r = message_stats_create(md, argc, argv, result, maxlen);
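The histogram support added above buckets each completed I/O's duration by binary-searching the user-supplied boundaries: n boundaries define n + 1 counters, and the boundaries are compared in the same units as the recorded durations (milliseconds, or nanoseconds when precise_timestamps is set). A self-contained sketch of the bucket selection performed in dm_stat_for_entry():

static unsigned histogram_bucket(const unsigned long long *boundaries,
				 unsigned n_entries, unsigned long long duration)
{
	unsigned lo = 0, hi = n_entries + 1;

	while (lo + 1 < hi) {
		unsigned mid = (lo + hi) / 2;

		if (boundaries[mid - 1] > duration)
			hi = mid;	/* duration is below this boundary */
		else
			lo = mid;	/* duration is at or above it */
	}
	return lo;	/* index into the (n_entries + 1)-wide histogram[] */
}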
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
index e7c4984bf235..f1c0956e3843 100644
--- a/drivers/md/dm-stats.h
+++ b/drivers/md/dm-stats.h
@@ -18,6 +18,7 @@ struct dm_stats {
struct dm_stats_aux {
bool merged;
+ unsigned long long duration_ns;
};
void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
sector_t bi_sector, unsigned bi_sectors, bool end,
- unsigned long duration, struct dm_stats_aux *aux);
+ unsigned long duration_jiffies,
+ struct dm_stats_aux *aux);
static inline bool dm_stats_used(struct dm_stats *st)
{
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f8b37d4c05d8..a672a1502c14 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
int r;
r = dm_register_target(&stripe_target);
- if (r < 0) {
+ if (r < 0)
DMWARN("target registration failed");
- return r;
- }
return r;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a5f94125ad01..85e1d39e9a38 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
return -EINVAL;
}
- if (!t->mempools)
- return -ENOMEM;
+ if (IS_ERR(t->mempools))
+ return PTR_ERR(t->mempools);
return 0;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 79f694120ddf..48dfe3c4d6aa 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -184,7 +184,6 @@ struct dm_pool_metadata {
uint64_t trans_id;
unsigned long flags;
sector_t data_block_size;
- bool read_only:1;
/*
* Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
init_rwsem(&pmd->root_lock);
pmd->time = 0;
INIT_LIST_HEAD(&pmd->thin_devices);
- pmd->read_only = false;
pmd->fail_io = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
- if (!pmd->read_only && !pmd->fail_io) {
+ if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1392,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- if (pmd->fail_io)
- return -EINVAL;
-
down_read(&pmd->root_lock);
+ if (pmd->fail_io) {
+ up_read(&pmd->root_lock);
+ return -EINVAL;
+ }
if (can_issue_io) {
info = &pmd->info;
@@ -1419,6 +1418,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
return r;
}
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
+{
+ int r;
+ dm_block_t pool_end;
+ struct dm_thin_lookup_result lookup;
+
+ if (end < begin)
+ return -ENODATA;
+
+ /*
+ * Find first mapped block.
+ */
+ while (begin < end) {
+ r = dm_thin_find_block(td, begin, true, &lookup);
+ if (r) {
+ if (r != -ENODATA)
+ return r;
+ } else
+ break;
+
+ begin++;
+ }
+
+ if (begin == end)
+ return -ENODATA;
+
+ *thin_begin = begin;
+ *pool_begin = lookup.block;
+ *maybe_shared = lookup.shared;
+
+ begin++;
+ pool_end = *pool_begin + 1;
+ while (begin != end) {
+ r = dm_thin_find_block(td, begin, true, &lookup);
+ if (r) {
+ if (r == -ENODATA)
+ break;
+ else
+ return r;
+ }
+
+ if ((lookup.block != pool_end) ||
+ (lookup.shared != *maybe_shared))
+ break;
+
+ pool_end++;
+ begin++;
+ }
+
+ *thin_end = begin;
+ return 0;
+}
+
static int __insert(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block)
{
@@ -1471,6 +1527,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return 0;
}
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+ int r;
+ unsigned count;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[1] = { td->id };
+ __le64 value;
+ dm_block_t mapping_root;
+
+ /*
+ * Find the mapping tree
+ */
+ r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+ if (r)
+ return r;
+
+ /*
+ * Remove from the mapping tree, taking care to inc the
+ * ref count so it doesn't get deleted.
+ */
+ mapping_root = le64_to_cpu(value);
+ dm_tm_inc(pmd->tm, mapping_root);
+ r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+ if (r)
+ return r;
+
+ r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+ if (r)
+ return r;
+
+ td->mapped_blocks -= count;
+ td->changed = 1;
+
+ /*
+ * Reinsert the mapping tree.
+ */
+ value = cpu_to_le64(mapping_root);
+ __dm_bless_for_disk(&value);
+ return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
int r = -EINVAL;
@@ -1483,6 +1580,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
return r;
}
+int dm_thin_remove_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end)
+{
+ int r = -EINVAL;
+
+ down_write(&td->pmd->root_lock);
+ if (!td->pmd->fail_io)
+ r = __remove_range(td, begin, end);
+ up_write(&td->pmd->root_lock);
+
+ return r;
+}
+
int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
int r;
@@ -1739,7 +1849,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
down_write(&pmd->root_lock);
- pmd->read_only = true;
dm_bm_set_read_only(pmd->bm);
up_write(&pmd->root_lock);
}
@@ -1747,7 +1856,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
down_write(&pmd->root_lock);
- pmd->read_only = false;
dm_bm_set_read_write(pmd->bm);
up_write(&pmd->root_lock);
}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index fac01a96d303..a938babe4258 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -147,6 +147,15 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
int can_issue_io, struct dm_thin_lookup_result *result);
/*
+ * Retrieve the next run of contiguously mapped blocks. Useful for working
+ * out where to break up IO. Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared);
+
+/*
* Obtain an unused block.
*/
int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
@@ -158,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block);
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end);
/*
* Queries.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e852602c0091..c33f61a4cc28 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -111,22 +111,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
/*
* Key building.
*/
-static void build_data_key(struct dm_thin_device *td,
- dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+ VIRTUAL,
+ PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+ dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
- key->virtual = 0;
+ key->virtual = (ls == VIRTUAL);
key->dev = dm_thin_dev_id(td);
key->block_begin = b;
- key->block_end = b + 1ULL;
+ key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+ struct dm_cell_key *key)
+{
+ build_key(td, PHYSICAL, b, b + 1llu, key);
}
static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
struct dm_cell_key *key)
{
- key->virtual = 1;
- key->dev = dm_thin_dev_id(td);
- key->block_begin = b;
- key->block_end = b + 1ULL;
+ build_key(td, VIRTUAL, b, b + 1llu, key);
}
/*----------------------------------------------------------------*/
@@ -312,6 +320,138 @@ struct thin_c {
/*----------------------------------------------------------------*/
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ * Asynchronously issue a discard request for the sectors in question.
+ * NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ * that is being kept local to DM thinp until the block-layer changes that
+ * allow late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+ struct bio *parent_bio)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = REQ_WRITE | REQ_DISCARD;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
+ struct bio *bio;
+ int ret = 0;
+ struct blk_plug plug;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+ /*
+ * Ensure that max_discard_sectors is of the proper
+ * granularity, so that requests stay aligned after a split.
+ */
+ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+ max_discard_sectors -= max_discard_sectors % granularity;
+ if (unlikely(!max_discard_sectors)) {
+ /* Avoid infinite loop below. Being cautious never hurts. */
+ return -EOPNOTSUPP;
+ }
+
+ if (flags & BLKDEV_DISCARD_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ type |= REQ_SECURE;
+ }
+
+ blk_start_plug(&plug);
+ while (nr_sects) {
+ unsigned int req_sects;
+ sector_t end_sect, tmp;
+
+ /*
+ * Required bio_put occurs in bio_endio thanks to bio_chain below
+ */
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+ /*
+ * If splitting a request, and the next starting sector would be
+ * misaligned, stop the discard at the previous aligned sector.
+ */
+ end_sect = sector + req_sects;
+ tmp = end_sect;
+ if (req_sects < nr_sects &&
+ sector_div(tmp, granularity) != alignment) {
+ end_sect = end_sect - alignment;
+ sector_div(end_sect, granularity);
+ end_sect = end_sect * granularity + alignment;
+ req_sects = end_sect - sector;
+ }
+
+ bio_chain(bio, parent_bio);
+
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+
+ bio->bi_iter.bi_size = req_sects << 9;
+ nr_sects -= req_sects;
+ sector = end_sect;
+
+ submit_bio(type, bio);
+
+ /*
+ * We can loop for a long time in here, if someone does
+ * full device discards (like mkfs). Be nice and allow
+ * us to schedule out to avoid softlocking if preempt
+ * is disabled.
+ */
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+
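
The alignment arithmetic above is easier to see outside the kernel loop. Below is a minimal standalone sketch of the same splitting policy (userspace C, with a plain '%' standing in for sector_div and made-up example parameters in main): each chunk is capped at max_discard_sectors and, unless it is the final chunk, its end is pulled back to the previous granularity-aligned sector so the next chunk starts aligned.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical example parameters: 1MiB granularity in 512B sectors,
	 * zero alignment offset, 64MiB maximum discard per bio (already a
	 * multiple of the granularity, as the kernel code guarantees). */
	uint64_t granularity = 2048;            /* sectors */
	uint64_t alignment = 0;                 /* sectors */
	uint64_t max_discard_sectors = 131072;  /* sectors */

	uint64_t sector = 123;                  /* deliberately misaligned start */
	uint64_t nr_sects = 500000;

	while (nr_sects) {
		uint64_t req_sects = nr_sects < max_discard_sectors ?
				     nr_sects : max_discard_sectors;
		uint64_t end_sect = sector + req_sects;

		/*
		 * If this is not the final chunk and its end is not on a
		 * granularity boundary (offset by 'alignment'), pull the end
		 * back to the previous aligned sector so the next chunk
		 * starts aligned -- the same arithmetic as the loop above.
		 */
		if (req_sects < nr_sects && end_sect % granularity != alignment) {
			end_sect = ((end_sect - alignment) / granularity) *
				   granularity + alignment;
			req_sects = end_sect - sector;
		}

		printf("discard [%llu, %llu)\n",
		       (unsigned long long)sector, (unsigned long long)end_sect);

		nr_sects -= req_sects;
		sector = end_sect;
	}
	return 0;
}
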
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+ return block_size_is_power_of_two(pool) ?
+ (b << pool->sectors_per_block_shift) :
+ (b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+ struct bio *parent_bio)
+{
+ sector_t s = block_to_sectors(tc->pool, data_b);
+ sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+ return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+ GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -461,6 +601,7 @@ struct dm_thin_endio_hook {
struct dm_deferred_entry *all_io_entry;
struct dm_thin_new_mapping *overwrite_mapping;
struct rb_node rb_node;
+ struct dm_bio_prison_cell *cell;
};
static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -541,11 +682,6 @@ static void error_retry_list(struct pool *pool)
* target.
*/
-static bool block_size_is_power_of_two(struct pool *pool)
-{
- return pool->sectors_per_block_shift >= 0;
-}
-
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
@@ -559,6 +695,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
return block_nr;
}
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+ dm_block_t *begin, dm_block_t *end)
+{
+ struct pool *pool = tc->pool;
+ sector_t b = bio->bi_iter.bi_sector;
+ sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+ b += pool->sectors_per_block - 1ull; /* so we round up */
+
+ if (block_size_is_power_of_two(pool)) {
+ b >>= pool->sectors_per_block_shift;
+ e >>= pool->sectors_per_block_shift;
+ } else {
+ (void) sector_div(b, pool->sectors_per_block);
+ (void) sector_div(e, pool->sectors_per_block);
+ }
+
+ if (e < b)
+ /* Can happen if the bio is within a single block. */
+ e = b;
+
+ *begin = b;
+ *end = e;
+}
+
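
A quick worked example of the rounding above, as a standalone userspace sketch (complete_block_range is a hypothetical helper mirroring this function): the start sector is rounded up to the next block boundary and the end sector rounded down, so a bio that does not span a whole block yields an empty range.

#include <stdint.h>
#include <stdio.h>

/*
 * Compute the range of *complete* blocks covered by [sector, sector + len),
 * mirroring get_bio_block_range(): round the start up, round the end down.
 */
static void complete_block_range(uint64_t sector, uint64_t len_sectors,
				 uint64_t sectors_per_block,
				 uint64_t *begin, uint64_t *end)
{
	uint64_t b = sector + sectors_per_block - 1;	/* round up */
	uint64_t e = sector + len_sectors;		/* round down */

	b /= sectors_per_block;
	e /= sectors_per_block;

	if (e < b)
		e = b;		/* bio lies entirely within a single block */

	*begin = b;
	*end = e;
}

int main(void)
{
	uint64_t begin, end;

	/* A discard of 300 sectors starting at sector 100, 128-sector blocks:
	 * only blocks 1 and 2 (sectors 128..383) are completely covered. */
	complete_block_range(100, 300, 128, &begin, &end);
	printf("blocks [%llu, %llu)\n",
	       (unsigned long long)begin, (unsigned long long)end);

	/* A 10-sector discard inside block 0 covers no complete block. */
	complete_block_range(20, 10, 128, &begin, &end);
	printf("blocks [%llu, %llu)\n",
	       (unsigned long long)begin, (unsigned long long)end);

	return 0;
}
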
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
struct pool *pool = tc->pool;
@@ -647,7 +811,7 @@ struct dm_thin_new_mapping {
struct list_head list;
bool pass_discard:1;
- bool definitely_not_shared:1;
+ bool maybe_shared:1;
/*
* Track quiescing, copying and zeroing preparation actions. When this
@@ -658,9 +822,9 @@ struct dm_thin_new_mapping {
int err;
struct thin_c *tc;
- dm_block_t virt_block;
+ dm_block_t virt_begin, virt_end;
dm_block_t data_block;
- struct dm_bio_prison_cell *cell, *cell2;
+ struct dm_bio_prison_cell *cell;
/*
* If the bio covers the whole area of a block then we can avoid
@@ -705,6 +869,8 @@ static void overwrite_endio(struct bio *bio, int err)
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
+ bio->bi_end_io = m->saved_bi_end_io;
+
m->err = err;
complete_mapping_preparation(m);
}
@@ -793,9 +959,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
- if (m->bio)
- m->bio->bi_end_io = m->saved_bi_end_io;
-
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
@@ -805,13 +968,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
struct pool *pool = tc->pool;
- struct bio *bio;
+ struct bio *bio = m->bio;
int r;
- bio = m->bio;
- if (bio)
- bio->bi_end_io = m->saved_bi_end_io;
-
if (m->err) {
cell_error(pool, m->cell);
goto out;
@@ -822,7 +981,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* Any I/O for this block arriving after this point will get
* remapped to it directly.
*/
- r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+ r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell);
@@ -849,50 +1008,112 @@ out:
mempool_free(m, pool->mapping_pool);
}
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ if (m->cell)
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
bio_io_error(m->bio);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+ bio_endio(m->bio, 0);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+ if (r) {
+ metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+ bio_io_error(m->bio);
+ } else
+ bio_endio(m->bio, 0);
+
cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
{
+ /*
+ * We've already unmapped this range of blocks, but before we pass the
+ * discard down we have to check that these blocks are now unused.
+ */
+ int r;
+ bool used = true;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
+ dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
- inc_all_io_entry(tc->pool, m->bio);
- cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
+ while (b != end) {
+ /* find start of unmapped run */
+ for (; b < end; b++) {
+ r = dm_pool_block_is_used(pool->pmd, b, &used);
+ if (r)
+ return r;
- if (m->pass_discard)
- if (m->definitely_not_shared)
- remap_and_issue(tc, m->bio, m->data_block);
- else {
- bool used = false;
- if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
- bio_endio(m->bio, 0);
- else
- remap_and_issue(tc, m->bio, m->data_block);
+ if (!used)
+ break;
}
- else
- bio_endio(m->bio, 0);
- mempool_free(m, tc->pool->mapping_pool);
+ if (b == end)
+ break;
+
+ /* find end of run */
+ for (e = b + 1; e != end; e++) {
+ r = dm_pool_block_is_used(pool->pmd, e, &used);
+ if (r)
+ return r;
+
+ if (used)
+ break;
+ }
+
+ r = issue_discard(tc, b, e, m->bio);
+ if (r)
+ return r;
+
+ b = e;
+ }
+
+ return 0;
}
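
The two-phase scan above (skip still-used blocks, then extend the unused run) can be shown in isolation. In this toy standalone sketch a plain bool array stands in for dm_pool_block_is_used() and printf() stands in for issue_discard():

#include <stdbool.h>
#include <stdio.h>

/*
 * Walk [b, end) and report each maximal run of unused blocks, in the same
 * two-phase style as the loop above: first skip used blocks to find the
 * start of a run, then extend the run until a used block (or 'end') is hit.
 */
static void discard_unused_runs(const bool *used, unsigned b, unsigned end)
{
	while (b != end) {
		unsigned e;

		/* find start of unused run */
		while (b < end && used[b])
			b++;
		if (b == end)
			break;

		/* find end of run */
		for (e = b + 1; e != end && !used[e]; e++)
			;

		printf("discard blocks [%u, %u)\n", b, e);
		b = e;
	}
}

int main(void)
{
	/* 1 = still referenced elsewhere (e.g. by a snapshot), 0 = unused */
	const bool used[10] = { 1, 0, 0, 1, 1, 0, 1, 0, 0, 0 };

	discard_unused_runs(used, 0, 10);
	return 0;
}
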
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
int r;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
- r = dm_thin_remove_block(tc->td, m->virt_block);
+ r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
if (r)
- DMERR_LIMIT("dm_thin_remove_block() failed");
+ metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+ else if (m->maybe_shared)
+ r = passdown_double_checking_shared_status(m);
+ else
+ r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
- process_prepared_discard_passdown(m);
+ /*
+ * Even if r is set, there could be sub discards in flight that we
+ * need to wait for.
+ */
+ bio_endio(m->bio, r);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -976,7 +1197,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
- dm_block_t data_block,
+ dm_block_t data_begin,
struct dm_thin_new_mapping *m)
{
struct pool *pool = tc->pool;
@@ -986,7 +1207,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ remap_and_issue(tc, bio, data_begin);
}
/*
@@ -1003,7 +1224,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_thin_new_mapping *m = get_next_mapping(pool);
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_dest;
m->cell = cell;
@@ -1082,7 +1304,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
@@ -1091,16 +1314,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->pf.zero_new_blocks)
+ if (pool->pf.zero_new_blocks) {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
+ else
+ ll_zero(tc, m, data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+ } else
process_prepared_mapping(m);
-
- else if (io_overwrites_block(pool, bio))
- remap_and_issue_overwrite(tc, bio, data_block, m);
-
- else
- ll_zero(tc, m,
- data_block * pool->sectors_per_block,
- (data_block + 1) * pool->sectors_per_block);
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1291,99 +1512,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
retry_on_resume(bio);
}
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+ struct dm_bio_prison_cell *virt_cell)
{
- int r;
- struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell2;
- struct dm_cell_key key2;
- dm_block_t block = get_bio_block(tc, bio);
- struct dm_thin_lookup_result lookup_result;
- struct dm_thin_new_mapping *m;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
- if (tc->requeue_mode) {
- cell_requeue(pool, cell);
- return;
- }
+ /*
+ * We don't need to lock the data blocks, since there's no
+ * passdown. We only lock data blocks for allocation and breaking sharing.
+ */
+ m->tc = tc;
+ m->virt_begin = virt_cell->key.block_begin;
+ m->virt_end = virt_cell->key.block_end;
+ m->cell = virt_cell;
+ m->bio = virt_cell->holder;
- r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
- switch (r) {
- case 0:
- /*
- * Check nobody is fiddling with this pool block. This can
- * happen if someone's in the process of breaking sharing
- * on this block.
- */
- build_data_key(tc->td, lookup_result.block, &key2);
- if (bio_detain(tc->pool, &key2, bio, &cell2)) {
- cell_defer_no_holder(tc, cell);
- break;
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+}
- if (io_overlaps_block(pool, bio)) {
- /*
- * IO may still be going to the destination block. We must
- * quiesce before we can do the removal.
- */
- m = get_next_mapping(pool);
- m->tc = tc;
- m->pass_discard = pool->pf.discard_passdown;
- m->definitely_not_shared = !lookup_result.shared;
- m->virt_block = block;
- m->data_block = lookup_result.block;
- m->cell = cell;
- m->cell2 = cell2;
- m->bio = bio;
-
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
- pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer the parent bio's end_io until we
+ * _know_ all chained sub-range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
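
For readers unfamiliar with the remaining-count mechanism this piggybacks on, here is a toy userspace model (struct parent and parent_put() are invented for illustration): the parent only completes when the last reference is dropped, which is why each sub-discard must take an extra reference before the parent's own bio_endio() runs.

#include <stdatomic.h>
#include <stdio.h>

/* Toy stand-in for a parent bio's __bi_remaining counter. */
struct parent {
	atomic_int remaining;
};

static void parent_put(struct parent *p, const char *who)
{
	/* The completion handler only runs on the final put. */
	if (atomic_fetch_sub(&p->remaining, 1) == 1)
		printf("parent completes (last put by %s)\n", who);
	else
		printf("%s done, parent still pending\n", who);
}

int main(void)
{
	struct parent p;

	atomic_init(&p.remaining, 1);	/* the submitter's own reference */

	/* chain three children: each takes a reference, like __bio_inc_remaining() */
	atomic_fetch_add(&p.remaining, 3);

	parent_put(&p, "submitter");	/* bio_endio() on the parent */
	parent_put(&p, "child 0");
	parent_put(&p, "child 1");
	parent_put(&p, "child 2");	/* the last child actually completes it */
	return 0;
}
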
- } else {
- inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
- cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+ struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ int r;
+ bool maybe_shared;
+ struct dm_cell_key data_key;
+ struct dm_bio_prison_cell *data_cell;
+ struct dm_thin_new_mapping *m;
+ dm_block_t virt_begin, virt_end, data_begin;
+
+ while (begin != end) {
+ r = ensure_next_mapping(pool);
+ if (r)
+ /* we did our best */
+ return;
+
+ r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+ &data_begin, &maybe_shared);
+ if (r)
/*
- * The DM core makes sure that the discard doesn't span
- * a block boundary. So we submit the discard of a
- * partial block appropriately.
+ * Silently fail, letting any mappings we've
+ * created complete.
*/
- if ((!lookup_result.shared) && pool->pf.discard_passdown)
- remap_and_issue(tc, bio, lookup_result.block);
- else
- bio_endio(bio, 0);
+ break;
+
+ build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+ if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+ /* contention, we'll give up on this range */
+ begin = virt_end;
+ continue;
}
- break;
- case -ENODATA:
/*
- * It isn't provisioned, just forget it.
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
*/
- cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
- break;
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->maybe_shared = maybe_shared;
+ m->virt_begin = virt_begin;
+ m->virt_end = virt_end;
+ m->data_block = data_begin;
+ m->cell = data_cell;
+ m->bio = bio;
- default:
- DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
- __func__, r);
- cell_defer_no_holder(tc, cell);
- bio_io_error(bio);
- break;
+ /*
+ * The parent bio must not complete before sub discard bios are
+ * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+ *
+ * This per-mapping bi_remaining increment is paired with
+ * the implicit decrement that occurs via bio_endio() in
+ * process_prepared_discard_{passdown,no_passdown}.
+ */
+ __bio_inc_remaining(bio);
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
+ begin = virt_end;
}
}
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+ struct bio *bio = virt_cell->holder;
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ /*
+ * The virt_cell will only get freed once the origin bio completes.
+ * This means it will remain locked while all the individual
+ * passdown bios are in flight.
+ */
+ h->cell = virt_cell;
+ break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+ /*
+ * We complete the bio now, knowing that the bi_remaining field
+ * will prevent completion until the sub range discards have
+ * completed.
+ */
+ bio_endio(bio, 0);
+}
+
static void process_discard_bio(struct thin_c *tc, struct bio *bio)
{
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
- dm_block_t block = get_bio_block(tc, bio);
+ dm_block_t begin, end;
+ struct dm_cell_key virt_key;
+ struct dm_bio_prison_cell *virt_cell;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ get_bio_block_range(tc, bio, &begin, &end);
+ if (begin == end) {
+ /*
+ * The discard doesn't cover any complete blocks.
+ */
+ bio_endio(bio, 0);
+ return;
+ }
+
+ build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+ if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+ /*
+ * Potential starvation issue: We're relying on the
+ * fs/application being well behaved, and not trying to
+ * send IO to a region at the same time as discarding it.
+ * If they do this persistently then it's possible this
+ * cell will never be granted.
+ */
return;
- process_discard_cell(tc, cell);
+ tc->pool->process_discard_cell(tc, virt_cell);
}
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2099,6 +2370,24 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static bool passdown_enabled(struct pool_c *pt)
+{
+ return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+ struct pool_c *pt = pool->ti->private;
+
+ if (passdown_enabled(pt)) {
+ pool->process_discard_cell = process_discard_cell_passdown;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+ } else {
+ pool->process_discard_cell = process_discard_cell_no_passdown;
+ pool->process_prepared_discard = process_prepared_discard_no_passdown;
+ }
+}
+
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
struct pool_c *pt = pool->ti->private;
@@ -2150,7 +2439,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_cell = process_cell_read_only;
pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
- pool->process_prepared_discard = process_prepared_discard_passdown;
+ pool->process_prepared_discard = process_prepared_discard_success;
error_retry_list(pool);
break;
@@ -2169,9 +2458,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
if (!pool->pf.error_if_no_space && no_space_timeout)
queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2184,9 +2472,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
break;
}
@@ -2275,6 +2562,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->shared_read_entry = NULL;
h->all_io_entry = NULL;
h->overwrite_mapping = NULL;
+ h->cell = NULL;
}
/*
@@ -2422,7 +2710,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
- sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
const char *reason = NULL;
char buf[BDEVNAME_SIZE];
@@ -2435,12 +2722,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->max_discard_sectors < pool->sectors_per_block)
reason = "max discard sectors smaller than a block";
- else if (data_limits->discard_granularity > block_size)
- reason = "discard granularity larger than a block";
-
- else if (!is_factor(block_size, data_limits->discard_granularity))
- reason = "discard granularity not a factor of block size";
-
if (reason) {
DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
pt->adjusted_pf.discard_passdown = false;
@@ -3375,7 +3656,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(pool) >= PM_READ_ONLY) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md));
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (!strcasecmp(argv[0], "create_thin"))
@@ -3573,24 +3854,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
- struct pool *pool = pt->pool;
- struct queue_limits *data_limits;
-
- limits->max_discard_sectors = pool->sectors_per_block;
-
- /*
- * discard_granularity is just a hint, and not enforced.
- */
- if (pt->adjusted_pf.discard_passdown) {
- data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = max(data_limits->discard_granularity,
- pool->sectors_per_block << SECTOR_SHIFT);
- } else
- limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -3645,14 +3908,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
disable_passdown_if_not_supported(pt);
- set_discard_limits(pt, limits);
+ /*
+ * The pool uses the same discard limits as the underlying data
+ * device. DM core has already set this up.
+ */
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3811,8 +4077,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
- /* Discard bios must be split on a block boundary */
- ti->split_discard_bios = true;
+ ti->split_discard_bios = false;
}
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3899,6 +4164,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
}
}
+ if (h->cell)
+ cell_defer_no_holder(h->tc, h->cell);
+
return 0;
}
@@ -4026,9 +4294,18 @@ static int thin_iterate_devices(struct dm_target *ti,
return 0;
}
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -4040,6 +4317,7 @@ static struct target_type thin_target = {
.status = thin_status,
.merge = thin_merge,
.iterate_devices = thin_iterate_devices,
+ .io_hints = thin_io_hints,
};
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d72829922eb6..2fe0992c14a7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,6 +86,9 @@ struct dm_rq_target_io {
struct kthread_work work;
int error;
union map_info info;
+ struct dm_stats_aux stats_aux;
+ unsigned long duration_jiffies;
+ unsigned n_sectors;
};
/*
@@ -995,6 +998,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
}
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+ if (unlikely(dm_stats_used(&md->stats))) {
+ struct dm_rq_target_io *tio = tio_from_request(orig);
+ tio->duration_jiffies = jiffies - tio->duration_jiffies;
+ dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+ tio->n_sectors, true, tio->duration_jiffies,
+ &tio->stats_aux);
+ }
+}
+
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
@@ -1078,6 +1092,7 @@ static void dm_end_request(struct request *clone, int error)
}
free_rq_clone(clone);
+ rq_end_stats(md, rq);
if (!rq->q->mq_ops)
blk_end_request_all(rq, error);
else
@@ -1113,13 +1128,14 @@ static void old_requeue_request(struct request *rq)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
- struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+ struct request *rq)
{
int rw = rq_data_dir(rq);
dm_unprep_request(rq);
+ rq_end_stats(md, rq);
if (!rq->q->mq_ops)
old_requeue_request(rq);
else {
@@ -1130,13 +1146,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
rq_completed(md, rw, false);
}
-static void dm_requeue_unmapped_request(struct request *clone)
-{
- struct dm_rq_target_io *tio = clone->end_io_data;
-
- dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
static void old_stop_queue(struct request_queue *q)
{
unsigned long flags;
@@ -1200,7 +1209,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
return;
else if (r == DM_ENDIO_REQUEUE)
/* The target wants to requeue the I/O */
- dm_requeue_unmapped_request(clone);
+ dm_requeue_original_request(tio->md, tio->orig);
else {
DMWARN("unimplemented target endio return value: %d", r);
BUG();
@@ -1218,6 +1227,7 @@ static void dm_softirq_done(struct request *rq)
int rw;
if (!clone) {
+ rq_end_stats(tio->md, rq);
rw = rq_data_dir(rq);
if (!rq->q->mq_ops) {
blk_end_request_all(rq, tio->error);
@@ -1910,7 +1920,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
- dm_requeue_unmapped_request(clone);
+ dm_requeue_original_request(md, tio->orig);
break;
default:
if (r > 0) {
@@ -1933,7 +1943,7 @@ static void map_tio_request(struct kthread_work *work)
struct mapped_device *md = tio->md;
if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
- dm_requeue_unmapped_original_request(md, rq);
+ dm_requeue_original_request(md, rq);
}
static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -1950,6 +1960,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
md->last_rq_start_time = ktime_get();
}
+ if (unlikely(dm_stats_used(&md->stats))) {
+ struct dm_rq_target_io *tio = tio_from_request(orig);
+ tio->duration_jiffies = jiffies;
+ tio->n_sectors = blk_rq_sectors(orig);
+ dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+ tio->n_sectors, false, 0, &tio->stats_aux);
+ }
+
/*
* Hold the md reference here for the in-flight I/O.
* We can't rely on the reference count by device opener,
@@ -2173,6 +2191,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
}
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+ cleanup_srcu_struct(&md->io_barrier);
+
+ if (md->wq)
+ destroy_workqueue(md->wq);
+ if (md->kworker_task)
+ kthread_stop(md->kworker_task);
+ if (md->io_pool)
+ mempool_destroy(md->io_pool);
+ if (md->rq_pool)
+ mempool_destroy(md->rq_pool);
+ if (md->bs)
+ bioset_free(md->bs);
+
+ if (md->disk) {
+ spin_lock(&_minor_lock);
+ md->disk->private_data = NULL;
+ spin_unlock(&_minor_lock);
+ if (blk_get_integrity(md->disk))
+ blk_integrity_unregister(md->disk);
+ del_gendisk(md->disk);
+ put_disk(md->disk);
+ }
+
+ if (md->queue)
+ blk_cleanup_queue(md->queue);
+
+ if (md->bdev) {
+ bdput(md->bdev);
+ md->bdev = NULL;
+ }
+}
+
/*
* Allocate and initialise a blank device with a given minor.
*/
@@ -2218,13 +2270,13 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
- goto bad_queue;
+ goto bad;
dm_init_md_queue(md);
md->disk = alloc_disk(1);
if (!md->disk)
- goto bad_disk;
+ goto bad;
atomic_set(&md->pending[0], 0);
atomic_set(&md->pending[1], 0);
@@ -2245,11 +2297,11 @@ static struct mapped_device *alloc_dev(int minor)
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
if (!md->wq)
- goto bad_thread;
+ goto bad;
md->bdev = bdget_disk(md->disk, 0);
if (!md->bdev)
- goto bad_bdev;
+ goto bad;
bio_init(&md->flush_bio);
md->flush_bio.bi_bdev = md->bdev;
@@ -2266,15 +2318,8 @@ static struct mapped_device *alloc_dev(int minor)
return md;
-bad_bdev:
- destroy_workqueue(md->wq);
-bad_thread:
- del_gendisk(md->disk);
- put_disk(md->disk);
-bad_disk:
- blk_cleanup_queue(md->queue);
-bad_queue:
- cleanup_srcu_struct(&md->io_barrier);
+bad:
+ cleanup_mapped_device(md);
bad_io_barrier:
free_minor(minor);
bad_minor:
@@ -2291,71 +2336,65 @@ static void free_dev(struct mapped_device *md)
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
- destroy_workqueue(md->wq);
- if (md->kworker_task)
- kthread_stop(md->kworker_task);
- if (md->io_pool)
- mempool_destroy(md->io_pool);
- if (md->rq_pool)
- mempool_destroy(md->rq_pool);
- if (md->bs)
- bioset_free(md->bs);
+ cleanup_mapped_device(md);
+ if (md->use_blk_mq)
+ blk_mq_free_tag_set(&md->tag_set);
- cleanup_srcu_struct(&md->io_barrier);
free_table_devices(&md->table_devices);
dm_stats_cleanup(&md->stats);
-
- spin_lock(&_minor_lock);
- md->disk->private_data = NULL;
- spin_unlock(&_minor_lock);
- if (blk_get_integrity(md->disk))
- blk_integrity_unregister(md->disk);
- del_gendisk(md->disk);
- put_disk(md->disk);
- blk_cleanup_queue(md->queue);
- if (md->use_blk_mq)
- blk_mq_free_tag_set(&md->tag_set);
- bdput(md->bdev);
free_minor(minor);
module_put(THIS_MODULE);
kfree(md);
}
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+ if (type == DM_TYPE_BIO_BASED)
+ return type;
+
+ return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
struct dm_md_mempools *p = dm_table_get_md_mempools(t);
- if (md->bs) {
- /* The md already has necessary mempools. */
- if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+ switch (filter_md_type(dm_table_get_type(t), md)) {
+ case DM_TYPE_BIO_BASED:
+ if (md->bs && md->io_pool) {
/*
+ * This bio-based md already has necessary mempools.
* Reload bioset because front_pad may have changed
* because a different table was loaded.
*/
bioset_free(md->bs);
md->bs = p->bs;
p->bs = NULL;
+ goto out;
}
- /*
- * There's no need to reload with request-based dm
- * because the size of front_pad doesn't change.
- * Note for future: If you are to reload bioset,
- * prep-ed requests in the queue may refer
- * to bio from the old bioset, so you must walk
- * through the queue to unprep.
- */
- goto out;
+ break;
+ case DM_TYPE_REQUEST_BASED:
+ if (md->rq_pool && md->io_pool)
+ /*
+ * This request-based md already has necessary mempools.
+ */
+ goto out;
+ break;
+ case DM_TYPE_MQ_REQUEST_BASED:
+ BUG_ON(p); /* No mempools needed */
+ return;
}
+ BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
md->io_pool = p->io_pool;
p->io_pool = NULL;
md->rq_pool = p->rq_pool;
p->rq_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
-
out:
/* mempool bind completed, no longer need any mempools in the table */
dm_table_free_md_mempools(t);
@@ -2675,6 +2714,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
/* Direct call is fine since .queue_rq allows allocations */
if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
/* Undo dm_start_request() before requeuing */
+ rq_end_stats(md, rq);
rq_completed(md, rq_data_dir(rq), false);
return BLK_MQ_RQ_QUEUE_BUSY;
}
@@ -2734,14 +2774,6 @@ out_tag_set:
return err;
}
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
- if (type == DM_TYPE_BIO_BASED)
- return type;
-
- return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
/*
* Setup the DM device's queue based on md's type
*/
@@ -3463,7 +3495,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
pools = kzalloc(sizeof(*pools), GFP_KERNEL);
if (!pools)
- return NULL;
+ return ERR_PTR(-ENOMEM);
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
offsetof(struct dm_target_io, clone);
@@ -3482,24 +3514,26 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
return pools;
out:
dm_free_md_mempools(pools);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
unsigned type)
{
- unsigned int pool_size = dm_get_reserved_rq_based_ios();
+ unsigned int pool_size;
struct dm_md_mempools *pools;
+ if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
+ return NULL; /* No mempools needed */
+
+ pool_size = dm_get_reserved_rq_based_ios();
pools = kzalloc(sizeof(*pools), GFP_KERNEL);
if (!pools)
- return NULL;
+ return ERR_PTR(-ENOMEM);
- if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
- pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
- if (!pools->rq_pool)
- goto out;
- }
+ pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+ if (!pools->rq_pool)
+ goto out;
pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
if (!pools->io_pool)
@@ -3508,7 +3542,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
return pools;
out:
dm_free_md_mempools(pools);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
void dm_free_md_mempools(struct dm_md_mempools *pools)
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 087411c95ffc..4d6c9b689eaa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
dm_bufio_prefetch(bm->bufio, b, 1);
}
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+ return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
void dm_bm_set_read_only(struct dm_block_manager *bm)
{
bm->read_only = true;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 1b95dfc17786..84330f59886d 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
* Additionally you should not use dm_bm_unlock_move, however no error will
* be returned if you do.
*/
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
void dm_bm_set_read_only(struct dm_block_manager *bm);
void dm_bm_set_read_write(struct dm_block_manager *bm);
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index b88757cd0d1d..e04cfd2d60ef 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -590,3 +590,130 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, dm_block_t root,
+ uint64_t key, int *index)
+{
+ int i = *index, r;
+ struct btree_node *n;
+
+ for (;;) {
+ r = shadow_step(s, root, vt);
+ if (r < 0)
+ break;
+
+ /*
+ * We have to patch up the parent node, ugly, but I don't
+ * see a way to do this automatically as part of the spine
+ * op.
+ */
+ if (shadow_has_parent(s)) {
+ __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+ &location, sizeof(__le64));
+ }
+
+ n = dm_block_data(shadow_current(s));
+
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+ *index = lower_bound(n, key);
+ return 0;
+ }
+
+ r = rebalance_children(s, info, vt, key);
+ if (r)
+ break;
+
+ n = dm_block_data(shadow_current(s));
+ if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+ *index = lower_bound(n, key);
+ return 0;
+ }
+
+ i = lower_bound(n, key);
+
+ /*
+ * We know the key is present, or else
+ * rebalance_children would have returned
+ * -ENODATA
+ */
+ root = value64(n, i);
+ }
+
+ return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed)
+{
+ unsigned level, last_level = info->levels - 1;
+ int index = 0, r = 0;
+ struct shadow_spine spine;
+ struct btree_node *n;
+ uint64_t k;
+
+ init_shadow_spine(&spine, info);
+ for (level = 0; level < last_level; level++) {
+ r = remove_raw(&spine, info, &le64_type,
+ root, keys[level], (unsigned *) &index);
+ if (r < 0)
+ goto out;
+
+ n = dm_block_data(shadow_current(&spine));
+ root = value64(n, index);
+ }
+
+ r = remove_nearest(&spine, info, &info->value_type,
+ root, keys[last_level], &index);
+ if (r < 0)
+ goto out;
+
+ n = dm_block_data(shadow_current(&spine));
+
+ if (index < 0)
+ index = 0;
+
+ if (index >= le32_to_cpu(n->header.nr_entries)) {
+ r = -ENODATA;
+ goto out;
+ }
+
+ k = le64_to_cpu(n->keys[index]);
+ if (k >= keys[last_level] && k < end_key) {
+ if (info->value_type.dec)
+ info->value_type.dec(info->value_type.context,
+ value_ptr(n, index));
+
+ delete_at(n, index);
+
+ } else
+ r = -ENODATA;
+
+out:
+ *new_root = shadow_root(&spine);
+ exit_shadow_spine(&spine);
+
+ return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *first_key, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed)
+{
+ int r;
+
+ *nr_removed = 0;
+ do {
+ r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+ if (!r)
+ (*nr_removed)++;
+ } while (!r);
+
+ *new_root = root;
+ return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index dacfc34180b4..11d8cf78621d 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -135,6 +135,15 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root);
/*
+ * Removes values with keys in the range ['keys', 'keys2'), where 'keys2'
+ * is 'keys' with its final component replaced by 'end_key'. 'end_key' is
+ * the one-past-the-end value. 'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+ uint64_t *keys, uint64_t end_key,
+ dm_block_t *new_root, unsigned *nr_removed);
+
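
A toy standalone sketch of how such an interface is typically driven (compare dm_btree_remove_leaves() in dm-btree-remove.c, which calls remove_one() until it returns -ENODATA); the sorted array stands in for a leaf's entries, and toy_remove_one()/toy_remove_leaves() are invented names:

#include <errno.h>
#include <stdio.h>

/*
 * Remove the lowest key in [first_key, end_key) from a sorted array, or
 * return -ENODATA when nothing in that range remains.
 */
static int toy_remove_one(unsigned long *keys, unsigned *nr_keys,
			  unsigned long first_key, unsigned long end_key)
{
	unsigned i;

	for (i = 0; i < *nr_keys; i++) {
		if (keys[i] < first_key)
			continue;
		if (keys[i] >= end_key)
			break;

		/* found one: delete it, like delete_at() on a leaf node */
		for (; i + 1 < *nr_keys; i++)
			keys[i] = keys[i + 1];
		(*nr_keys)--;
		return 0;
	}

	return -ENODATA;
}

/* Loop until the range is empty, counting removals, as the wrapper does. */
static int toy_remove_leaves(unsigned long *keys, unsigned *nr_keys,
			     unsigned long first_key, unsigned long end_key,
			     unsigned *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = toy_remove_one(keys, nr_keys, first_key, end_key);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	return r == -ENODATA ? 0 : r;
}

int main(void)
{
	unsigned long keys[] = { 1, 4, 5, 6, 9, 12 };
	unsigned nr_keys = 6, nr_removed;

	/* remove everything in [4, 10): that is 4, 5, 6 and 9 */
	toy_remove_leaves(keys, &nr_keys, 4, 10, &nr_removed);
	printf("removed %u entries, %u left\n", nr_removed, nr_keys);
	return 0;
}
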
+/*
* Returns < 0 on failure. Otherwise the number of key entries that have
* been filled out. Remember trees can have zero entries, and as such have
* no lowest key.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e8a904298887..53091295fce9 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -204,6 +204,27 @@ static void in(struct sm_metadata *smm)
smm->recursion_count++;
}
+static int apply_bops(struct sm_metadata *smm)
+{
+ int r = 0;
+
+ while (!brb_empty(&smm->uncommitted)) {
+ struct block_op bop;
+
+ r = brb_pop(&smm->uncommitted, &bop);
+ if (r) {
+ DMERR("bug in bop ring buffer");
+ break;
+ }
+
+ r = commit_bop(smm, &bop);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
static int out(struct sm_metadata *smm)
{
int r = 0;
@@ -216,21 +237,8 @@ static int out(struct sm_metadata *smm)
return -ENOMEM;
}
- if (smm->recursion_count == 1) {
- while (!brb_empty(&smm->uncommitted)) {
- struct block_op bop;
-
- r = brb_pop(&smm->uncommitted, &bop);
- if (r) {
- DMERR("bug in bop ring buffer");
- break;
- }
-
- r = commit_bop(smm, &bop);
- if (r)
- break;
- }
- }
+ if (smm->recursion_count == 1)
+ apply_bops(smm);
smm->recursion_count--;
@@ -704,6 +712,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
}
old_len = smm->begin;
+ r = apply_bops(smm);
+ if (r) {
+ DMERR("%s: apply_bops failed", __func__);
+ goto out;
+ }
+
r = sm_ll_commit(&smm->ll);
if (r)
goto out;
@@ -773,6 +787,12 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
if (r)
return r;
+ r = apply_bops(smm);
+ if (r) {
+ DMERR("%s: apply_bops failed", __func__);
+ return r;
+ }
+
return sm_metadata_commit(sm);
}