Merge tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe: - Improvements to the queue_rqs() support, and adding null_blk support for that as well (Chengming) - Series improving badblocks support (Coly) - Key store support for sed-opal (Greg) - IBM partition string handling improvements (Jan) - Make number of ublk devices supported configurable (Mike) - Cancelation improvements for ublk (Ming) - MD pull requests via Song: - Handle timeout in md-cluster, by Denis Plotnikov - Cleanup pers->prepare_suspend, by Yu Kuai - Rewrite mddev_suspend(), by Yu Kuai - Simplify md_seq_ops, by Yu Kuai - Reduce unnecessary locking array_state_store(), by Mariusz Tkaczyk - Make rdev add/remove independent from daemon thread, by Yu Kuai - Refactor code around quiesce() and mddev_suspend(), by Yu Kuai - NVMe pull request via Keith: - nvme-auth updates (Mark) - nvme-tcp tls (Hannes) - nvme-fc annotaions (Kees) - Misc cleanups and improvements (Jiapeng, Joel) * tag 'for-6.7/block-2023-10-30' of git://git.kernel.dk/linux: (95 commits) block: ublk_drv: Remove unused function md: cleanup pers->prepare_suspend() nvme-auth: allow mixing of secret and hash lengths nvme-auth: use transformed key size to create resp nvme-auth: alloc nvme_dhchap_key as single buffer nvmet-tcp: use 'spin_lock_bh' for state_lock() powerpc/pseries: PLPKS SED Opal keystore support block: sed-opal: keystore access for SED Opal keys block:sed-opal: SED Opal keystore ublk: simplify aborting request ublk: replace monitor with cancelable uring_cmd ublk: quiesce request queue when aborting queue ublk: rename mm_lock as lock ublk: move ublk_cancel_dev() out of ub->mutex ublk: make sure io cmd handled in submitter task context ublk: don't get ublk device reference in ublk_abort_queue() ublk: Make ublks_max configurable ublk: Limit dev_id/ub_number values md-cluster: check for timeout while a new disk adding nvme: rework NVME_AUTH Kconfig selection ...
author: Linus Torvalds 2023-11-01 12:30:07 -1000
committer: Linus Torvalds 2023-11-01 12:30:07 -1000
commit: 90d624af2e5a9945eedd5cafd6ae6d88f32cc977 (patch)
tree: e936a0cc8f2b613f327ab08280dccbad664703cf /block
parent: 4de520f1fcefd4ebb7dddcf28bde1b330c2f6b5d (diff)
parent: 0c696bb38f4cc0f0f90a8e06ae1eda21a9630cd0 (diff)
8 files changed, 1465 insertions, 384 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0d93..55ae2286a4de 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,7 @@ config BLK_SED_OPAL
 	bool "Logic for interfacing with Opal enabled SEDs"
 	depends on KEYS
 	select PSERIES_PLPKS if PPC_PSERIES
+	select PSERIES_PLPKS_SED if PPC_PSERIES
 	help
 	Builds Logic for interfacing with Opal enabled controllers.
 	Enabling this option enables users to setup/unlock/lock
diff --git a/block/badblocks.c b/block/badblocks.c
index 3afb550c0f7b..fc92d4e18aa3 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -16,119 +16,830 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
-/**
- * badblocks_check() - check a given range for bad sectors
- * @bb:		the badblocks structure that holds all badblock information
- * @s:		sector (start) at which to check for badblocks
- * @sectors:	number of sectors to check for badblocks
- * @first_bad:	pointer to store location of the first badblock
- * @bad_sectors: pointer to store number of badblocks after @first_bad
+/*
+ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
+ * identified by LBA addresses.
  *
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide.  This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- *  A 'shift' can be set so that larger blocks are tracked and
- *  consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ * When the caller of badblocks_set() wants to set a range of bad blocks, the
+ * setting range can be acked or unacked. And the setting range may merge,
+ * overwrite, skip the overlapped already set range, depends on who they are
+ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
+ * more complicated when the setting range covers multiple already set bad block
+ * ranges, with restrictions of maximum length of each bad range and the bad
+ * table space limitation.
  *
- * Locking of the bad-block table uses a seqlock so badblocks_check
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
+ * It is difficult and unnecessary to take care of all the possible situations,
+ * for setting a large range of bad blocks, we can handle it by dividing the
+ * large range into smaller ones when encounter overlap, max range length or
+ * bad table full conditions. Every time only a smaller piece of the bad range
+ * is handled with a limited number of conditions how it is interacted with
+ * possible overlapped or adjacent already set bad block ranges. Then the hard
+ * complicated problem can be much simpler to handle in proper way.
  *
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad.  So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
+ * When setting a range of bad blocks to the bad table, the simplified situations
+ * to be considered are, (The already set bad blocks ranges are naming with
+ *  prefix E, and the setting bad blocks range is naming with prefix S)
  *
- * Return:
- *  0: there are no known bad blocks in the range
- *  1: there are known bad block which are all acknowledged
- * -1: there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
+ * 1) A setting range is not overlapped or adjacent to any other already set bad
+ *    block range.
+ *                         +--------+
+ *                         |    S   |
+ *                         +--------+
+ *        +-------------+               +-------------+
+ *        |      E1     |               |      E2     |
+ *        +-------------+               +-------------+
+ *    For this situation if the bad blocks table is not full, just allocate a
+ *    free slot from the bad blocks table to mark the setting range S. The
+ *    result is,
+ *        +-------------+  +--------+   +-------------+
+ *        |      E1     |  |    S   |   |      E2     |
+ *        +-------------+  +--------+   +-------------+
+ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
+ *    range.
+ * 2.1) The setting range size < already set range size
+ *        +--------+
+ *        |    S   |
+ *        +--------+
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
+ *    be merged into existing bad range E. The result is,
+ *        +-------------+
+ *        |      S      |
+ *        +-------------+
+ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
+ *    the result is,
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
+ *    An extra slot from the bad blocks table will be allocated for S, and head
+ *    of E will move to end of the inserted range S. The result is,
+ *        +--------+----+
+ *        |    S   | E  |
+ *        +--------+----+
+ * 2.2) The setting range size == already set range size
+ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
+ *    be merged into existing bad range E. The result is,
+ *        +-------------+
+ *        |      S      |
+ *        +-------------+
+ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
+ *    the result is,
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
+      bad blocks range E. The result is,
+ *        +-------------+
+ *        |      S      |
+ *        +-------------+
+ * 2.3) The setting range size > already set range size
+ *        +-------------------+
+ *        |          S        |
+ *        +-------------------+
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ *    For such situation, the setting range S can be treated as two parts, the
+ *    first part (S1) is as same size as the already set range E, the second
+ *    part (S2) is the rest of setting range.
+ *        +-------------+-----+        +-------------+       +-----+
+ *        |    S1       | S2  |        |     S1      |       | S2  |
+ *        +-------------+-----+  ===>  +-------------+       +-----+
+ *        +-------------+              +-------------+
+ *        |      E      |              |      E      |
+ *        +-------------+              +-------------+
+ *    Now we only focus on how to handle the setting range S1 and already set
+ *    range E, which are already explained in 2.2), for the rest S2 it will be
+ *    handled later in next loop.
+ * 3) A setting range starts before the start LBA of an already set bad blocks
+ *    range.
+ *        +-------------+
+ *        |      S      |
+ *        +-------------+
+ *             +-------------+
+ *             |      E      |
+ *             +-------------+
+ *    For this situation, the setting range S can be divided into two parts, the
+ *    first (S1) ends at the start LBA of already set range E, the second part
+ *    (S2) starts exactly at a start LBA of the already set range E.
+ *        +----+---------+             +----+      +---------+
+ *        | S1 |    S2   |             | S1 |      |    S2   |
+ *        +----+---------+      ===>   +----+      +---------+
+ *             +-------------+                     +-------------+
+ *             |      E      |                     |      E      |
+ *             +-------------+                     +-------------+
+ *    Now only the first part S1 should be handled in this loop, which is in
+ *    similar condition as 1). The rest part S2 has exact same start LBA address
+ *    of the already set range E, they will be handled in next loop in one of
+ *    situations in 2).
+ * 4) A setting range starts after the start LBA of an already set bad blocks
+ *    range.
+ * 4.1) If the setting range S exactly matches the tail part of already set bad
+ *    blocks range E, like the following chart shows,
+ *            +---------+
+ *            |   S     |
+ *            +---------+
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
+ *    they will be merged into one, the result is,
+ *        +-------------+
+ *        |      S      |
+ *        +-------------+
+ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
+ *    request of S will be rejected, the result is,
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
+ *    overwrite the overlapped range of E, the result is,
+ *        +---+---------+
+ *        | E |    S    |
+ *        +---+---------+
+ * 4.2) If the setting range S stays in middle of an already set range E, like
+ *    the following chart shows,
+ *             +----+
+ *             | S  |
+ *             +----+
+ *        +--------------+
+ *        |       E      |
+ *        +--------------+
+ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
+ *    they will be merged into one, the result is,
+ *        +--------------+
+ *        |       S      |
+ *        +--------------+
+ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
+ *    request of S will be rejected, the result is also,
+ *        +--------------+
+ *        |       E      |
+ *        +--------------+
+ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
+ *    inserted into middle of E and split previous range E into two parts (E1
+ *    and E2), the result is,
+ *        +----+----+----+
+ *        | E1 |  S | E2 |
+ *        +----+----+----+
+ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
+ *    blocks range E. The range S starts after the start LBA of range E, and
+ *    ends after the end LBA of range E, as the following chart shows,
+ *            +-------------------+
+ *            |          S        |
+ *            +-------------------+
+ *        +-------------+
+ *        |      E      |
+ *        +-------------+
+ *    For this situation the range S can be divided into two parts, the first
+ *    part (S1) ends at end range E, and the second part (S2) has rest range of
+ *    origin S.
+ *            +---------+---------+            +---------+      +---------+
+ *            |    S1   |    S2   |            |    S1   |      |    S2   |
+ *            +---------+---------+  ===>      +---------+      +---------+
+ *        +-------------+                  +-------------+
+ *        |      E      |                  |      E      |
+ *        +-------------+                  +-------------+
+ *     Now in this loop the setting range S1 and already set range E can be
+ *     handled as the situations 4.1), the rest range S2 will be handled in next
+ *     loop and ignored in this loop.
+ * 5) A setting bad blocks range S is adjacent to one or more already set bad
+ *    blocks range(s), and they are all acked or unacked range.
+ * 5.1) Front merge: If the already set bad blocks range E is before setting
+ *    range S and they are adjacent,
+ *                +------+
+ *                |  S   |
+ *                +------+
+ *        +-------+
+ *        |   E   |
+ *        +-------+
+ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
+ *    values are same, the setting range S can front merges into range E. The
+ *    result is,
+ *        +--------------+
+ *        |       S      |
+ *        +--------------+
+ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
+ *    range S right after already set range E into the bad blocks table. The
+ *    result is,
+ *        +--------+------+
+ *        |   E    |   S  |
+ *        +--------+------+
+ * 6) Special cases which above conditions cannot handle
+ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
+ *        +-------------------------------------------------------+
+ *        |                           S                           |
+ *        +-------------------------------------------------------+
+ *        |<----- BB_MAX_LEN ----->|
+ *                                 +-----+     +-----+   +-----+
+ *                                 | E1  |     | E2  |   | E3  |
+ *                                 +-----+     +-----+   +-----+
+ *     In the above example, when the bad blocks table is full, inserting the
+ *     first part of setting range S will fail because no more available slot
+ *     can be allocated from bad blocks table. In this situation a proper
+ *     setting method should be go though all the setting bad blocks range and
+ *     look for chance to merge already set ranges into less ones. When there
+ *     is available slot from bad blocks table, re-try again to handle more
+ *     setting bad blocks ranges as many as possible.
+ *        +------------------------+
+ *        |          S3            |
+ *        +------------------------+
+ *        |<----- BB_MAX_LEN ----->|
+ *                                 +-----+-----+-----+---+-----+--+
+ *                                 |       S1        |     S2     |
+ *                                 +-----+-----+-----+---+-----+--+
+ *     The above chart shows although the first part (S3) cannot be inserted due
+ *     to no-space in bad blocks table, but the following E1, E2 and E3 ranges
+ *     can be merged with rest part of S into less range S1 and S2. Now there is
+ *     1 free slot in bad blocks table.
+ *        +------------------------+-----+-----+-----+---+-----+--+
+ *        |           S3           |       S1        |     S2     |
+ *        +------------------------+-----+-----+-----+---+-----+--+
+ *     Since the bad blocks table is not full anymore, re-try again for the
+ *     origin setting range S. Now the setting range S3 can be inserted into the
+ *     bad blocks table with previous freed slot from multiple ranges merge.
+ * 6.2) Front merge after overwrite
+ *    In the following example, in bad blocks table, E1 is an acked bad blocks
+ *    range and E2 is an unacked bad blocks range, therefore they are not able
+ *    to merge into a larger range. The setting bad blocks range S is acked,
+ *    therefore part of E2 can be overwritten by S.
+ *                      +--------+
+ *                      |    S   |                             acknowledged
+ *                      +--------+                         S:       1
+ *              +-------+-------------+                   E1:       1
+ *              |   E1  |    E2       |                   E2:       0
+ *              +-------+-------------+
+ *     With previous simplified routines, after overwriting part of E2 with S,
+ *     the bad blocks table should be (E3 is remaining part of E2 which is not
+ *     overwritten by S),
+ *                                                             acknowledged
+ *              +-------+--------+----+                    S:       1
+ *              |   E1  |    S   | E3 |                   E1:       1
+ *              +-------+--------+----+                   E3:       0
+ *     The above result is correct but not perfect. Range E1 and S in the bad
+ *     blocks table are all acked, merging them into a larger one range may
+ *     occupy less bad blocks table space and make badblocks_check() faster.
+ *     Therefore in such situation, after overwriting range S, the previous range
+ *     E1 should be checked for possible front combination. Then the ideal
+ *     result can be,
+ *              +----------------+----+                        acknowledged
+ *              |       E1       | E3 |                   E1:       1
+ *              +----------------+----+                   E3:       0
+ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
+ *    range S and they are adjacent. Normally we don't need to care about this
+ *    because front merge handles this while going though range S from head to
+ *    tail, except for the tail part of range S. When the setting range S are
+ *    fully handled, all the above simplified routine doesn't check whether the
+ *    tail LBA of range S is adjacent to the next already set range and not
+ *    merge them even it is possible.
+ *        +------+
+ *        |  S   |
+ *        +------+
+ *               +-------+
+ *               |   E   |
+ *               +-------+
+ *    For the above special situation, when the setting range S are all handled
+ *    and the loop ends, an extra check is necessary for whether next already
+ *    set range E is right after S and mergeable.
+ * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
+ *    values are same, the setting range S can behind merges into range E. The
+ *    result is,
+ *        +--------------+
+ *        |       S      |
+ *        +--------------+
+ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
+ *     S in front of the already set range E in the bad blocks table. The result
+ *     is,
+ *        +------+-------+
+ *        |  S   |   E   |
+ *        +------+-------+
+ *
+ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
+ * the bad block range setting conditions. Maybe there is some rare corner case
+ * is not considered and optimized, it won't hurt if badblocks_set() fails due
+ * to no space, or some ranges are not merged to save bad blocks table space.
+ *
+ * Inside badblocks_set() each loop starts by jumping to re_insert label, every
+ * time for the new loop prev_badblocks() is called to find an already set range
+ * which starts before or at current setting range. Since the setting bad blocks
+ * range is handled from head to tail, most of the cases it is unnecessary to do
+ * the binary search inside prev_badblocks(), it is possible to provide a hint
+ * to prev_badblocks() for a fast path, then the expensive binary search can be
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all rested calls to prev_badblocks() can go into the fast path and
+ * return correct bad blocks table index immediately.
+ *
+ *
+ * Clearing a bad blocks range from the bad block table has similar idea as
+ * setting does, but much more simpler. The only thing needs to be noticed is
+ * when the clearing range hits middle of a bad block range, the existing bad
+ * block range will split into two, and one more item should be added into the
+ * bad block table. The simplified situations to be considered are, (The already
+ * set bad blocks ranges in bad block table are naming with prefix E, and the
+ * clearing bad blocks range is naming with prefix C)
+ *
+ * 1) A clearing range is not overlapped to any already set ranges in bad block
+ *    table.
+ *    +-----+         |          +-----+         |          +-----+
+ *    |  C  |         |          |  C  |         |          |  C  |
+ *    +-----+         or         +-----+         or         +-----+
+ *            +---+   |   +----+         +----+  |  +---+
+ *            | E |   |   | E1 |         | E2 |  |  | E |
+ *            +---+   |   +----+         +----+  |  +---+
+ *    For the above situations, no bad block to be cleared and no failure
+ *    happens, simply returns 0.
+ * 2) The clearing range hits middle of an already setting bad blocks range in
+ *    the bad block table.
+ *            +---+
+ *            | C |
+ *            +---+
+ *     +-----------------+
+ *     |         E       |
+ *     +-----------------+
+ *    In this situation if the bad block table is not full, the range E will be
+ *    split into two ranges E1 and E2. The result is,
+ *     +------+   +------+
+ *     |  E1  |   |  E2  |
+ *     +------+   +------+
+ * 3) The clearing range starts exactly at same LBA as an already set bad block range
+ *    from the bad block table.
+ * 3.1) Partially covered at head part
+ *         +------------+
+ *         |     C      |
+ *         +------------+
+ *         +-----------------+
+ *         |         E       |
+ *         +-----------------+
+ *    For this situation, the overlapped already set range will update the
+ *    start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
+ *    item deleted from bad block table. The result is,
+ *                      +----+
+ *                      | E1 |
+ *                      +----+
+ * 3.2) Exact fully covered
+ *         +-----------------+
+ *         |         C       |
+ *         +-----------------+
+ *         +-----------------+
+ *         |         E       |
+ *         +-----------------+
+ *    For this situation the whole bad blocks range E will be cleared and its
+ *    corresponded item is deleted from the bad block table.
+ * 4) The clearing range exactly ends at same LBA as an already set bad block
+ *    range.
+ *                   +-------+
+ *                   |   C   |
+ *                   +-------+
+ *         +-----------------+
+ *         |         E       |
+ *         +-----------------+
+ *    For the above situation, the already set range E is updated to shrink its
+ *    end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
+ *    The result is,
+ *         +---------+
+ *         |    E    |
+ *         +---------+
+ * 5) The clearing range is partially overlapped with an already set bad block
+ *    range from the bad block table.
+ * 5.1) The already set bad block range is front overlapped with the clearing
+ *    range.
+ *         +----------+
+ *         |     C    |
+ *         +----------+
+ *              +------------+
+ *              |      E     |
+ *              +------------+
+ *   For such situation, the clearing range C can be treated as two parts. The
+ *   first part ends at the start LBA of range E, and the second part starts at
+ *   same LBA of range E.
+ *         +----+-----+               +----+   +-----+
+ *         | C1 | C2  |               | C1 |   | C2  |
+ *         +----+-----+         ===>  +----+   +-----+
+ *              +------------+                 +------------+
+ *              |      E     |                 |      E     |
+ *              +------------+                 +------------+
+ *   Now the first part C1 can be handled as condition 1), and the second part C2 can be
+ *   handled as condition 3.1) in next loop.
+ * 5.2) The already set bad block range is behind overlaopped with the clearing
+ *   range.
+ *                 +----------+
+ *                 |     C    |
+ *                 +----------+
+ *         +------------+
+ *         |      E     |
+ *         +------------+
+ *   For such situation, the clearing range C can be treated as two parts. The
+ *   first part C1 ends at same end LBA of range E, and the second part starts
+ *   at end LBA of range E.
+ *                 +----+-----+                 +----+    +-----+
+ *                 | C1 | C2  |                 | C1 |    | C2  |
+ *                 +----+-----+  ===>           +----+    +-----+
+ *         +------------+               +------------+
+ *         |      E     |               |      E     |
+ *         +------------+               +------------+
+ *   Now the first part clearing range C1 can be handled as condition 4), and
+ *   the second part clearing range C2 can be handled as condition 1) in next
+ *   loop.
+ *
+ *   All bad blocks range clearing can be simplified into the above 5 situations
+ *   by only handling the head part of the clearing range in each run of the
+ *   while-loop. The idea is similar to bad blocks range setting but much
+ *   simpler.
  */
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
-			sector_t *first_bad, int *bad_sectors)
+
+/*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+ * table.
+ */
+static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
 {
-	int hi;
-	int lo;
+	int hint_end = hint + 2;
 	u64 *p = bb->page;
-	int rv;
-	sector_t target = s + sectors;
-	unsigned seq;
+	int ret = -1;
 
-	if (bb->shift > 0) {
-		/* round the start down, and the end up */
-		s >>= bb->shift;
-		target += (1<<bb->shift) - 1;
-		target >>= bb->shift;
+	while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
+	       (BB_OFFSET(p[hint]) <= s)) {
+		if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
+			ret = hint;
+			break;
+		}
+		hint++;
+	}
+
+	return ret;
+}
+
+/*
+ * Find the range starts at-or-before bad->start. If 'hint' is provided
+ * (hint >= 0) then search in the bad table from hint firstly. It is
+ * very probably the wanted bad range can be found from the hint index,
+ * then the unnecessary while-loop iteration can be avoided.
+ */
+static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
+			  int hint)
+{
+	sector_t s = bad->start;
+	int ret = -1;
+	int lo, hi;
+	u64 *p;
+
+	if (!bb->count)
+		goto out;
+
+	if (hint >= 0) {
+		ret = prev_by_hint(bb, s, hint);
+		if (ret >= 0)
+			goto out;
 	}
-	/* 'target' is now the first block after the bad range */
 
-retry:
-	seq = read_seqbegin(&bb->lock);
 	lo = 0;
-	rv = 0;
 	hi = bb->count;
+	p = bb->page;
 
-	/* Binary search between lo and hi for 'target'
-	 * i.e. for the last range that starts before 'target'
-	 */
-	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
-	 * are known not to be the last range before target.
-	 * VARIANT: hi-lo is the number of possible
-	 * ranges, and decreases until it reaches 1
-	 */
+	/* The following bisect search might be unnecessary */
+	if (BB_OFFSET(p[lo]) > s)
+		return -1;
+	if (BB_OFFSET(p[hi - 1]) <= s)
+		return hi - 1;
+
+	/* Do bisect search in bad table */
 	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
+		int mid = (lo + hi)/2;
 		sector_t a = BB_OFFSET(p[mid]);
 
-		if (a < target)
-			/* This could still be the one, earlier ranges
-			 * could not.
-			 */
+		if (a == s) {
+			ret = mid;
+			goto out;
+		}
+
+		if (a < s)
 			lo = mid;
 		else
-			/* This and later ranges are definitely out. */
 			hi = mid;
 	}
-	/* 'lo' might be the last that started before target, but 'hi' isn't */
-	if (hi > lo) {
-		/* need to check all range that end after 's' to see if
-		 * any are unacknowledged.
+
+	if (BB_OFFSET(p[lo]) <= s)
+		ret = lo;
+out:
+	return ret;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be backward merged
+ * with the bad range (from the bad table) index by 'behind'.
+ */
+static bool can_merge_behind(struct badblocks *bb,
+			     struct badblocks_context *bad, int behind)
+{
+	sector_t sectors = bad->len;
+	sector_t s = bad->start;
+	u64 *p = bb->page;
+
+	if ((s < BB_OFFSET(p[behind])) &&
+	    ((s + sectors) >= BB_OFFSET(p[behind])) &&
+	    ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
+	    BB_ACK(p[behind]) == bad->ack)
+		return true;
+	return false;
+}
+
+/*
+ * Do backward merge for range indicated by 'bad' and the bad range
+ * (from the bad table) indexed by 'behind'. The return value is merged
+ * sectors from bad->len.
+ */
+static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
+			int behind)
+{
+	sector_t sectors = bad->len;
+	sector_t s = bad->start;
+	u64 *p = bb->page;
+	int merged = 0;
+
+	WARN_ON(s >= BB_OFFSET(p[behind]));
+	WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
+
+	if (s < BB_OFFSET(p[behind])) {
+		merged = BB_OFFSET(p[behind]) - s;
+		p[behind] =  BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
+
+		WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
+	}
+
+	return merged;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be forward
+ * merged with the bad range (from the bad table) indexed by 'prev'.
+ */
+static bool can_merge_front(struct badblocks *bb, int prev,
+			    struct badblocks_context *bad)
+{
+	sector_t s = bad->start;
+	u64 *p = bb->page;
+
+	if (BB_ACK(p[prev]) == bad->ack &&
+	    (s < BB_END(p[prev]) ||
+	     (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
+		return true;
+	return false;
+}
+
+/*
+ * Do forward merge for range indicated by 'bad' and the bad range
+ * (from bad table) indexed by 'prev'. The return value is sectors
+ * merged from bad->len.
+ */
+static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
+{
+	sector_t sectors = bad->len;
+	sector_t s = bad->start;
+	u64 *p = bb->page;
+	int merged = 0;
+
+	WARN_ON(s > BB_END(p[prev]));
+
+	if (s < BB_END(p[prev])) {
+		merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
+	} else {
+		merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
+		if ((prev + 1) < bb->count &&
+		    merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
+			merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
+		}
+
+		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+				  BB_LEN(p[prev]) + merged, bad->ack);
+	}
+
+	return merged;
+}
+
+/*
+ * 'Combine' is a special case which can_merge_front() is not able to
+ * handle: If a bad range (indexed by 'prev' from bad table) exactly
+ * starts as bad->start, and the bad range ahead of 'prev' (indexed by
+ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
+ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
+ * these two bad range (from bad table) can be combined.
+ *
+ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
+ * table can be combined.
+ */
+static bool can_combine_front(struct badblocks *bb, int prev,
+			      struct badblocks_context *bad)
+{
+	u64 *p = bb->page;
+
+	if ((prev > 0) &&
+	    (BB_OFFSET(p[prev]) == bad->start) &&
+	    (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
+	    (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
+	    (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
+		return true;
+	return false;
+}
+
+/*
+ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
+ * table) into one larger bad range, and the new range is indexed by
+ * 'prev - 1'.
+ * The caller of front_combine() will decrease bb->count, therefore
+ * it is unnecessary to clear p[perv] after front merge.
+ */
+static void front_combine(struct badblocks *bb, int prev)
+{
+	u64 *p = bb->page;
+
+	p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
+			      BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
+			      BB_ACK(p[prev]));
+	if ((prev + 1) < bb->count)
+		memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly forward
+ * overlapped with the bad range (from bad table) indexed by 'front'.
+ * Exactly forward overlap means the bad range (from bad table) indexed
+ * by 'prev' does not cover the whole range indicated by 'bad'.
+ */
+static bool overlap_front(struct badblocks *bb, int front,
+			  struct badblocks_context *bad)
+{
+	u64 *p = bb->page;
+
+	if (bad->start >= BB_OFFSET(p[front]) &&
+	    bad->start < BB_END(p[front]))
+		return true;
+	return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly backward
+ * overlapped with the bad range (from bad table) indexed by 'behind'.
+ */
+static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
+			   int behind)
+{
+	u64 *p = bb->page;
+
+	if (bad->start < BB_OFFSET(p[behind]) &&
+	    (bad->start + bad->len) > BB_OFFSET(p[behind]))
+		return true;
+	return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can overwrite the bad
+ * range (from bad table) indexed by 'prev'.
+ *
+ * The range indicated by 'bad' can overwrite the bad range indexed by
+ * 'prev' when,
+ * 1) The whole range indicated by 'bad' can cover partial or whole bad
+ *    range (from bad table) indexed by 'prev'.
+ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
+ *    range 'prev'.
+ *
+ * If the overwriting doesn't cover the whole bad range (from bad table)
+ * indexed by 'prev', new range might be split from existing bad range,
+ * 1) The overwrite covers head or tail part of existing bad range, 1
+ *    extra bad range will be split and added into the bad table.
+ * 2) The overwrite covers middle of existing bad range, 2 extra bad
+ *    ranges will be split (ahead and after the overwritten range) and
+ *    added into the bad table.
+ * The number of extra split ranges of the overwriting is stored in
+ * 'extra' and returned for the caller.
+ */
+static bool can_front_overwrite(struct badblocks *bb, int prev,
+				struct badblocks_context *bad, int *extra)
+{
+	u64 *p = bb->page;
+	int len;
+
+	WARN_ON(!overlap_front(bb, prev, bad));
+
+	if (BB_ACK(p[prev]) >= bad->ack)
+		return false;
+
+	if (BB_END(p[prev]) <= (bad->start + bad->len)) {
+		len = BB_END(p[prev]) - bad->start;
+		if (BB_OFFSET(p[prev]) == bad->start)
+			*extra = 0;
+		else
+			*extra = 1;
+
+		bad->len = len;
+	} else {
+		if (BB_OFFSET(p[prev]) == bad->start)
+			*extra = 1;
+		else
+		/*
+		 * prev range will be split into two, beside the overwritten
+		 * one, an extra slot needed from bad table.
 		 */
-		while (lo >= 0 &&
-		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
-			if (BB_OFFSET(p[lo]) < target) {
-				/* starts before the end, and finishes after
-				 * the start, so they must overlap
-				 */
-				if (rv != -1 && BB_ACK(p[lo]))
-					rv = 1;
-				else
-					rv = -1;
-				*first_bad = BB_OFFSET(p[lo]);
-				*bad_sectors = BB_LEN(p[lo]);
-			}
-			lo--;
+			*extra = 2;
+	}
+
+	if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+		return false;
+
+	return true;
+}
+
+/*
+ * Do the overwrite from the range indicated by 'bad' to the bad range
+ * (from bad table) indexed by 'prev'.
+ * The previously called can_front_overwrite() will provide how many
+ * extra bad range(s) might be split and added into the bad table. All
+ * the splitting cases in the bad table will be handled here.
+ */
+static int front_overwrite(struct badblocks *bb, int prev,
+			   struct badblocks_context *bad, int extra)
+{
+	u64 *p = bb->page;
+	sector_t orig_end = BB_END(p[prev]);
+	int orig_ack = BB_ACK(p[prev]);
+
+	switch (extra) {
+	case 0:
+		p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
+				  bad->ack);
+		break;
+	case 1:
+		if (BB_OFFSET(p[prev]) == bad->start) {
+			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+					  bad->len, bad->ack);
+			memmove(p + prev + 2, p + prev + 1,
+				(bb->count - prev - 1) * 8);
+			p[prev + 1] = BB_MAKE(bad->start + bad->len,
+					      orig_end - BB_END(p[prev]),
+					      orig_ack);
+		} else {
+			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+					  bad->start - BB_OFFSET(p[prev]),
+					  orig_ack);
+			/*
+			 * prev +2 -> prev + 1 + 1, which is for,
+			 * 1) prev + 1: the slot index of the previous one
+			 * 2) + 1: one more slot for extra being 1.
+			 */
+			memmove(p + prev + 2, p + prev + 1,
+				(bb->count - prev - 1) * 8);
+			p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
 		}
+		break;
+	case 2:
+		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+				  bad->start - BB_OFFSET(p[prev]),
+				  orig_ack);
+		/*
+		 * prev + 3 -> prev + 1 + 2, which is for,
+		 * 1) prev + 1: the slot index of the previous one
+		 * 2) + 2: two more slots for extra being 2.
+		 */
+		memmove(p + prev + 3, p + prev + 1,
+			(bb->count - prev - 1) * 8);
+		p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
+		p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
+				      orig_end - BB_END(p[prev + 1]),
+				      orig_ack);
+		break;
+	default:
+		break;
 	}
 
-	if (read_seqretry(&bb->lock, seq))
-		goto retry;
+	return bad->len;
+}
 
-	return rv;
+/*
+ * Explicitly insert a range indicated by 'bad' to the bad table, where
+ * the location is indexed by 'at'.
+ */
+static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
+{
+	u64 *p = bb->page;
+	int len;
+
+	WARN_ON(badblocks_full(bb));
+
+	len = min_t(sector_t, bad->len, BB_MAX_LEN);
+	if (at < bb->count)
+		memmove(p + at + 1, p + at, (bb->count - at) * 8);
+	p[at] = BB_MAKE(bad->start, len, bad->ack);
+
+	return len;
 }
-EXPORT_SYMBOL_GPL(badblocks_check);
 
 static void badblocks_update_acked(struct badblocks *bb)
 {
+	bool unacked = false;
 	u64 *p = bb->page;
 	int i;
-	bool unacked = false;
 
 	if (!bb->unacked_exist)
 		return;
@@ -144,281 +855,600 @@ static void badblocks_update_acked(struct badblocks *bb)
 		bb->unacked_exist = 0;
 }
 
-/**
- * badblocks_set() - Add a range of bad blocks to the table.
- * @bb:		the badblocks structure that holds all badblock information
- * @s:		first sector to mark as bad
- * @sectors:	number of sectors to mark as bad
- * @acknowledged: weather to mark the bad sectors as acknowledged
- *
- * This might extend the table, or might contract it if two adjacent ranges
- * can be merged. We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- *
- * Return:
- *  0: success
- *  1: failed to set badblocks (out of space)
- */
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
-			int acknowledged)
+/* Do exact work to set bad block range into the bad block table */
+static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+			  int acknowledged)
 {
-	u64 *p;
-	int lo, hi;
-	int rv = 0;
+	int retried = 0, space_desired = 0;
+	int orig_len, len = 0, added = 0;
+	struct badblocks_context bad;
+	int prev = -1, hint = -1;
+	sector_t orig_start;
 	unsigned long flags;
+	int rv = 0;
+	u64 *p;
 
 	if (bb->shift < 0)
 		/* badblocks are disabled */
 		return 1;
 
+	if (sectors == 0)
+		/* Invalid sectors number */
+		return 1;
+
 	if (bb->shift) {
 		/* round the start down, and the end up */
 		sector_t next = s + sectors;
 
-		s >>= bb->shift;
-		next += (1<<bb->shift) - 1;
-		next >>= bb->shift;
+		rounddown(s, bb->shift);
+		roundup(next, bb->shift);
 		sectors = next - s;
 	}
 
 	write_seqlock_irqsave(&bb->lock, flags);
 
+	orig_start = s;
+	orig_len = sectors;
+	bad.ack = acknowledged;
 	p = bb->page;
-	lo = 0;
-	hi = bb->count;
-	/* Find the last range that starts at-or-before 's' */
-	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
-		sector_t a = BB_OFFSET(p[mid]);
 
-		if (a <= s)
-			lo = mid;
-		else
-			hi = mid;
+re_insert:
+	bad.start = s;
+	bad.len = sectors;
+	len = 0;
+
+	if (badblocks_empty(bb)) {
+		len = insert_at(bb, 0, &bad);
+		bb->count++;
+		added++;
+		goto update_sectors;
 	}
-	if (hi > lo && BB_OFFSET(p[lo]) > s)
-		hi = lo;
 
-	if (hi > lo) {
-		/* we found a range that might merge with the start
-		 * of our new range
-		 */
-		sector_t a = BB_OFFSET(p[lo]);
-		sector_t e = a + BB_LEN(p[lo]);
-		int ack = BB_ACK(p[lo]);
-
-		if (e >= s) {
-			/* Yes, we can merge with a previous range */
-			if (s == a && s + sectors >= e)
-				/* new range covers old */
-				ack = acknowledged;
-			else
-				ack = ack && acknowledged;
-
-			if (e < s + sectors)
-				e = s + sectors;
-			if (e - a <= BB_MAX_LEN) {
-				p[lo] = BB_MAKE(a, e-a, ack);
-				s = e;
+	prev = prev_badblocks(bb, &bad, hint);
+
+	/* start before all badblocks */
+	if (prev < 0) {
+		if (!badblocks_full(bb)) {
+			/* insert on the first */
+			if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+				bad.len = BB_OFFSET(p[0]) - bad.start;
+			len = insert_at(bb, 0, &bad);
+			bb->count++;
+			added++;
+			hint = 0;
+			goto update_sectors;
+		}
+
+		/* No sapce, try to merge */
+		if (overlap_behind(bb, &bad, 0)) {
+			if (can_merge_behind(bb, &bad, 0)) {
+				len = behind_merge(bb, &bad, 0);
+				added++;
 			} else {
-				/* does not all fit in one range,
-				 * make p[lo] maximal
-				 */
-				if (BB_LEN(p[lo]) != BB_MAX_LEN)
-					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
-				s = a + BB_MAX_LEN;
+				len = BB_OFFSET(p[0]) - s;
+				space_desired = 1;
 			}
-			sectors = e - s;
+			hint = 0;
+			goto update_sectors;
 		}
+
+		/* no table space and give up */
+		goto out;
 	}
-	if (sectors && hi < bb->count) {
-		/* 'hi' points to the first range that starts after 's'.
-		 * Maybe we can merge with the start of that range
-		 */
-		sector_t a = BB_OFFSET(p[hi]);
-		sector_t e = a + BB_LEN(p[hi]);
-		int ack = BB_ACK(p[hi]);
-
-		if (a <= s + sectors) {
-			/* merging is possible */
-			if (e <= s + sectors) {
-				/* full overlap */
-				e = s + sectors;
-				ack = acknowledged;
-			} else
-				ack = ack && acknowledged;
-
-			a = s;
-			if (e - a <= BB_MAX_LEN) {
-				p[hi] = BB_MAKE(a, e-a, ack);
-				s = e;
-			} else {
-				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
-				s = a + BB_MAX_LEN;
+
+	/* in case p[prev-1] can be merged with p[prev] */
+	if (can_combine_front(bb, prev, &bad)) {
+		front_combine(bb, prev);
+		bb->count--;
+		added++;
+		hint = prev;
+		goto update_sectors;
+	}
+
+	if (overlap_front(bb, prev, &bad)) {
+		if (can_merge_front(bb, prev, &bad)) {
+			len = front_merge(bb, prev, &bad);
+			added++;
+		} else {
+			int extra = 0;
+
+			if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+				len = min_t(sector_t,
+					    BB_END(p[prev]) - s, sectors);
+				hint = prev;
+				goto update_sectors;
+			}
+
+			len = front_overwrite(bb, prev, &bad, extra);
+			added++;
+			bb->count += extra;
+
+			if (can_combine_front(bb, prev, &bad)) {
+				front_combine(bb, prev);
+				bb->count--;
 			}
-			sectors = e - s;
-			lo = hi;
-			hi++;
 		}
+		hint = prev;
+		goto update_sectors;
+	}
+
+	if (can_merge_front(bb, prev, &bad)) {
+		len = front_merge(bb, prev, &bad);
+		added++;
+		hint = prev;
+		goto update_sectors;
 	}
-	if (sectors == 0 && hi < bb->count) {
-		/* we might be able to combine lo and hi */
-		/* Note: 's' is at the end of 'lo' */
-		sector_t a = BB_OFFSET(p[hi]);
-		int lolen = BB_LEN(p[lo]);
-		int hilen = BB_LEN(p[hi]);
-		int newlen = lolen + hilen - (s - a);
-
-		if (s >= a && newlen < BB_MAX_LEN) {
-			/* yes, we can combine them */
-			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
-
-			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
-			memmove(p + hi, p + hi + 1,
-				(bb->count - hi - 1) * 8);
-			bb->count--;
+
+	/* if no space in table, still try to merge in the covered range */
+	if (badblocks_full(bb)) {
+		/* skip the cannot-merge range */
+		if (((prev + 1) < bb->count) &&
+		    overlap_behind(bb, &bad, prev + 1) &&
+		    ((s + sectors) >= BB_END(p[prev + 1]))) {
+			len = BB_END(p[prev + 1]) - s;
+			hint = prev + 1;
+			goto update_sectors;
 		}
+
+		/* no retry any more */
+		len = sectors;
+		space_desired = 1;
+		hint = -1;
+		goto update_sectors;
 	}
-	while (sectors) {
-		/* didn't merge (it all).
-		 * Need to add a range just before 'hi'
-		 */
-		if (bb->count >= MAX_BADBLOCKS) {
-			/* No room for more */
-			rv = 1;
-			break;
-		} else {
-			int this_sectors = sectors;
 
-			memmove(p + hi + 1, p + hi,
-				(bb->count - hi) * 8);
-			bb->count++;
+	/* cannot merge and there is space in bad table */
+	if ((prev + 1) < bb->count &&
+	    overlap_behind(bb, &bad, prev + 1))
+		bad.len = min_t(sector_t,
+				bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
 
-			if (this_sectors > BB_MAX_LEN)
-				this_sectors = BB_MAX_LEN;
-			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
-			sectors -= this_sectors;
-			s += this_sectors;
-		}
+	len = insert_at(bb, prev + 1, &bad);
+	bb->count++;
+	added++;
+	hint = prev + 1;
+
+update_sectors:
+	s += len;
+	sectors -= len;
+
+	if (sectors > 0)
+		goto re_insert;
+
+	WARN_ON(sectors < 0);
+
+	/*
+	 * Check whether the following already set range can be
+	 * merged. (prev < 0) condition is not handled here,
+	 * because it's already complicated enough.
+	 */
+	if (prev >= 0 &&
+	    (prev + 1) < bb->count &&
+	    BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+	    (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+	    BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+				  BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+				  BB_ACK(p[prev]));
+
+		if ((prev + 2) < bb->count)
+			memmove(p + prev + 1, p + prev + 2,
+				(bb->count -  (prev + 2)) * 8);
+		bb->count--;
+	}
+
+	if (space_desired && !badblocks_full(bb)) {
+		s = orig_start;
+		sectors = orig_len;
+		space_desired = 0;
+		if (retried++ < 3)
+			goto re_insert;
+	}
+
+out:
+	if (added) {
+		set_changed(bb);
+
+		if (!acknowledged)
+			bb->unacked_exist = 1;
+		else
+			badblocks_update_acked(bb);
 	}
 
-	bb->changed = 1;
-	if (!acknowledged)
-		bb->unacked_exist = 1;
-	else
-		badblocks_update_acked(bb);
 	write_sequnlock_irqrestore(&bb->lock, flags);
 
+	if (!added)
+		rv = 1;
+
 	return rv;
 }
-EXPORT_SYMBOL_GPL(badblocks_set);
 
-/**
- * badblocks_clear() - Remove a range of bad blocks to the table.
- * @bb:		the badblocks structure that holds all badblock information
- * @s:		first sector to mark as bad
- * @sectors:	number of sectors to mark as bad
- *
- * This may involve extending the table if we spilt a region,
- * but it must not fail.  So if the table becomes full, we just
- * drop the remove request.
- *
- * Return:
- *  0: success
- *  1: failed to clear badblocks
+/*
+ * Clear the bad block range from bad block table which is front overlapped
+ * with the clearing range. The return value is how many sectors from an
+ * already set bad block range are cleared. If the whole bad block range is
+ * covered by the clearing range and fully cleared, 'delete' is set as 1 for
+ * the caller to reduce bb->count.
  */
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static int front_clear(struct badblocks *bb, int prev,
+		       struct badblocks_context *bad, int *deleted)
 {
-	u64 *p;
-	int lo, hi;
-	sector_t target = s + sectors;
+	sector_t sectors = bad->len;
+	sector_t s = bad->start;
+	u64 *p = bb->page;
+	int cleared = 0;
+
+	*deleted = 0;
+	if (s == BB_OFFSET(p[prev])) {
+		if (BB_LEN(p[prev]) > sectors) {
+			p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
+					  BB_LEN(p[prev]) - sectors,
+					  BB_ACK(p[prev]));
+			cleared = sectors;
+		} else {
+			/* BB_LEN(p[prev]) <= sectors */
+			cleared = BB_LEN(p[prev]);
+			if ((prev + 1) < bb->count)
+				memmove(p + prev, p + prev + 1,
+				       (bb->count - prev - 1) * 8);
+			*deleted = 1;
+		}
+	} else if (s > BB_OFFSET(p[prev])) {
+		if (BB_END(p[prev]) <= (s + sectors)) {
+			cleared = BB_END(p[prev]) - s;
+			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+					  s - BB_OFFSET(p[prev]),
+					  BB_ACK(p[prev]));
+		} else {
+			/* Splitting is handled in front_splitting_clear() */
+			BUG();
+		}
+	}
+
+	return cleared;
+}
+
+/*
+ * Handle the condition that the clearing range hits middle of an already set
+ * bad block range from bad block table. In this condition the existing bad
+ * block range is split into two after the middle part is cleared.
+ */
+static int front_splitting_clear(struct badblocks *bb, int prev,
+				  struct badblocks_context *bad)
+{
+	u64 *p = bb->page;
+	u64 end = BB_END(p[prev]);
+	int ack = BB_ACK(p[prev]);
+	sector_t sectors = bad->len;
+	sector_t s = bad->start;
+
+	p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+			  s - BB_OFFSET(p[prev]),
+			  ack);
+	memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
+	p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
+	return sectors;
+}
+
+/* Do the exact work to clear bad block range from the bad block table */
+static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+	struct badblocks_context bad;
+	int prev = -1, hint = -1;
+	int len = 0, cleared = 0;
 	int rv = 0;
+	u64 *p;
+
+	if (bb->shift < 0)
+		/* badblocks are disabled */
+		return 1;
+
+	if (sectors == 0)
+		/* Invalid sectors number */
+		return 1;
+
+	if (bb->shift) {
+		sector_t target;
 
-	if (bb->shift > 0) {
 		/* When clearing we round the start up and the end down.
 		 * This should not matter as the shift should align with
 		 * the block size and no rounding should ever be needed.
 		 * However it is better the think a block is bad when it
 		 * isn't than to think a block is not bad when it is.
 		 */
-		s += (1<<bb->shift) - 1;
-		s >>= bb->shift;
-		target >>= bb->shift;
+		target = s + sectors;
+		roundup(s, bb->shift);
+		rounddown(target, bb->shift);
+		sectors = target - s;
 	}
 
 	write_seqlock_irq(&bb->lock);
 
+	bad.ack = true;
 	p = bb->page;
-	lo = 0;
-	hi = bb->count;
-	/* Find the last range that starts before 'target' */
-	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
-		sector_t a = BB_OFFSET(p[mid]);
 
-		if (a < target)
-			lo = mid;
-		else
-			hi = mid;
+re_clear:
+	bad.start = s;
+	bad.len = sectors;
+
+	if (badblocks_empty(bb)) {
+		len = sectors;
+		cleared++;
+		goto update_sectors;
 	}
-	if (hi > lo) {
-		/* p[lo] is the last range that could overlap the
-		 * current range.  Earlier ranges could also overlap,
-		 * but only this one can overlap the end of the range.
-		 */
-		if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
-		    (BB_OFFSET(p[lo]) < target)) {
-			/* Partial overlap, leave the tail of this range */
-			int ack = BB_ACK(p[lo]);
-			sector_t a = BB_OFFSET(p[lo]);
-			sector_t end = a + BB_LEN(p[lo]);
-
-			if (a < s) {
-				/* we need to split this range */
-				if (bb->count >= MAX_BADBLOCKS) {
-					rv = -ENOSPC;
-					goto out;
-				}
-				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
-				bb->count++;
-				p[lo] = BB_MAKE(a, s-a, ack);
-				lo++;
-			}
-			p[lo] = BB_MAKE(target, end - target, ack);
-			/* there is no longer an overlap */
-			hi = lo;
-			lo--;
-		}
-		while (lo >= 0 &&
-		       (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
-		       (BB_OFFSET(p[lo]) < target)) {
-			/* This range does overlap */
-			if (BB_OFFSET(p[lo]) < s) {
-				/* Keep the early parts of this range. */
-				int ack = BB_ACK(p[lo]);
-				sector_t start = BB_OFFSET(p[lo]);
-
-				p[lo] = BB_MAKE(start, s - start, ack);
-				/* now low doesn't overlap, so.. */
-				break;
-			}
-			lo--;
+
+
+	prev = prev_badblocks(bb, &bad, hint);
+
+	/* Start before all badblocks */
+	if (prev < 0) {
+		if (overlap_behind(bb, &bad, 0)) {
+			len = BB_OFFSET(p[0]) - s;
+			hint = 0;
+		} else {
+			len = sectors;
 		}
-		/* 'lo' is strictly before, 'hi' is strictly after,
-		 * anything between needs to be discarded
+		/*
+		 * Both situations are to clear non-bad range,
+		 * should be treated as successful
 		 */
-		if (hi - lo > 1) {
-			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
-			bb->count -= (hi - lo - 1);
+		cleared++;
+		goto update_sectors;
+	}
+
+	/* Start after all badblocks */
+	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+		len = sectors;
+		cleared++;
+		goto update_sectors;
+	}
+
+	/* Clear will split a bad record but the table is full */
+	if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
+	    (BB_END(p[prev]) > (bad.start + sectors))) {
+		len = sectors;
+		goto update_sectors;
+	}
+
+	if (overlap_front(bb, prev, &bad)) {
+		if ((BB_OFFSET(p[prev]) < bad.start) &&
+		    (BB_END(p[prev]) > (bad.start + bad.len))) {
+			/* Splitting */
+			if ((bb->count + 1) < MAX_BADBLOCKS) {
+				len = front_splitting_clear(bb, prev, &bad);
+				bb->count += 1;
+				cleared++;
+			} else {
+				/* No space to split, give up */
+				len = sectors;
+			}
+		} else {
+			int deleted = 0;
+
+			len = front_clear(bb, prev, &bad, &deleted);
+			bb->count -= deleted;
+			cleared++;
+			hint = prev;
 		}
+
+		goto update_sectors;
+	}
+
+	/* Not front overlap, but behind overlap */
+	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+		len = BB_OFFSET(p[prev + 1]) - bad.start;
+		hint = prev + 1;
+		/* Clear non-bad range should be treated as successful */
+		cleared++;
+		goto update_sectors;
+	}
+
+	/* Not cover any badblocks range in the table */
+	len = sectors;
+	/* Clear non-bad range should be treated as successful */
+	cleared++;
+
+update_sectors:
+	s += len;
+	sectors -= len;
+
+	if (sectors > 0)
+		goto re_clear;
+
+	WARN_ON(sectors < 0);
+
+	if (cleared) {
+		badblocks_update_acked(bb);
+		set_changed(bb);
 	}
 
-	badblocks_update_acked(bb);
-	bb->changed = 1;
-out:
 	write_sequnlock_irq(&bb->lock);
+
+	if (!cleared)
+		rv = 1;
+
 	return rv;
 }
+
+/* Do the exact work to check bad blocks range from the bad block table */
+static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+			    sector_t *first_bad, int *bad_sectors)
+{
+	int unacked_badblocks, acked_badblocks;
+	int prev = -1, hint = -1, set = 0;
+	struct badblocks_context bad;
+	unsigned int seq;
+	int len, rv;
+	u64 *p;
+
+	WARN_ON(bb->shift < 0 || sectors == 0);
+
+	if (bb->shift > 0) {
+		sector_t target;
+
+		/* round the start down, and the end up */
+		target = s + sectors;
+		rounddown(s, bb->shift);
+		roundup(target, bb->shift);
+		sectors = target - s;
+	}
+
+retry:
+	seq = read_seqbegin(&bb->lock);
+
+	p = bb->page;
+	unacked_badblocks = 0;
+	acked_badblocks = 0;
+
+re_check:
+	bad.start = s;
+	bad.len = sectors;
+
+	if (badblocks_empty(bb)) {
+		len = sectors;
+		goto update_sectors;
+	}
+
+	prev = prev_badblocks(bb, &bad, hint);
+
+	/* start after all badblocks */
+	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+		len = sectors;
+		goto update_sectors;
+	}
+
+	if (overlap_front(bb, prev, &bad)) {
+		if (BB_ACK(p[prev]))
+			acked_badblocks++;
+		else
+			unacked_badblocks++;
+
+		if (BB_END(p[prev]) >= (s + sectors))
+			len = sectors;
+		else
+			len = BB_END(p[prev]) - s;
+
+		if (set == 0) {
+			*first_bad = BB_OFFSET(p[prev]);
+			*bad_sectors = BB_LEN(p[prev]);
+			set = 1;
+		}
+		goto update_sectors;
+	}
+
+	/* Not front overlap, but behind overlap */
+	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+		len = BB_OFFSET(p[prev + 1]) - bad.start;
+		hint = prev + 1;
+		goto update_sectors;
+	}
+
+	/* not cover any badblocks range in the table */
+	len = sectors;
+
+update_sectors:
+	s += len;
+	sectors -= len;
+
+	if (sectors > 0)
+		goto re_check;
+
+	WARN_ON(sectors < 0);
+
+	if (unacked_badblocks > 0)
+		rv = -1;
+	else if (acked_badblocks > 0)
+		rv = 1;
+	else
+		rv = 0;
+
+	if (read_seqretry(&bb->lock, seq))
+		goto retry;
+
+	return rv;
+}
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb:		the badblocks structure that holds all badblock information
+ * @s:		sector (start) at which to check for badblocks
+ * @sectors:	number of sectors to check for badblocks
+ * @first_bad:	pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide.  This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ *  A 'shift' can be set so that larger blocks are tracked and
+ *  consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad.  So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ *  0: there are no known bad blocks in the range
+ *  1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+			sector_t *first_bad, int *bad_sectors)
+{
+	return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb:		the badblocks structure that holds all badblock information
+ * @s:		first sector to mark as bad
+ * @sectors:	number of sectors to mark as bad
+ * @acknowledged: weather to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ *  0: success
+ *  1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+			int acknowledged)
+{
+	return _badblocks_set(bb, s, sectors, acknowledged);
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks to the table.
+ * @bb:		the badblocks structure that holds all badblock information
+ * @s:		first sector to mark as bad
+ * @sectors:	number of sectors to mark as bad
+ *
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail.  So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ *  0: success
+ *  1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+	return _badblocks_clear(bb, s, sectors);
+}
 EXPORT_SYMBOL_GPL(badblocks_clear);
 
 /**
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e73dc22d05c1..3f4d41952ef2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	flush_rq->mq_ctx = first_rq->mq_ctx;
 	flush_rq->mq_hctx = first_rq->mq_hctx;
 
-	if (!q->elevator) {
+	if (!q->elevator)
 		flush_rq->tag = first_rq->tag;
-
-		/*
-		 * We borrow data request's driver tag, so have to mark
-		 * this flush request as INFLIGHT for avoiding double
-		 * account of this driver tag
-		 */
-		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
-	} else
+	else
 		flush_rq->internal_tag = first_rq->internal_tag;
 
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b2..5cbeb9344f2f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -246,7 +246,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STARTED),
 	RQF_NAME(FLUSH_SEQ),
 	RQF_NAME(MIXED_MERGE),
-	RQF_NAME(MQ_INFLIGHT),
 	RQF_NAME(DONTPREP),
 	RQF_NAME(SCHED_TAGS),
 	RQF_NAME(USE_SCHED),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1fafd54dce3c..e2d11183f62e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -426,6 +426,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 		rq_list_add(data->cached_rq, rq);
 		nr++;
 	}
+	if (!(data->rq_flags & RQF_SCHED_TAGS))
+		blk_mq_add_active_requests(data->hctx, nr);
 	/* caller already holds a reference, add for remainder */
 	percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
 	data->nr_tags -= nr;
@@ -510,6 +512,8 @@ retry:
 		goto retry;
 	}
 
+	if (!(data->rq_flags & RQF_SCHED_TAGS))
+		blk_mq_inc_active_requests(data->hctx);
 	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	return rq;
@@ -669,6 +673,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	tag = blk_mq_get_tag(&data);
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
+	if (!(data.rq_flags & RQF_SCHED_TAGS))
+		blk_mq_inc_active_requests(data.hctx);
 	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	rq->__data_len = 0;
@@ -708,11 +714,10 @@ static void __blk_mq_free_request(struct request *rq)
 	blk_pm_mark_last_busy(rq);
 	rq->mq_hctx = NULL;
 
-	if (rq->rq_flags & RQF_MQ_INFLIGHT)
-		__blk_mq_dec_active_requests(hctx);
-
-	if (rq->tag != BLK_MQ_NO_TAG)
+	if (rq->tag != BLK_MQ_NO_TAG) {
+		blk_mq_dec_active_requests(hctx);
 		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+	}
 	if (sched_tag != BLK_MQ_NO_TAG)
 		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
 	blk_mq_sched_restart(hctx);
@@ -1061,12 +1066,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = hctx->queue;
 
-	/*
-	 * All requests should have been marked as RQF_MQ_INFLIGHT, so
-	 * update hctx->nr_active in batch
-	 */
-	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-		__blk_mq_sub_active_requests(hctx, nr_tags);
+	blk_mq_sub_active_requests(hctx, nr_tags);
 
 	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
 	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
@@ -1259,6 +1259,7 @@ void blk_mq_start_request(struct request *rq)
 
 	blk_add_timer(rq);
 	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
+	rq->mq_hctx->tags->rqs[rq->tag] = rq;
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
@@ -1748,7 +1749,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 	return data.rq;
 }
 
-static bool __blk_mq_alloc_driver_tag(struct request *rq)
+bool __blk_mq_alloc_driver_tag(struct request *rq)
 {
 	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
 	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
@@ -1769,20 +1770,7 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)
 		return false;
 
 	rq->tag = tag + tag_offset;
-	return true;
-}
-
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
-	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
-		return false;
-
-	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
-			!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
-		rq->rq_flags |= RQF_MQ_INFLIGHT;
-		__blk_mq_inc_active_requests(hctx);
-	}
-	hctx->tags->rqs[rq->tag] = rq;
+	blk_mq_inc_active_requests(rq->mq_hctx);
 	return true;
 }
 
@@ -2794,13 +2782,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		 * If we do, we can dispatch the whole plug list in one go. We
 		 * already know at this point that all requests belong to the
 		 * same queue, caller must ensure that's the case.
-		 *
-		 * Since we pass off the full list to the driver at this point,
-		 * we do not increment the active request count for the queue.
-		 * Bypass shared tags for now because of that.
 		 */
-		if (q->mq_ops->queue_rqs &&
-		    !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+		if (q->mq_ops->queue_rqs) {
 			blk_mq_run_dispatch_ops(q,
 				__blk_mq_flush_plug_list(q, plug));
 			if (rq_list_empty(plug->mq_list))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1743857e0b01..f75a9ecfebde 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -271,12 +271,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
 	return -1;
 }
 
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+						int val)
 {
 	if (blk_mq_is_shared_tags(hctx->flags))
-		atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
+		atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
 	else
-		atomic_inc(&hctx->nr_active);
+		atomic_add(val, &hctx->nr_active);
+}
+
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+	__blk_mq_add_active_requests(hctx, 1);
 }
 
 static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
@@ -293,6 +299,32 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
 	__blk_mq_sub_active_requests(hctx, 1);
 }
 
+static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+					      int val)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_add_active_requests(hctx, val);
+}
+
+static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_inc_active_requests(hctx);
+}
+
+static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
+					      int val)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_sub_active_requests(hctx, val);
+}
+
+static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_dec_active_requests(hctx);
+}
+
 static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
 {
 	if (blk_mq_is_shared_tags(hctx->flags))
@@ -302,13 +334,9 @@ static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 					   struct request *rq)
 {
+	blk_mq_dec_active_requests(hctx);
 	blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
 	rq->tag = BLK_MQ_NO_TAG;
-
-	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-		__blk_mq_dec_active_requests(hctx);
-	}
 }
 
 static inline void blk_mq_put_driver_tag(struct request *rq)
@@ -319,19 +347,14 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
 	__blk_mq_put_driver_tag(rq->mq_hctx, rq);
 }
 
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool __blk_mq_alloc_driver_tag(struct request *rq);
 
 static inline bool blk_mq_get_driver_tag(struct request *rq)
 {
-	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-	if (rq->tag != BLK_MQ_NO_TAG &&
-	    !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
-		hctx->tags->rqs[rq->tag] = rq;
-		return true;
-	}
+	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+		return false;
 
-	return __blk_mq_get_driver_tag(hctx, rq);
+	return true;
 }
 
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 403756dbd50d..82d9c4c3fb41 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
 		ptr->b;
 }
 
+/* Volume Label Type/ID Length */
+#define DASD_VOL_TYPE_LEN	4
+#define DASD_VOL_ID_LEN		6
+
+/* Volume Label Types */
+#define DASD_VOLLBL_TYPE_VOL1 0
+#define DASD_VOLLBL_TYPE_LNX1 1
+#define DASD_VOLLBL_TYPE_CMS1 2
+
+struct dasd_vollabel {
+	char *type;
+	int idx;
+};
+
+static struct dasd_vollabel dasd_vollabels[] = {
+	[DASD_VOLLBL_TYPE_VOL1] = {
+		.type = "VOL1",
+		.idx = DASD_VOLLBL_TYPE_VOL1,
+	},
+	[DASD_VOLLBL_TYPE_LNX1] = {
+		.type = "LNX1",
+		.idx = DASD_VOLLBL_TYPE_LNX1,
+	},
+	[DASD_VOLLBL_TYPE_CMS1] = {
+		.type = "CMS1",
+		.idx = DASD_VOLLBL_TYPE_CMS1,
+	},
+};
+
+static int get_label_by_type(const char *type)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) {
+		if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN))
+			return dasd_vollabels[i].idx;
+	}
+
+	return -1;
+}
+
 static int find_label(struct parsed_partitions *state,
 		      dasd_information2_t *info,
 		      struct hd_geometry *geo,
@@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state,
 		      char type[],
 		      union label_t *label)
 {
-	Sector sect;
-	unsigned char *data;
 	sector_t testsect[3];
-	unsigned char temp[5];
-	int found = 0;
 	int i, testcount;
+	Sector sect;
+	void *data;
 
 	/* There a three places where we may find a valid label:
 	 * - on an ECKD disk it's block 2
@@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state,
 		if (data == NULL)
 			continue;
 		memcpy(label, data, sizeof(*label));
-		memcpy(temp, data, 4);
-		temp[4] = 0;
-		EBCASC(temp, 4);
+		memcpy(type, data, DASD_VOL_TYPE_LEN);
+		EBCASC(type, DASD_VOL_TYPE_LEN);
 		put_dev_sector(sect);
-		if (!strcmp(temp, "VOL1") ||
-		    !strcmp(temp, "LNX1") ||
-		    !strcmp(temp, "CMS1")) {
-			if (!strcmp(temp, "VOL1")) {
-				strncpy(type, label->vol.vollbl, 4);
-				strncpy(name, label->vol.volid, 6);
-			} else {
-				strncpy(type, label->lnx.vollbl, 4);
-				strncpy(name, label->lnx.volid, 6);
-			}
-			EBCASC(type, 4);
-			EBCASC(name, 6);
+		switch (get_label_by_type(type)) {
+		case DASD_VOLLBL_TYPE_VOL1:
+			memcpy(name, label->vol.volid, DASD_VOL_ID_LEN);
+			EBCASC(name, DASD_VOL_ID_LEN);
+			*labelsect = testsect[i];
+			return 1;
+		case DASD_VOLLBL_TYPE_LNX1:
+		case DASD_VOLLBL_TYPE_CMS1:
+			memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN);
+			EBCASC(name, DASD_VOL_ID_LEN);
 			*labelsect = testsect[i];
-			found = 1;
+			return 1;
+		default:
 			break;
 		}
 	}
-	if (!found)
-		memset(label, 0, sizeof(*label));
 
-	return found;
+	return 0;
 }
 
 static int find_vol1_partitions(struct parsed_partitions *state,
@@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state)
 	sector_t nr_sectors;
 	dasd_information2_t *info;
 	struct hd_geometry *geo;
-	char type[5] = {0,};
-	char name[7] = {0,};
+	char type[DASD_VOL_TYPE_LEN + 1] = "";
+	char name[DASD_VOL_ID_LEN + 1] = "";
 	sector_t labelsect;
 	union label_t *label;
 
@@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state)
 		info = NULL;
 	}
 
-	if (find_label(state, info, geo, blocksize, &labelsect, name, type,
-		       label)) {
-		if (!strncmp(type, "VOL1", 4)) {
+	if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) {
+		switch (get_label_by_type(type)) {
+		case DASD_VOLLBL_TYPE_VOL1:
 			res = find_vol1_partitions(state, geo, blocksize, name,
 						   label);
-		} else if (!strncmp(type, "LNX1", 4)) {
+			break;
+		case DASD_VOLLBL_TYPE_LNX1:
 			res = find_lnx1_partitions(state, geo, blocksize, name,
 						   label, labelsect, nr_sectors,
 						   info);
-		} else if (!strncmp(type, "CMS1", 4)) {
+			break;
+		case DASD_VOLLBL_TYPE_CMS1:
 			res = find_cms1_partitions(state, geo, blocksize, name,
 						   label, labelsect);
+			break;
 		}
 	} else if (info) {
 		/*
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 04f38a3f5d95..3d9e9cd250bd 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <uapi/linux/sed-opal.h>
 #include <linux/sed-opal.h>
+#include <linux/sed-opal-key.h>
 #include <linux/string.h>
 #include <linux/kdev_t.h>
 #include <linux/key.h>
@@ -3018,7 +3019,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
 	if (ret)
 		return ret;
 
-	/* update keyring with new password */
+	/* update keyring and key store with new password */
+	ret = sed_write_key(OPAL_AUTH_KEY,
+			    opal_pw->new_user_pw.opal_key.key,
+			    opal_pw->new_user_pw.opal_key.key_len);
+	if (ret != -EOPNOTSUPP)
+		pr_warn("error updating SED key: %d\n", ret);
+
 	ret = update_sed_opal_key(OPAL_AUTH_KEY,
 				  opal_pw->new_user_pw.opal_key.key,
 				  opal_pw->new_user_pw.opal_key.key_len);
@@ -3291,6 +3298,8 @@ EXPORT_SYMBOL_GPL(sed_ioctl);
 static int __init sed_opal_init(void)
 {
 	struct key *kr;
+	char init_sed_key[OPAL_KEY_MAX];
+	int keylen = OPAL_KEY_MAX - 1;
 
 	kr = keyring_alloc(".sed_opal",
 			   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
@@ -3303,6 +3312,11 @@ static int __init sed_opal_init(void)
 
 	sed_opal_keyring = kr;
 
-	return 0;
+	if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) {
+		memset(init_sed_key, '\0', sizeof(init_sed_key));
+		keylen = OPAL_KEY_MAX - 1;
+	}
+
+	return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen);
 }
 late_initcall(sed_opal_init);
author	Linus Torvalds	2023-11-01 12:30:07 -1000
committer	Linus Torvalds	2023-11-01 12:30:07 -1000
commit	90d624af2e5a9945eedd5cafd6ae6d88f32cc977 (patch)
tree	e936a0cc8f2b613f327ab08280dccbad664703cf /block
parent	4de520f1fcefd4ebb7dddcf28bde1b330c2f6b5d (diff)
parent	0c696bb38f4cc0f0f90a8e06ae1eda21a9630cd0 (diff)