Add failfast support.

Allow per-device "failfast" flag to be set when creating an
array or adding devices to an array.

When re-adding a device which had the failfast flag, it can be removed
using --nofailfast.

failfast status is printed in --detail and --examine output.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
This commit is contained in:
NeilBrown 2016-11-25 10:55:49 +11:00 committed by Jes Sorensen
parent cf52eff58a
commit 71574efb07
12 changed files with 148 additions and 6 deletions

View File

@ -890,6 +890,8 @@ int Create(struct supertype *st, char *mddev,
if (dv->writemostly == 1)
inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
if (dv->failfast == 1)
inf->disk.state |= (1<<MD_DISK_FAILFAST);
if (have_container)
fd = -1;

View File

@ -658,6 +658,7 @@ This is pretty boring
}
if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
if (disk.state & (1<<MD_DISK_FAILFAST)) printf(" failfast");
if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal");
if ((disk.state &
((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)

View File

@ -1035,6 +1035,7 @@ static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
devlist.next = NULL;
devlist.used = 0;
devlist.writemostly = 0;
devlist.failfast = 0;
devlist.devname = chosen_devname;
sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
minor(stb.st_rdev));

View File

@ -683,8 +683,13 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
if (dv->writemostly == 2)
disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
if (dv->failfast == 1)
disc.state |= 1 << MD_DISK_FAILFAST;
if (dv->failfast == 2)
disc.state &= ~(1 << MD_DISK_FAILFAST);
remove_partitions(tfd);
if (update || dv->writemostly > 0) {
if (update || dv->writemostly > 0
|| dv->failfast > 0) {
int rv = -1;
tfd = dev_open(dv->devname, O_RDWR);
if (tfd < 0) {
@ -700,6 +705,14 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
rv = dev_st->ss->update_super(
dev_st, NULL, "readwrite",
devname, verbose, 0, NULL);
if (dv->failfast == 1)
rv = dev_st->ss->update_super(
dev_st, NULL, "failfast",
devname, verbose, 0, NULL);
if (dv->failfast == 2)
rv = dev_st->ss->update_super(
dev_st, NULL, "nofailfast",
devname, verbose, 0, NULL);
if (update)
rv = dev_st->ss->update_super(
dev_st, NULL, update,
@ -964,6 +977,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
if (dv->failfast == 1)
disc.state |= 1 << MD_DISK_FAILFAST;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
if (tst->ss->add_to_super(tst, &disc, dfd,
dv->devname, INVALID_SECTORS))
@ -1009,6 +1024,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
if (dv->writemostly == 1)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
if (dv->failfast == 1)
disc.state |= (1 << MD_DISK_FAILFAST);
if (tst->ss->external) {
/* add a disk
* to an external metadata container */
@ -1785,6 +1802,7 @@ int move_spare(char *from_devname, char *to_devname, dev_t devid)
devlist.next = NULL;
devlist.used = 0;
devlist.writemostly = 0;
devlist.failfast = 0;
devlist.devname = devname;
sprintf(devname, "%d:%d", major(devid), minor(devid));

View File

@ -136,6 +136,8 @@ struct option long_options[] = {
{"bitmap-chunk", 1, 0, BitmapChunk},
{"write-behind", 2, 0, WriteBehind},
{"write-mostly",0, 0, WriteMostly},
{"failfast", 0, 0, FailFast},
{"nofailfast",0, 0, NoFailFast},
{"re-add", 0, 0, ReAdd},
{"homehost", 1, 0, HomeHost},
{"symlinks", 1, 0, Symlinks},

54
md.4
View File

@ -916,6 +916,60 @@ slow). The extra latency of the remote link will not slow down normal
operations, but the remote system will still have a reasonably
up-to-date copy of all data.
.SS FAILFAST
From Linux 4.10,
.I
md
supports FAILFAST for RAID1 and RAID10 arrays. This is a flag that
can be set on individual drives, though it is usually set on all
drives, or no drives.
When
.I md
sends an I/O request to a drive that is marked as FAILFAST, and when
the array could survive the loss of that drive without losing data,
.I md
will request that the underlying device does not perform any retries.
This means that a failure will be reported to
.I md
promptly, and it can mark the device as faulty and continue using the
other device(s).
.I md
cannot control the timeout that the underlying devices use to
determine failure. Any changes desired to that timeout must be set
explicitly on the underlying device, separately from using
.IR mdadm .
If a FAILFAST request does fail, and if it is still safe to mark the
device as faulty without data loss, that will be done and the array
will continue functioning on a reduced number of devices. If it is not
possible to safely mark the device as faulty,
.I md
will retry the request without disabling retries in the underlying
device. In any case,
.I md
will not attempt to repair read errors on a device marked as FAILFAST
by writing out the correct data. It will just mark the device as faulty.
FAILFAST is appropriate for storage arrays that have a low probability
of true failure, but will sometimes introduce unacceptable delays to
I/O requests while performing internal maintenance. The value of
setting FAILFAST involves a trade-off. The gain is that the chance of
unacceptable delays is substantially reduced. The cost is that the
unlikely event of data-loss on one device is slightly more likely to
result in data-loss for the array.
When a device in an array using FAILFAST is marked as faulty, it will
usually become usable again in a short while.
.I mdadm
makes no attempt to detect that possibility. Some separate
mechanism, tuned to the specific details of the expected failure modes,
needs to be created to monitor devices to see when they return to full
functionality, and to then re-add them to the array. In order for
this "re-add" functionality to be effective, an array using FAILFAST
should always have a write-intent bitmap.
.SS RESTRIPING
.IR Restriping ,

1
md_p.h
View File

@ -89,6 +89,7 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */
#define MD_DISK_REPLACEMENT 17
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */

View File

@ -747,7 +747,7 @@ subsequent devices listed in a
.BR \-\-create ,
or
.B \-\-add
command will be flagged as 'write-mostly'. This is valid for RAID1
command will be flagged as 'write\-mostly'. This is valid for RAID1
only and means that the 'md' driver will avoid reading from these
devices if at all possible. This can be useful if mirroring over a
slow link.
@ -761,6 +761,25 @@ A write-intent bitmap is required in order to use write-behind
mode, and write-behind is only attempted on drives marked as
.IR write-mostly .
.TP
.BR \-\-failfast
Subsequent devices listed in a
.B \-\-create
or
.B \-\-add
command will be flagged as 'failfast'. This is valid for RAID1 and
RAID10 only. IO requests to these devices will be encouraged to fail
quickly rather than cause long delays due to error handling. Also no
attempt is made to repair a read error on these devices.
If an array becomes degraded so that the 'failfast' device is the only
usable device, the 'failfast' flag will then be ignored and extended
delays will be preferred to complete failure.
The 'failfast' flag is appropriate for storage arrays which have a
low probability of true failure, but which may sometimes
cause unacceptable delays due to internal maintenance functions.
.TP
.BR \-\-assume\-clean
Tell
@ -1452,6 +1471,17 @@ that had a failed journal. To avoid interrupting on-going write opertions,
.B \-\-add-journal
only works for array in Read-Only state.
.TP
.BR \-\-failfast
Subsequent devices that are added or re\-added will have
the 'failfast' flag set. This is only valid for RAID1 and RAID10 and
means that the 'md' driver will avoid long timeouts on error handling
where possible.
.TP
.BR \-\-nofailfast
Subsequent devices that are re\-added will be re\-added without
the 'failfast' flag set.
.P
Each of these options requires that the first device listed is the array
to be acted upon, and the remainder are component devices to be added,

11
mdadm.c
View File

@ -90,6 +90,7 @@ int main(int argc, char *argv[])
int spare_sharing = 1;
struct supertype *ss = NULL;
int writemostly = 0;
int failfast = 0;
char *shortopt = short_options;
int dosyslog = 0;
int rebuild_map = 0;
@ -295,6 +296,7 @@ int main(int argc, char *argv[])
dv->devname = optarg;
dv->disposition = devmode;
dv->writemostly = writemostly;
dv->failfast = failfast;
dv->used = 0;
dv->next = NULL;
*devlistend = dv;
@ -351,6 +353,7 @@ int main(int argc, char *argv[])
dv->devname = optarg;
dv->disposition = devmode;
dv->writemostly = writemostly;
dv->failfast = failfast;
dv->used = 0;
dv->next = NULL;
*devlistend = dv;
@ -417,6 +420,14 @@ int main(int argc, char *argv[])
writemostly = 2;
continue;
case O(MANAGE,FailFast):
case O(CREATE,FailFast):
failfast = 1;
continue;
case O(MANAGE,NoFailFast):
failfast = 2;
continue;
case O(GROW,'z'):
case O(CREATE,'z'):
case O(BUILD,'z'): /* size */

5
mdadm.h Executable file → Normal file
View File

@ -383,6 +383,8 @@ enum special_options {
ConfigFile,
ChunkSize,
WriteMostly,
FailFast,
NoFailFast,
Layout,
Auto,
Force,
@ -516,6 +518,7 @@ struct mddev_dev {
* Not set for names read from .config
*/
char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */
char failfast; /* Ditto but for 'failfast' flag */
int used; /* set when used */
long long data_offset;
struct mddev_dev *next;
@ -821,6 +824,8 @@ extern struct superswitch {
* linear-grow-update - now change the size of the array.
* writemostly - set the WriteMostly1 bit in the superblock devflags
* readwrite - clear the WriteMostly1 bit in the superblock devflags
* failfast - set the FailFast1 bit in the superblock
* nofailfast - clear the FailFast1 bit
* no-bitmap - clear any record that a bitmap is present.
* bbl - add a bad-block-log if possible
* no-bbl - remove any bad-block-log if it is empty.

View File

@ -232,14 +232,15 @@ static void examine_super0(struct supertype *st, char *homehost)
mdp_disk_t *dp;
char *dv;
char nb[5];
int wonly;
int wonly, failfast;
if (d>=0) dp = &sb->disks[d];
else dp = &sb->this_disk;
snprintf(nb, sizeof(nb), "%4d", d);
printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb,
dp->number, dp->major, dp->minor, dp->raid_disk);
wonly = dp->state & (1 << MD_DISK_WRITEMOSTLY);
dp->state &= ~(1 << MD_DISK_WRITEMOSTLY);
failfast = dp->state & (1<<MD_DISK_FAILFAST);
dp->state &= ~(wonly | failfast);
if (dp->state & (1 << MD_DISK_FAULTY))
printf(" faulty");
if (dp->state & (1 << MD_DISK_ACTIVE))
@ -250,6 +251,8 @@ static void examine_super0(struct supertype *st, char *homehost)
printf(" removed");
if (wonly)
printf(" write-mostly");
if (failfast)
printf(" failfast");
if (dp->state == 0)
printf(" spare");
if ((dv = map_dev(dp->major, dp->minor, 0)))
@ -581,7 +584,8 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
} else if (strcmp(update, "assemble")==0) {
int d = info->disk.number;
int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
int mask = (1<<MD_DISK_WRITEMOSTLY);
int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST);
int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST);
int add = 0;
if (sb->minor_version >= 91)
/* During reshape we don't insist on everything
@ -590,7 +594,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
add = (1<<MD_DISK_SYNC);
if (((sb->disks[d].state & ~mask) | add)
!= (unsigned)info->disk.state) {
sb->disks[d].state = info->disk.state | wonly;
sb->disks[d].state = info->disk.state | wonly |failfast;
rv = 1;
}
if (info->reshape_active &&

View File

@ -77,6 +77,7 @@ struct mdp_superblock_1 {
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
__u8 devflags; /* per-device flags. Only one defined...*/
#define WriteMostly1 1 /* mask for writemostly flag in above */
#define FailFast1 2 /* Device should get FailFast requests */
/* bad block log. If there are any bad blocks the feature flag is set.
* if offset and size are non-zero, that space is reserved and available.
*/
@ -430,6 +431,8 @@ static void examine_super1(struct supertype *st, char *homehost)
printf(" Flags :");
if (sb->devflags & WriteMostly1)
printf(" write-mostly");
if (sb->devflags & FailFast1)
printf(" failfast");
printf("\n");
}
@ -1020,6 +1023,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
}
if (sb->devflags & WriteMostly1)
info->disk.state |= (1 << MD_DISK_WRITEMOSTLY);
if (sb->devflags & FailFast1)
info->disk.state |= (1 << MD_DISK_FAILFAST);
info->events = __le64_to_cpu(sb->events);
sprintf(info->text_version, "1.%d", st->minor_version);
info->safe_mode_delay = 200;
@ -1377,6 +1382,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
sb->devflags |= WriteMostly1;
else if (strcmp(update, "readwrite")==0)
sb->devflags &= ~WriteMostly1;
else if (strcmp(update, "failfast") == 0)
sb->devflags |= FailFast1;
else if (strcmp(update, "nofailfast") == 0)
sb->devflags &= ~FailFast1;
else
rv = -1;
@ -1713,6 +1722,10 @@ static int write_init_super1(struct supertype *st)
sb->devflags |= WriteMostly1;
else
sb->devflags &= ~WriteMostly1;
if (di->disk.state & (1<<MD_DISK_FAILFAST))
sb->devflags |= FailFast1;
else
sb->devflags &= ~FailFast1;
random_uuid(sb->device_uuid);