Add failfast support.
Allow per-device "failfast" flag to be set when creating an array or adding devices to an array. When re-adding a device which had the failfast flag, it can be removed using --nofailfast. failfast status is printed in --detail and --examine output. Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
This commit is contained in:
parent
cf52eff58a
commit
71574efb07
2
Create.c
2
Create.c
|
@ -890,6 +890,8 @@ int Create(struct supertype *st, char *mddev,
|
|||
|
||||
if (dv->writemostly == 1)
|
||||
inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
|
||||
if (dv->failfast == 1)
|
||||
inf->disk.state |= (1<<MD_DISK_FAILFAST);
|
||||
|
||||
if (have_container)
|
||||
fd = -1;
|
||||
|
|
1
Detail.c
1
Detail.c
|
@ -658,6 +658,7 @@ This is pretty boring
|
|||
}
|
||||
if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
|
||||
if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
|
||||
if (disk.state & (1<<MD_DISK_FAILFAST)) printf(" failfast");
|
||||
if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal");
|
||||
if ((disk.state &
|
||||
((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
|
||||
|
|
|
@ -1035,6 +1035,7 @@ static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
|
|||
devlist.next = NULL;
|
||||
devlist.used = 0;
|
||||
devlist.writemostly = 0;
|
||||
devlist.failfast = 0;
|
||||
devlist.devname = chosen_devname;
|
||||
sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
|
||||
minor(stb.st_rdev));
|
||||
|
|
20
Manage.c
20
Manage.c
|
@ -683,8 +683,13 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
|
|||
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
|
||||
if (dv->writemostly == 2)
|
||||
disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
|
||||
if (dv->failfast == 1)
|
||||
disc.state |= 1 << MD_DISK_FAILFAST;
|
||||
if (dv->failfast == 2)
|
||||
disc.state &= ~(1 << MD_DISK_FAILFAST);
|
||||
remove_partitions(tfd);
|
||||
if (update || dv->writemostly > 0) {
|
||||
if (update || dv->writemostly > 0
|
||||
|| dv->failfast > 0) {
|
||||
int rv = -1;
|
||||
tfd = dev_open(dv->devname, O_RDWR);
|
||||
if (tfd < 0) {
|
||||
|
@ -700,6 +705,14 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
|
|||
rv = dev_st->ss->update_super(
|
||||
dev_st, NULL, "readwrite",
|
||||
devname, verbose, 0, NULL);
|
||||
if (dv->failfast == 1)
|
||||
rv = dev_st->ss->update_super(
|
||||
dev_st, NULL, "failfast",
|
||||
devname, verbose, 0, NULL);
|
||||
if (dv->failfast == 2)
|
||||
rv = dev_st->ss->update_super(
|
||||
dev_st, NULL, "nofailfast",
|
||||
devname, verbose, 0, NULL);
|
||||
if (update)
|
||||
rv = dev_st->ss->update_super(
|
||||
dev_st, NULL, update,
|
||||
|
@ -964,6 +977,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
|
|||
disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
|
||||
if (dv->writemostly == 1)
|
||||
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
|
||||
if (dv->failfast == 1)
|
||||
disc.state |= 1 << MD_DISK_FAILFAST;
|
||||
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
|
||||
if (tst->ss->add_to_super(tst, &disc, dfd,
|
||||
dv->devname, INVALID_SECTORS))
|
||||
|
@ -1009,6 +1024,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
|
|||
|
||||
if (dv->writemostly == 1)
|
||||
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
|
||||
if (dv->failfast == 1)
|
||||
disc.state |= (1 << MD_DISK_FAILFAST);
|
||||
if (tst->ss->external) {
|
||||
/* add a disk
|
||||
* to an external metadata container */
|
||||
|
@ -1785,6 +1802,7 @@ int move_spare(char *from_devname, char *to_devname, dev_t devid)
|
|||
devlist.next = NULL;
|
||||
devlist.used = 0;
|
||||
devlist.writemostly = 0;
|
||||
devlist.failfast = 0;
|
||||
devlist.devname = devname;
|
||||
sprintf(devname, "%d:%d", major(devid), minor(devid));
|
||||
|
||||
|
|
2
ReadMe.c
2
ReadMe.c
|
@ -136,6 +136,8 @@ struct option long_options[] = {
|
|||
{"bitmap-chunk", 1, 0, BitmapChunk},
|
||||
{"write-behind", 2, 0, WriteBehind},
|
||||
{"write-mostly",0, 0, WriteMostly},
|
||||
{"failfast", 0, 0, FailFast},
|
||||
{"nofailfast",0, 0, NoFailFast},
|
||||
{"re-add", 0, 0, ReAdd},
|
||||
{"homehost", 1, 0, HomeHost},
|
||||
{"symlinks", 1, 0, Symlinks},
|
||||
|
|
54
md.4
54
md.4
|
@ -916,6 +916,60 @@ slow). The extra latency of the remote link will not slow down normal
|
|||
operations, but the remote system will still have a reasonably
|
||||
up-to-date copy of all data.
|
||||
|
||||
.SS FAILFAST
|
||||
|
||||
From Linux 4.10,
|
||||
.I
|
||||
md
|
||||
supports FAILFAST for RAID1 and RAID10 arrays. This is a flag that
|
||||
can be set on individual drives, though it is usually set on all
|
||||
drives, or no drives.
|
||||
|
||||
When
|
||||
.I md
|
||||
sends an I/O request to a drive that is marked as FAILFAST, and when
|
||||
the array could survive the loss of that drive without losing data,
|
||||
.I md
|
||||
will request that the underlying device does not perform any retries.
|
||||
This means that a failure will be reported to
|
||||
.I md
|
||||
promptly, and it can mark the device as faulty and continue using the
|
||||
other device(s).
|
||||
.I md
|
||||
cannot control the timeout that the underlying devices use to
|
||||
determine failure. Any changes desired to that timeout must be set
|
||||
explicitly on the underlying device, separately from using
|
||||
.IR mdadm .
|
||||
|
||||
If a FAILFAST request does fail, and if it is still safe to mark the
|
||||
device as faulty without data loss, that will be done and the array
|
||||
will continue functioning on a reduced number of devices. If it is not
|
||||
possible to safely mark the device as faulty,
|
||||
.I md
|
||||
will retry the request without disabling retries in the underlying
|
||||
device. In any case,
|
||||
.I md
|
||||
will not attempt to repair read errors on a device marked as FAILFAST
|
||||
by writing out the correct data. It will just mark the device as faulty.
|
||||
|
||||
FAILFAST is appropriate for storage arrays that have a low probability
|
||||
of true failure, but will sometimes introduce unacceptable delays to
|
||||
I/O requests while performing internal maintenance. The value of
|
||||
setting FAILFAST involves a trade-off. The gain is that the chance of
|
||||
unacceptable delays is substantially reduced. The cost is that the
|
||||
unlikely event of data-loss on one device is slightly more likely to
|
||||
result in data-loss for the array.
|
||||
|
||||
When a device in an array using FAILFAST is marked as faulty, it will
|
||||
usually become usable again in a short while.
|
||||
.I mdadm
|
||||
makes no attempt to detect that possibility. Some separate
|
||||
mechanism, tuned to the specific details of the expected failure modes,
|
||||
needs to be created to monitor devices to see when they return to full
|
||||
functionality, and to then re-add them to the array. In order for
|
||||
this "re-add" functionality to be effective, an array using FAILFAST
|
||||
should always have a write-intent bitmap.
|
||||
|
||||
.SS RESTRIPING
|
||||
|
||||
.IR Restriping ,
|
||||
|
|
1
md_p.h
1
md_p.h
|
@ -89,6 +89,7 @@
|
|||
* read requests will only be sent here in
|
||||
* dire need
|
||||
*/
|
||||
#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */
|
||||
|
||||
#define MD_DISK_REPLACEMENT 17
|
||||
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
|
||||
|
|
32
mdadm.8.in
32
mdadm.8.in
|
@ -747,7 +747,7 @@ subsequent devices listed in a
|
|||
.BR \-\-create ,
|
||||
or
|
||||
.B \-\-add
|
||||
command will be flagged as 'write-mostly'. This is valid for RAID1
|
||||
command will be flagged as 'write\-mostly'. This is valid for RAID1
|
||||
only and means that the 'md' driver will avoid reading from these
|
||||
devices if at all possible. This can be useful if mirroring over a
|
||||
slow link.
|
||||
|
@ -761,6 +761,25 @@ A write-intent bitmap is required in order to use write-behind
|
|||
mode, and write-behind is only attempted on drives marked as
|
||||
.IR write-mostly .
|
||||
|
||||
.TP
|
||||
.BR \-\-failfast
|
||||
subsequent devices listed in a
|
||||
.B \-\-create
|
||||
or
|
||||
.B \-\-add
|
||||
command will be flagged as 'failfast'. This is valid for RAID1 and
|
||||
RAID10 only. IO requests to these devices will be encouraged to fail
|
||||
quickly rather than cause long delays due to error handling. Also no
|
||||
attempt is made to repair a read error on these devices.
|
||||
|
||||
If an array becomes degraded so that the 'failfast' device is the only
|
||||
usable device, the 'failfast' flag will then be ignored and extended
|
||||
delays will be preferred to complete failure.
|
||||
|
||||
The 'failfast' flag is appropriate for storage arrays which have a
|
||||
low probability of true failure, but which may sometimes
|
||||
cause unacceptable delays due to internal maintenance functions.
|
||||
|
||||
.TP
|
||||
.BR \-\-assume\-clean
|
||||
Tell
|
||||
|
@ -1452,6 +1471,17 @@ that had a failed journal. To avoid interrupting on-going write opertions,
|
|||
.B \-\-add-journal
|
||||
only works for array in Read-Only state.
|
||||
|
||||
.TP
|
||||
.BR \-\-failfast
|
||||
Subsequent devices that are added or re\-added will have
|
||||
the 'failfast' flag set. This is only valid for RAID1 and RAID10 and
|
||||
means that the 'md' driver will avoid long timeouts on error handling
|
||||
where possible.
|
||||
.TP
|
||||
.BR \-\-nofailfast
|
||||
Subsequent devices that are re\-added will be re\-added without
|
||||
the 'failfast' flag set.
|
||||
|
||||
.P
|
||||
Each of these options requires that the first device listed is the array
|
||||
to be acted upon, and the remainder are component devices to be added,
|
||||
|
|
11
mdadm.c
11
mdadm.c
|
@ -90,6 +90,7 @@ int main(int argc, char *argv[])
|
|||
int spare_sharing = 1;
|
||||
struct supertype *ss = NULL;
|
||||
int writemostly = 0;
|
||||
int failfast = 0;
|
||||
char *shortopt = short_options;
|
||||
int dosyslog = 0;
|
||||
int rebuild_map = 0;
|
||||
|
@ -295,6 +296,7 @@ int main(int argc, char *argv[])
|
|||
dv->devname = optarg;
|
||||
dv->disposition = devmode;
|
||||
dv->writemostly = writemostly;
|
||||
dv->failfast = failfast;
|
||||
dv->used = 0;
|
||||
dv->next = NULL;
|
||||
*devlistend = dv;
|
||||
|
@ -351,6 +353,7 @@ int main(int argc, char *argv[])
|
|||
dv->devname = optarg;
|
||||
dv->disposition = devmode;
|
||||
dv->writemostly = writemostly;
|
||||
dv->failfast = failfast;
|
||||
dv->used = 0;
|
||||
dv->next = NULL;
|
||||
*devlistend = dv;
|
||||
|
@ -417,6 +420,14 @@ int main(int argc, char *argv[])
|
|||
writemostly = 2;
|
||||
continue;
|
||||
|
||||
case O(MANAGE,FailFast):
|
||||
case O(CREATE,FailFast):
|
||||
failfast = 1;
|
||||
continue;
|
||||
case O(MANAGE,NoFailFast):
|
||||
failfast = 2;
|
||||
continue;
|
||||
|
||||
case O(GROW,'z'):
|
||||
case O(CREATE,'z'):
|
||||
case O(BUILD,'z'): /* size */
|
||||
|
|
|
@ -383,6 +383,8 @@ enum special_options {
|
|||
ConfigFile,
|
||||
ChunkSize,
|
||||
WriteMostly,
|
||||
FailFast,
|
||||
NoFailFast,
|
||||
Layout,
|
||||
Auto,
|
||||
Force,
|
||||
|
@ -516,6 +518,7 @@ struct mddev_dev {
|
|||
* Not set for names read from .config
|
||||
*/
|
||||
char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */
|
||||
char failfast; /* Ditto but for 'failfast' flag */
|
||||
int used; /* set when used */
|
||||
long long data_offset;
|
||||
struct mddev_dev *next;
|
||||
|
@ -821,6 +824,8 @@ extern struct superswitch {
|
|||
* linear-grow-update - now change the size of the array.
|
||||
* writemostly - set the WriteMostly1 bit in the superblock devflags
|
||||
* readwrite - clear the WriteMostly1 bit in the superblock devflags
|
||||
* failfast - set the FailFast1 bit in the superblock
|
||||
* nofailfast - clear the FailFast1 bit
|
||||
* no-bitmap - clear any record that a bitmap is present.
|
||||
* bbl - add a bad-block-log if possible
|
||||
* no-bbl - remove any bad-block-log if it is empty.
|
||||
|
|
12
super0.c
12
super0.c
|
@ -232,14 +232,15 @@ static void examine_super0(struct supertype *st, char *homehost)
|
|||
mdp_disk_t *dp;
|
||||
char *dv;
|
||||
char nb[5];
|
||||
int wonly;
|
||||
int wonly, failfast;
|
||||
if (d>=0) dp = &sb->disks[d];
|
||||
else dp = &sb->this_disk;
|
||||
snprintf(nb, sizeof(nb), "%4d", d);
|
||||
printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb,
|
||||
dp->number, dp->major, dp->minor, dp->raid_disk);
|
||||
wonly = dp->state & (1 << MD_DISK_WRITEMOSTLY);
|
||||
dp->state &= ~(1 << MD_DISK_WRITEMOSTLY);
|
||||
failfast = dp->state & (1<<MD_DISK_FAILFAST);
|
||||
dp->state &= ~(wonly | failfast);
|
||||
if (dp->state & (1 << MD_DISK_FAULTY))
|
||||
printf(" faulty");
|
||||
if (dp->state & (1 << MD_DISK_ACTIVE))
|
||||
|
@ -250,6 +251,8 @@ static void examine_super0(struct supertype *st, char *homehost)
|
|||
printf(" removed");
|
||||
if (wonly)
|
||||
printf(" write-mostly");
|
||||
if (failfast)
|
||||
printf(" failfast");
|
||||
if (dp->state == 0)
|
||||
printf(" spare");
|
||||
if ((dv = map_dev(dp->major, dp->minor, 0)))
|
||||
|
@ -581,7 +584,8 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
|
|||
} else if (strcmp(update, "assemble")==0) {
|
||||
int d = info->disk.number;
|
||||
int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
|
||||
int mask = (1<<MD_DISK_WRITEMOSTLY);
|
||||
int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST);
|
||||
int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST);
|
||||
int add = 0;
|
||||
if (sb->minor_version >= 91)
|
||||
/* During reshape we don't insist on everything
|
||||
|
@ -590,7 +594,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
|
|||
add = (1<<MD_DISK_SYNC);
|
||||
if (((sb->disks[d].state & ~mask) | add)
|
||||
!= (unsigned)info->disk.state) {
|
||||
sb->disks[d].state = info->disk.state | wonly;
|
||||
sb->disks[d].state = info->disk.state | wonly |failfast;
|
||||
rv = 1;
|
||||
}
|
||||
if (info->reshape_active &&
|
||||
|
|
13
super1.c
13
super1.c
|
@ -77,6 +77,7 @@ struct mdp_superblock_1 {
|
|||
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
|
||||
__u8 devflags; /* per-device flags. Only one defined...*/
|
||||
#define WriteMostly1 1 /* mask for writemostly flag in above */
|
||||
#define FailFast1 2 /* Device should get FailFast requests */
|
||||
/* bad block log. If there are any bad blocks the feature flag is set.
|
||||
* if offset and size are non-zero, that space is reserved and available.
|
||||
*/
|
||||
|
@ -430,6 +431,8 @@ static void examine_super1(struct supertype *st, char *homehost)
|
|||
printf(" Flags :");
|
||||
if (sb->devflags & WriteMostly1)
|
||||
printf(" write-mostly");
|
||||
if (sb->devflags & FailFast1)
|
||||
printf(" failfast");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -1020,6 +1023,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
|
|||
}
|
||||
if (sb->devflags & WriteMostly1)
|
||||
info->disk.state |= (1 << MD_DISK_WRITEMOSTLY);
|
||||
if (sb->devflags & FailFast1)
|
||||
info->disk.state |= (1 << MD_DISK_FAILFAST);
|
||||
info->events = __le64_to_cpu(sb->events);
|
||||
sprintf(info->text_version, "1.%d", st->minor_version);
|
||||
info->safe_mode_delay = 200;
|
||||
|
@ -1377,6 +1382,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
|
|||
sb->devflags |= WriteMostly1;
|
||||
else if (strcmp(update, "readwrite")==0)
|
||||
sb->devflags &= ~WriteMostly1;
|
||||
else if (strcmp(update, "failfast") == 0)
|
||||
sb->devflags |= FailFast1;
|
||||
else if (strcmp(update, "nofailfast") == 0)
|
||||
sb->devflags &= ~FailFast1;
|
||||
else
|
||||
rv = -1;
|
||||
|
||||
|
@ -1713,6 +1722,10 @@ static int write_init_super1(struct supertype *st)
|
|||
sb->devflags |= WriteMostly1;
|
||||
else
|
||||
sb->devflags &= ~WriteMostly1;
|
||||
if (di->disk.state & (1<<MD_DISK_FAILFAST))
|
||||
sb->devflags |= FailFast1;
|
||||
else
|
||||
sb->devflags &= ~FailFast1;
|
||||
|
||||
random_uuid(sb->device_uuid);
|
||||
|
||||
|
|
Loading…
Reference in New Issue