Generic support for --consistency-policy and PPL

Add a new parameter to mdadm: --consistency-policy=. It determines how
the array maintains consistency in case of unexpected shutdown. This
maps to the md sysfs attribute 'consistency_policy'. It can be used to
create a raid5 array using PPL. Add the necessary plumbing to pass this
option to metadata handlers. The write journal and bitmap
functionalities are treated as different policies, which are implicitly
selected when using --write-journal or --bitmap options.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Jes Sorensen <Jes.Sorensen@gmail.com>
This commit is contained in:
Artur Paszkiewicz 2017-03-29 11:54:15 +02:00 committed by Jes Sorensen
parent b4decd517d
commit 5308f11727
14 changed files with 190 additions and 58 deletions

View File

@ -259,7 +259,8 @@ int Create(struct supertype *st, char *mddev,
if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
&s->chunk, s->size*2,
data_offset, NULL,
&newsize, c->verbose>=0))
&newsize, s->consistency_policy,
c->verbose>=0))
return 1;
if (s->chunk && s->chunk != UnSet) {
@ -358,7 +359,8 @@ int Create(struct supertype *st, char *mddev,
st, s->level, s->layout, s->raiddisks,
&s->chunk, s->size*2,
dv->data_offset, dname,
&freesize, c->verbose > 0)) {
&freesize, s->consistency_policy,
c->verbose > 0)) {
case -1: /* Not valid, message printed, and not
* worth checking any further */
exit(2);
@ -395,6 +397,7 @@ int Create(struct supertype *st, char *mddev,
&s->chunk, s->size*2,
dv->data_offset,
dname, &freesize,
s->consistency_policy,
c->verbose >= 0)) {
pr_err("%s is not suitable for this array.\n",
@ -501,7 +504,8 @@ int Create(struct supertype *st, char *mddev,
s->raiddisks,
&s->chunk, minsize*2,
data_offset,
NULL, NULL, 0)) {
NULL, NULL,
s->consistency_policy, 0)) {
pr_err("devices too large for RAID level %d\n", s->level);
return 1;
}
@ -528,6 +532,12 @@ int Create(struct supertype *st, char *mddev,
if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
s->bitmap_file = NULL;
if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
!st->ss->write_init_ppl) {
pr_err("%s metadata does not support PPL\n", st->ss->name);
return 1;
}
if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
if (c->runstop != 1 || c->verbose >= 0)
pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
@ -720,7 +730,7 @@ int Create(struct supertype *st, char *mddev,
name += 2;
}
}
if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid,
if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
data_offset))
goto abort_locked;

2
Kill.c
View File

@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
rv = st->ss->load_super(st, fd, dev);
if (rv == 0 || (force && rv >= 2)) {
st->ss->free_super(st);
st->ss->init_super(st, NULL, 0, "", NULL, NULL,
st->ss->init_super(st, NULL, NULL, "", NULL, NULL,
INVALID_SECTORS);
if (st->ss->store_super(st, fd)) {
if (verbose >= 0)

View File

@ -78,11 +78,11 @@ char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n";
* found, it is started.
*/
char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:";
char short_bitmap_options[]=
"-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:";
"-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:";
char short_bitmap_auto_options[]=
"-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:";
"-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:";
struct option long_options[] = {
{"manage", 0, 0, ManageOpt},
@ -148,6 +148,7 @@ struct option long_options[] = {
{"nodes",1, 0, Nodes}, /* also for --assemble */
{"home-cluster",1, 0, ClusterName},
{"write-journal",1, 0, WriteJournal},
{"consistency-policy", 1, 0, 'k'},
/* For assemble */
{"uuid", 1, 0, 'u'},
@ -362,27 +363,29 @@ char Help_create[] =
" other levels.\n"
"\n"
" Options that are valid with --create (-C) are:\n"
" --bitmap= : Create a bitmap for the array with the given filename\n"
" : or an internal bitmap is 'internal' is given\n"
" --chunk= -c : chunk size in kibibytes\n"
" --rounding= : rounding factor for linear array (==chunk size)\n"
" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n"
" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n"
" --layout= : same as --parity, for RAID10: [fno]NN \n"
" --raid-devices= -n : number of active devices in array\n"
" --spare-devices= -x: number of spare (eXtra) devices in initial array\n"
" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
" --data-offset= : Space to leave between start of device and start\n"
" : of array data.\n"
" --force -f : Honour devices as listed on command line. Don't\n"
" : insert a missing drive for RAID5.\n"
" --run -R : insist of running the array even if not all\n"
" : devices are present or some look odd.\n"
" --readonly -o : start the array readonly - not supported yet.\n"
" --name= -N : Textual name for array - max 32 characters\n"
" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n"
" --delay= -d : bitmap update delay in seconds.\n"
" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
" --bitmap= -b : Create a bitmap for the array with the given filename\n"
" : or an internal bitmap if 'internal' is given\n"
" --chunk= -c : chunk size in kibibytes\n"
" --rounding= : rounding factor for linear array (==chunk size)\n"
" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n"
" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n"
" --layout= : same as --parity, for RAID10: [fno]NN \n"
" --raid-devices= -n : number of active devices in array\n"
" --spare-devices= -x : number of spare (eXtra) devices in initial array\n"
" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
" --data-offset= : Space to leave between start of device and start\n"
" : of array data.\n"
" --force -f : Honour devices as listed on command line. Don't\n"
" : insert a missing drive for RAID5.\n"
" --run -R : insist of running the array even if not all\n"
" : devices are present or some look odd.\n"
" --readonly -o : start the array readonly - not supported yet.\n"
" --name= -N : Textual name for array - max 32 characters\n"
" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n"
" --delay= -d : bitmap update delay in seconds.\n"
" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
" --consistency-policy= : Specify the policy that determines how the array\n"
" -k : maintains consistency in case of unexpected shutdown.\n"
"\n"
;

10
maps.c
View File

@ -129,6 +129,16 @@ mapping_t faultylayout[] = {
{ NULL, 0}
};
/* Name <-> value table for the CONSISTENCY_POLICY_* constants.
 * The names are the strings accepted by --consistency-policy= and the
 * ones compared against the md sysfs attribute 'consistency_policy'.
 * Used with map_name()/map_num(); the list ends with a { NULL, 0} entry.
 */
mapping_t consistency_policies[] = {
{ "unknown", CONSISTENCY_POLICY_UNKNOWN},
{ "none", CONSISTENCY_POLICY_NONE},
{ "resync", CONSISTENCY_POLICY_RESYNC},
{ "bitmap", CONSISTENCY_POLICY_BITMAP},
{ "journal", CONSISTENCY_POLICY_JOURNAL},
{ "ppl", CONSISTENCY_POLICY_PPL},
{ NULL, 0}
};
char *map_num(mapping_t *map, int num)
{
while (map->name) {

View File

@ -724,7 +724,9 @@ When creating an array on devices which are 100G or larger,
.I mdadm
automatically adds an internal bitmap as it will usually be
beneficial. This can be suppressed with
.B "\-\-bitmap=none".
.B "\-\-bitmap=none"
or by selecting a different consistency policy with
.BR \-\-consistency\-policy .
.TP
.BR \-\-bitmap\-chunk=
@ -1020,6 +1022,36 @@ should be a SSD with reasonable lifetime.
Auto creation of symlinks in /dev to /dev/md. The --symlinks option must
be 'no' or 'yes' and works with --create and --build.
.TP
.BR \-k ", " \-\-consistency\-policy=
Specify how the array maintains consistency in case of unexpected shutdown.
Only relevant for RAID levels with redundancy.
Currently supported options are:
.RS
.TP
.B resync
Full resync is performed and all redundancy is regenerated when the array is
started after an unclean shutdown.
.TP
.B bitmap
Resync assisted by a write-intent bitmap. Implicitly selected when using
.BR \-\-bitmap .
.TP
.B journal
For RAID levels 4/5/6, a journal device is used to log transactions and replay
them after an unclean shutdown. Implicitly selected when using
.BR \-\-write\-journal .
.TP
.B ppl
For RAID5 only, Partial Parity Log is used to close the write hole and
eliminate resync. PPL is stored in the metadata region of RAID member drives, so
no additional journal drive is needed.
.RE
.SH For assemble:
in the array exceed 100G in size, an internal write-intent bitmap
will automatically be added unless some other option is explicitly
requested with the
.B \-\-bitmap
option. In any case space for a bitmap will be reserved so that one
can be added layer with
option or a different consistency policy is selected with the
.B \-\-consistency\-policy
option. In any case space for a bitmap will be reserved so that one
can be added later with
.BR "\-\-grow \-\-bitmap=internal" .
If the metadata type supports it (currently only 1.x metadata), space

55
mdadm.c
View File

@ -78,6 +78,7 @@ int main(int argc, char *argv[])
.level = UnSet,
.layout = UnSet,
.bitmap_chunk = UnSet,
.consistency_policy = UnSet,
};
char sys_hostname[256];
@ -1215,6 +1216,16 @@ int main(int argc, char *argv[])
s.journaldisks = 1;
continue;
case O(CREATE, 'k'):
s.consistency_policy = map_name(consistency_policies,
optarg);
if (s.consistency_policy == UnSet ||
s.consistency_policy < CONSISTENCY_POLICY_RESYNC) {
pr_err("Invalid consistency policy: %s\n",
optarg);
exit(2);
}
continue;
}
/* We have now processed all the valid options. Anything else is
* an error
@ -1242,9 +1253,47 @@ int main(int argc, char *argv[])
exit(0);
}
if (s.journaldisks && (s.level < 4 || s.level > 6)) {
pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
exit(2);
if (s.journaldisks) {
if (s.level < 4 || s.level > 6) {
pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
exit(2);
}
if (s.consistency_policy != UnSet &&
s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
pr_err("--write-journal is not supported with consistency policy: %s\n",
map_num(consistency_policies, s.consistency_policy));
exit(2);
}
}
if (mode == CREATE && s.consistency_policy != UnSet) {
if (s.level <= 0) {
pr_err("--consistency-policy not meaningful with level %s.\n",
map_num(pers, s.level));
exit(2);
} else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL &&
!s.journaldisks) {
pr_err("--write-journal is required for consistency policy: %s\n",
map_num(consistency_policies, s.consistency_policy));
exit(2);
} else if (s.consistency_policy == CONSISTENCY_POLICY_PPL &&
s.level != 5) {
pr_err("PPL consistency policy is only supported for RAID level 5.\n");
exit(2);
} else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
(!s.bitmap_file ||
strcmp(s.bitmap_file, "none") == 0)) {
pr_err("--bitmap is required for consistency policy: %s\n",
map_num(consistency_policies, s.consistency_policy));
exit(2);
} else if (s.bitmap_file &&
strcmp(s.bitmap_file, "none") != 0 &&
s.consistency_policy != CONSISTENCY_POLICY_BITMAP &&
s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
pr_err("--bitmap is not compatible with consistency policy: %s\n",
map_num(consistency_policies, s.consistency_policy));
exit(2);
}
}
if (!mode && devs_found) {

21
mdadm.h
View File

@ -279,6 +279,15 @@ struct mdinfo {
int journal_device_required;
int journal_clean;
/* How the array maintains consistency after an unexpected shutdown.
 * The order matters: command-line validation rejects any value below
 * CONSISTENCY_POLICY_RESYNC, so UNKNOWN and NONE must stay first.
 */
enum {
CONSISTENCY_POLICY_UNKNOWN, /* sysfs attribute unreadable or not recognised */
CONSISTENCY_POLICY_NONE, /* NOTE(review): presumably for arrays without redundancy - confirm */
CONSISTENCY_POLICY_RESYNC, /* full resync after unclean shutdown */
CONSISTENCY_POLICY_BITMAP, /* resync assisted by a write-intent bitmap */
CONSISTENCY_POLICY_JOURNAL, /* write journal device, RAID levels 4/5/6 */
CONSISTENCY_POLICY_PPL, /* Partial Parity Log, RAID5 only */
} consistency_policy;
/* During reshape we can sometimes change the data_offset to avoid
* over-writing still-valid data. We need to know if there is space.
* So getinfo_super will fill in space_before and space_after in sectors.
@ -426,6 +435,7 @@ enum special_options {
ClusterName,
ClusterConfirm,
WriteJournal,
ConsistencyPolicy,
};
enum prefix_standard {
@ -527,6 +537,7 @@ struct shape {
int assume_clean;
int write_behind;
unsigned long long size;
int consistency_policy;
};
/* List of device names - wildcards expanded */
@ -618,6 +629,7 @@ enum sysfs_read_flags {
GET_STATE = (1 << 23),
GET_ERROR = (1 << 24),
GET_ARRAY_STATE = (1 << 25),
GET_CONSISTENCY_POLICY = (1 << 26),
};
/* If fd >= 0, get the array it is open on,
@ -701,7 +713,7 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
extern char *map_num(mapping_t *map, int num);
extern int map_name(mapping_t *map, char *name);
extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], consistency_policies[];
extern char *map_dev_preferred(int major, int minor, int create,
char *prefer);
@ -863,7 +875,7 @@ extern struct superswitch {
* metadata.
*/
int (*init_super)(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name,
struct shape *s, char *name,
char *homehost, int *uuid,
unsigned long long data_offset);
@ -961,7 +973,7 @@ extern struct superswitch {
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
int verbose);
int consistency_policy, int verbose);
/* Return a linked list of 'mdinfo' structures for all arrays
* in the container. For non-containers, it is like
@ -1059,6 +1071,9 @@ extern struct superswitch {
/* validate container after assemble */
int (*validate_container)(struct mdinfo *info);
/* write initial empty PPL on device */
int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd);
/* records new bad block in metadata */
int (*record_bad_block)(struct active_array *a, int n,
unsigned long long sector, int length);

View File

@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(const struct ddf_super *ddf,
static int init_super_ddf(struct supertype *st,
mdu_array_info_t *info,
unsigned long long size, char *name, char *homehost,
struct shape *s, char *name, char *homehost,
int *uuid, unsigned long long data_offset)
{
/* This is primarily called by Create when creating a new array.
@ -2328,7 +2328,7 @@ static int init_super_ddf(struct supertype *st,
struct virtual_disk *vd;
if (st->sb)
return init_super_ddf_bvd(st, info, size, name, homehost, uuid,
return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid,
data_offset);
if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct supertype *st,
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *dev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
int fd;
struct mdinfo *sra;

View File

@ -205,7 +205,7 @@ static int validate_geometry(struct supertype *st, int level,
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
pr_err("gpt metadata cannot be used this way\n");
return 0;

View File

@ -5155,7 +5155,7 @@ static int check_name(struct intel_super *super, char *name, int quiet)
}
static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name,
struct shape *s, char *name,
char *homehost, int *uuid,
long long data_offset)
{
@ -5250,7 +5250,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
array_blocks = calc_array_size(info->level, info->raid_disks,
info->layout, info->chunk_size,
size * 2);
s->size * 2);
/* round array size down to closest MB */
array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
@ -5264,7 +5264,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
vol->curr_migr_unit = 0;
map = get_imsm_map(dev, MAP_0);
set_pba_of_lba0(map, super->create_offset);
set_blocks_per_member(map, info_to_blocks_per_member(info, size));
set_blocks_per_member(map, info_to_blocks_per_member(info, s->size));
map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
map->failed_disk_num = ~0;
if (info->level > 0)
@ -5292,7 +5292,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
map->num_domains = 1;
/* info->size is only int so use the caller-supplied size (s->size) instead */
num_data_stripes = (size * 2) / info_to_blocks_per_strip(info);
num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info);
num_data_stripes /= map->num_domains;
set_num_data_stripes(map, num_data_stripes);
@ -5314,7 +5314,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
}
static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name,
struct shape *s, char *name,
char *homehost, int *uuid,
unsigned long long data_offset)
{
@ -5337,7 +5337,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
}
if (st->sb)
return init_super_imsm_volume(st, info, size, name, homehost, uuid,
return init_super_imsm_volume(st, info, s, name, homehost, uuid,
data_offset);
if (info)
@ -6914,7 +6914,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
int raiddisks, int *chunk, unsigned long long size,
unsigned long long data_offset,
char *dev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
int fd, cfd;
struct mdinfo *sra;
@ -10953,7 +10953,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
geo->raid_disks + devNumChange,
&chunk,
geo->size, INVALID_SECTORS,
0, 0, 1))
0, 0, info.consistency_policy, 1))
change = -1;
if (check_devs) {

View File

@ -193,7 +193,7 @@ static int validate_geometry(struct supertype *st, int level,
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
pr_err("mbr metadata cannot be used this way\n");
return 0;

View File

@ -725,7 +725,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
* We use the first 8 bytes (64bits) of the sha1 of the host name
*/
static int init_super0(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *ignored_name,
struct shape *s, char *ignored_name,
char *homehost, int *uuid,
unsigned long long data_offset)
{
@ -764,8 +764,8 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
sb->gvalid_words = 0; /* ignored */
sb->ctime = time(0);
sb->level = info->level;
sb->size = size;
if (size != (unsigned long long)sb->size)
sb->size = s->size;
if (s->size != (unsigned long long)sb->size)
return 0;
sb->nr_disks = info->nr_disks;
sb->raid_disks = info->raid_disks;
@ -1267,7 +1267,7 @@ static int validate_geometry0(struct supertype *st, int level,
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
unsigned long long ldsize;
int fd;

View File

@ -1397,7 +1397,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
}
static int init_super1(struct supertype *st, mdu_array_info_t *info,
unsigned long long size, char *name, char *homehost,
struct shape *s, char *name, char *homehost,
int *uuid, unsigned long long data_offset)
{
struct mdp_superblock_1 *sb;
@ -1450,7 +1450,7 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
sb->ctime = __cpu_to_le64((unsigned long long)time(0));
sb->level = __cpu_to_le32(info->level);
sb->layout = __cpu_to_le32(info->layout);
sb->size = __cpu_to_le64(size*2ULL);
sb->size = __cpu_to_le64(s->size*2ULL);
sb->chunksize = __cpu_to_le32(info->chunk_size>>9);
sb->raid_disks = __cpu_to_le32(info->raid_disks);
@ -2487,7 +2487,7 @@ static int validate_geometry1(struct supertype *st, int level,
int *chunk, unsigned long long size,
unsigned long long data_offset,
char *subdev, unsigned long long *freesize,
int verbose)
int consistency_policy, int verbose)
{
unsigned long long ldsize, devsize;
int bmspace;

11
sysfs.c
View File

@ -242,6 +242,17 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
} else
sra->sysfs_array_state[0] = 0;
if (options & GET_CONSISTENCY_POLICY) {
strcpy(base, "consistency_policy");
if (load_sys(fname, buf, sizeof(buf))) {
sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN;
} else {
sra->consistency_policy = map_name(consistency_policies, buf);
if (sra->consistency_policy == UnSet)
sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN;
}
}
if (! (options & GET_DEVS))
return sra;