imsm: add support for checkpointing via 'curr_migr_unit'

Unlike native md checkpointing, some data about the geometry and type of
the migration process is encoded in curr_migr_unit.  Provide logic to
convert between md/{resync_start|recovery_start} and imsm/curr_migr_unit.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Dan Williams <dan.j.williams@intel.com>, 2009-12-21 17:54:32 -07:00
commit 1e5c69836d, parent 2904b26f05
4 changed files with 320 additions and 34 deletions
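In rough terms the conversion runs both ways (editor's sketch built from
the helpers this patch adds; blocks_per_migr_unit() returns 0 for
migrations that cannot be checkpointed):

	/* metadata -> md: resume offset in per-member sectors */
	info->resync_start = __le32_to_cpu(dev->vol.curr_migr_unit) *
			     blocks_per_migr_unit(dev);

	/* md -> metadata: progress in whole migration units */
	dev->vol.curr_migr_unit =
		__cpu_to_le32(a->info.resync_start / blocks_per_migr_unit(dev));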

bitmap.c

@@ -20,8 +20,6 @@
#include "mdadm.h"
-#define min(a,b) (((a) < (b)) ? (a) : (b))
inline void sb_le_to_cpu(bitmap_super_t *sb)
{
sb->magic = __le32_to_cpu(sb->magic);

mdadm.h (17 changed lines)

@@ -129,6 +129,22 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#endif /* __KLIBC__ */
/*
* min()/max()/clamp() macros that also do
* strict type-checking.. See the
* "unnecessary" pointer comparison.
*/
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
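/* The "unnecessary" pointer comparison is the type check: &_min1 and
 * &_min2 only have compatible pointer types when x and y share a type,
 * so mixing types draws a compiler warning. Editor's example, not part
 * of the patch:
 *
 *	int a = 5;
 *	unsigned long b = 9;
 *	min(a, b);			warns: distinct pointer types
 *	min((unsigned long) a, b);	types agree: no warning
 */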
/* general information that might be extracted from a superblock */
struct mdinfo {
@@ -842,6 +858,7 @@ extern int assemble_container_content(struct supertype *st, int mdfd,
extern int add_disk(int mdfd, struct supertype *st,
struct mdinfo *sra, struct mdinfo *info);
extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
unsigned long long min_recovery_start(struct mdinfo *array);
extern char *human_size(long long bytes);
extern char *human_size_brief(long long bytes);

super-intel.c

@@ -635,6 +635,8 @@ static int is_failed(struct imsm_disk *disk)
}
#ifndef MDASSEMBLE
static __u64 blocks_per_migr_unit(struct imsm_dev *dev);
static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
{
__u64 sz;
@@ -690,7 +692,11 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx)
printf(" Map State : %s", map_state_str[map->map_state]);
if (dev->vol.migr_state) {
struct imsm_map *map = get_imsm_map(dev, 1);
printf(" <-- %s", map_state_str[map->map_state]);
printf("\n Checkpoint : %u (%llu)",
__le32_to_cpu(dev->vol.curr_migr_unit),
blocks_per_migr_unit(dev));
}
printf("\n");
printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
@@ -1216,6 +1222,179 @@ get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
}
#endif
static __u32 migr_strip_blocks_resync(struct imsm_dev *dev)
{
/* migr_strip_size when repairing or initializing parity */
struct imsm_map *map = get_imsm_map(dev, 0);
__u32 chunk = __le32_to_cpu(map->blocks_per_strip);
switch (get_imsm_raid_level(map)) {
case 5:
case 10:
return chunk;
default:
return 128*1024 >> 9;
}
}
static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev)
{
/* migr_strip_size when rebuilding a degraded disk, no idea why
* this is different from migr_strip_blocks_resync(), but it's good
* to be compatible
*/
struct imsm_map *map = get_imsm_map(dev, 1);
__u32 chunk = __le32_to_cpu(map->blocks_per_strip);
switch (get_imsm_raid_level(map)) {
case 1:
case 10:
if (map->num_members % map->num_domains == 0)
return 128*1024 >> 9;
else
return chunk;
case 5:
return max((__u32) 64*1024 >> 9, chunk);
default:
return 128*1024 >> 9;
}
}
static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev)
{
struct imsm_map *lo = get_imsm_map(dev, 0);
struct imsm_map *hi = get_imsm_map(dev, 1);
__u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip);
__u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip);
return max((__u32) 1, hi_chunk / lo_chunk);
}
static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev)
{
struct imsm_map *lo = get_imsm_map(dev, 0);
int level = get_imsm_raid_level(lo);
if (level == 1 || level == 10) {
struct imsm_map *hi = get_imsm_map(dev, 1);
return hi->num_domains;
} else
return num_stripes_per_unit_resync(dev);
}
static __u8 imsm_num_data_members(struct imsm_dev *dev)
{
/* named 'imsm_' because raid0, raid1 and raid10
* counter-intuitively have the same number of data disks
*/
struct imsm_map *map = get_imsm_map(dev, 0);
switch (get_imsm_raid_level(map)) {
case 0:
case 1:
case 10:
return map->num_members;
case 5:
return map->num_members - 1;
default:
dprintf("%s: unsupported raid level\n", __func__);
return 0;
}
}
static __u32 parity_segment_depth(struct imsm_dev *dev)
{
struct imsm_map *map = get_imsm_map(dev, 0);
__u32 chunk = __le32_to_cpu(map->blocks_per_strip);
switch(get_imsm_raid_level(map)) {
case 1:
case 10:
return chunk * map->num_domains;
case 5:
return chunk * map->num_members;
default:
return chunk;
}
}
static __u32 map_migr_block(struct imsm_dev *dev, __u32 block)
{
struct imsm_map *map = get_imsm_map(dev, 1);
__u32 chunk = __le32_to_cpu(map->blocks_per_strip);
__u32 strip = block / chunk;
switch (get_imsm_raid_level(map)) {
case 1:
case 10: {
__u32 vol_strip = (strip * map->num_domains) + 1;
__u32 vol_stripe = vol_strip / map->num_members;
return vol_stripe * chunk + block % chunk;
}
case 5: {
__u32 stripe = strip / (map->num_members - 1);
return stripe * chunk + block % chunk;
}
default:
return 0;
}
}
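/* Worked example for the raid5 branch (editor's illustration with a
 * hypothetical geometry): with 3 members and 128-sector strips,
 * block 300 gives strip = 300 / 128 = 2, stripe = 2 / (3 - 1) = 1,
 * and a per-member offset of 1 * 128 + 300 % 128 = 172.
 */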
static __u64 blocks_per_migr_unit(struct imsm_dev *dev)
{
/* calculate the conversion factor between per member 'blocks'
* (md/{resync,rebuild}_start) and imsm migration units, return
* 0 for the 'not migrating' and 'unsupported migration' cases
*/
if (!dev->vol.migr_state)
return 0;
switch (migr_type(dev)) {
case MIGR_VERIFY:
case MIGR_REPAIR:
case MIGR_INIT: {
struct imsm_map *map = get_imsm_map(dev, 0);
__u32 stripes_per_unit;
__u32 blocks_per_unit;
__u32 parity_depth;
__u32 migr_chunk;
__u32 block_map;
__u32 block_rel;
__u32 segment;
__u32 stripe;
__u8 disks;
/* yes, this is really the translation of migr_units to
* per-member blocks in the 'resync' case
*/
stripes_per_unit = num_stripes_per_unit_resync(dev);
migr_chunk = migr_strip_blocks_resync(dev);
disks = imsm_num_data_members(dev);
blocks_per_unit = stripes_per_unit * migr_chunk * disks;
stripe = __le32_to_cpu(map->blocks_per_strip) * disks;
segment = blocks_per_unit / stripe;
block_rel = blocks_per_unit - segment * stripe;
parity_depth = parity_segment_depth(dev);
block_map = map_migr_block(dev, block_rel);
return block_map + parity_depth * segment;
}
case MIGR_REBUILD: {
__u32 stripes_per_unit;
__u32 migr_chunk;
stripes_per_unit = num_stripes_per_unit_rebuild(dev);
migr_chunk = migr_strip_blocks_rebuild(dev);
return migr_chunk * stripes_per_unit;
}
case MIGR_GEN_MIGR:
case MIGR_STATE_CHANGE:
default:
return 0;
}
}
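/* To make the conversion concrete (editor's illustration, factor
 * hypothetical): if blocks_per_migr_unit() evaluates to 2048, a
 * curr_migr_unit of 512 resumes md at resync_start = 512 * 2048 =
 * 1048576 sectors, and imsm_set_array_state() checkpoints a
 * resync_start of 1048576 back as 1048576 / 2048 = 512 units. The
 * round trip is exact when resync_start is unit-aligned; otherwise
 * the integer division rounds down and the partial unit is simply
 * replayed after a restart.
 */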
static int imsm_level_to_layout(int level)
{
switch (level) {
@@ -1265,12 +1444,33 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
info->component_size = __le32_to_cpu(map->blocks_per_member);
memset(info->uuid, 0, sizeof(info->uuid));
-if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty)
+if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) {
info->resync_start = 0;
-else if (dev->vol.migr_state)
-/* FIXME add curr_migr_unit to resync_start conversion */
-info->resync_start = 0;
-else
+} else if (dev->vol.migr_state) {
switch (migr_type(dev)) {
case MIGR_REPAIR:
case MIGR_INIT: {
__u64 blocks_per_unit = blocks_per_migr_unit(dev);
__u64 units = __le32_to_cpu(dev->vol.curr_migr_unit);
info->resync_start = blocks_per_unit * units;
break;
}
case MIGR_VERIFY:
/* we could emulate the checkpointing of
* 'sync_action=check' migrations, but for now
* we just immediately complete them
*/
case MIGR_REBUILD:
/* this is handled by container_content_imsm() */
case MIGR_GEN_MIGR:
case MIGR_STATE_CHANGE:
/* FIXME handle other migrations */
default:
/* we are not dirty, so... */
info->resync_start = MaxSector;
}
} else
info->resync_start = MaxSector;
strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
@@ -3782,6 +3982,46 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
}
#endif /* MDASSEMBLE */
static int is_rebuilding(struct imsm_dev *dev)
{
struct imsm_map *migr_map;
if (!dev->vol.migr_state)
return 0;
if (migr_type(dev) != MIGR_REBUILD)
return 0;
migr_map = get_imsm_map(dev, 1);
if (migr_map->map_state == IMSM_T_STATE_DEGRADED)
return 1;
else
return 0;
}
static void update_recovery_start(struct imsm_dev *dev, struct mdinfo *array)
{
struct mdinfo *rebuild = NULL;
struct mdinfo *d;
__u32 units;
if (!is_rebuilding(dev))
return;
/* Find the rebuild target, but punt on the dual rebuild case */
for (d = array->devs; d; d = d->next)
if (d->recovery_start == 0) {
if (rebuild)
return;
rebuild = d;
}
units = __le32_to_cpu(dev->vol.curr_migr_unit);
rebuild->recovery_start = units * blocks_per_migr_unit(dev);
}
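/* For instance (editor's illustration, hypothetical values): with
 * curr_migr_unit = 100 and blocks_per_migr_unit() = 512, the single
 * member found with recovery_start == 0 resumes at sector 51200;
 * every other member keeps the recovery_start it already had.
 */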
static struct mdinfo *container_content_imsm(struct supertype *st)
{
/* Given a container loaded by load_super_imsm_all,
@@ -3829,6 +4069,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
super->current_vol = i;
getinfo_super_imsm_volume(st, this);
for (slot = 0 ; slot < map->num_members; slot++) {
unsigned long long recovery_start;
struct mdinfo *info_d;
struct dl *d;
int idx;
@@ -3842,33 +4083,41 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
if (d->index == idx)
break;
+recovery_start = MaxSector;
if (d == NULL)
skip = 1;
if (d && is_failed(&d->disk))
skip = 1;
if (ord & IMSM_ORD_REBUILD)
-skip = 1;
+recovery_start = 0;
/*
* if we skip some disks the array will be assembled degraded;
-* reset resync start to avoid a dirty-degraded situation
+* reset resync start to avoid a dirty-degraded
+* situation when performing the initial sync
*
* FIXME handle dirty degraded
*/
-if (skip && !dev->vol.dirty)
+if ((skip || recovery_start == 0) && !dev->vol.dirty)
this->resync_start = MaxSector;
if (skip)
continue;
-info_d = malloc(sizeof(*info_d));
+info_d = calloc(1, sizeof(*info_d));
if (!info_d) {
fprintf(stderr, Name ": failed to allocate disk"
" for volume %.16s\n", dev->volume);
info_d = this->devs;
while (info_d) {
struct mdinfo *d = info_d->next;
free(info_d);
info_d = d;
}
free(this);
this = rest;
break;
}
-memset(info_d, 0, sizeof(*info_d));
info_d->next = this->devs;
this->devs = info_d;
@@ -3876,9 +4125,10 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
info_d->disk.major = d->major;
info_d->disk.minor = d->minor;
info_d->disk.raid_disk = slot;
-info_d->recovery_start = MaxSector;
+info_d->recovery_start = recovery_start;
-this->array.working_disks++;
+if (info_d->recovery_start == MaxSector)
+this->array.working_disks++;
info_d->events = __le32_to_cpu(mpb->generation_num);
info_d->data_offset = __le32_to_cpu(map->pba_of_lba0);
@@ -3886,6 +4136,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st)
if (d->devname)
strcpy(info_d->name, d->devname);
}
/* now that the disk list is up-to-date fixup recovery_start */
update_recovery_start(dev, this);
rest = this;
}
@@ -4028,24 +4280,6 @@ static int is_resyncing(struct imsm_dev *dev)
return 0;
}
-static int is_rebuilding(struct imsm_dev *dev)
-{
-struct imsm_map *migr_map;
-if (!dev->vol.migr_state)
-return 0;
-if (migr_type(dev) != MIGR_REBUILD)
-return 0;
-migr_map = get_imsm_map(dev, 1);
-if (migr_map->map_state == IMSM_T_STATE_DEGRADED)
-return 1;
-else
-return 0;
-}
/* return true if we recorded new information */
static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
{
@@ -4096,6 +4330,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
struct imsm_map *map = get_imsm_map(dev, 0);
int failed = imsm_count_failed(super, dev);
__u8 map_state = imsm_check_degraded(super, dev, failed);
__u32 blocks_per_unit;
/* before we activate this array handle any missing disks */
if (consistent == 2 && super->missing) {
@@ -4107,7 +4342,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
mark_missing(dev, &dl->disk, dl->index);
super->updates_pending++;
}
if (consistent == 2 &&
(!is_resync_complete(&a->info) ||
map_state != IMSM_T_STATE_NORMAL ||
@@ -4134,7 +4369,28 @@ static int imsm_set_array_state(struct active_array *a, int consistent)
super->updates_pending++;
}
-/* FIXME check if we can update curr_migr_unit from resync_start */
+/* check if we can update curr_migr_unit from resync_start, recovery_start */
blocks_per_unit = blocks_per_migr_unit(dev);
if (blocks_per_unit && failed <= 1) {
__u32 units32;
__u64 units;
if (migr_type(dev) == MIGR_REBUILD)
units = min_recovery_start(&a->info) / blocks_per_unit;
else
units = a->info.resync_start / blocks_per_unit;
units32 = units;
/* check that we did not overflow 32-bits, and that
* curr_migr_unit needs updating
*/
if (units32 == units &&
__le32_to_cpu(dev->vol.curr_migr_unit) != units32) {
dprintf("imsm: mark checkpoint (%u)\n", units32);
dev->vol.curr_migr_unit = __cpu_to_le32(units32);
super->updates_pending++;
}
}
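/* Illustration (not part of the patch): the units32 == units guard
 * matters because curr_migr_unit is a __u32. With a hypothetical
 * resync_start of 0x80000000000 sectors and blocks_per_unit of 2048,
 * units is 0x100000000 and units32 truncates to 0, so the compare
 * fails and the stale-but-safe checkpoint is left in place.
 */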
/* mark dirty / clean */
if (dev->vol.dirty != !consistent) {

util.c (15 changed lines)

@@ -1210,6 +1210,21 @@ int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
return rv;
}
unsigned long long min_recovery_start(struct mdinfo *array)
{
/* find the minimum recovery_start in an array for metadata
* formats that only record per-array recovery progress instead
* of per-device
*/
unsigned long long recovery_start = MaxSector;
struct mdinfo *d;
for (d = array->devs; d; d = d->next)
recovery_start = min(recovery_start, d->recovery_start);
return recovery_start;
}
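/* Usage sketch (editor's example, hypothetical values):
 *
 *	struct mdinfo d1 = { .recovery_start = MaxSector };
 *	struct mdinfo d0 = { .recovery_start = 2048, .next = &d1 };
 *	struct mdinfo array = { .devs = &d0 };
 *
 * min_recovery_start(&array) == 2048: the least-recovered member
 * governs the checkpoint written back to the metadata.
 */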
char *devnum2devname(int num)
{
char name[100];