From a7dd165b4ea5c3db8dd05f4594e8dea783296ae3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 1 Dec 2009 16:04:06 -0700 Subject: [PATCH 01/13] imsm: catch attempt to auto-layout zero-length arrays When -z is omitted reserve_space() looks to satisfy a zero length allocation which lo and behold is equal to the amount of free space on a full disk. So, catch maxsize == 0 and simplify the return value from merge_extents() to always equal amount of free space (no benefit to having a special case ~0ULL == error). Signed-off-by: Dan Williams --- super-intel.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/super-intel.c b/super-intel.c index 2e119f8..6fe5e0d 100644 --- a/super-intel.c +++ b/super-intel.c @@ -3333,7 +3333,7 @@ static unsigned long long merge_extents(struct intel_super *super, int sum_exten unsigned long reserve; if (!e) - return ~0ULL; /* error */ + return 0; /* coalesce and sort all extents. also, check to see if we need to * reserve space between member arrays @@ -3376,17 +3376,23 @@ static unsigned long long merge_extents(struct intel_super *super, int sum_exten } while (e[i-1].size); free(e); + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ if (start_extent > 0) reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ else reserve = 0; if (maxsize < reserve) - return ~0ULL; + return 0; super->create_offset = ~((__u32) 0); if (start + reserve > super->create_offset) - return ~0ULL; /* start overflows create_offset */ + return 0; /* start overflows create_offset */ super->create_offset = start + reserve; return maxsize - reserve; @@ -3569,15 +3575,11 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, i += dl->extent_cnt; maxsize = merge_extents(super, i); - if (maxsize < size) { + if (maxsize < size || maxsize == 0) { if (verbose) fprintf(stderr, Name ": not enough space after merge (%llu < %llu)\n", maxsize, 
size); return 0; - } else if (maxsize == ~0ULL) { - if (verbose) - fprintf(stderr, Name ": failed to merge %d extents\n", i); - return 0; } *freesize = maxsize; @@ -3634,7 +3636,8 @@ static int reserve_space(struct supertype *st, int raiddisks, if (cnt < raiddisks || (super->orom && used && used != raiddisks) || - maxsize < minsize) { + maxsize < minsize || + maxsize == 0) { fprintf(stderr, Name ": not enough devices with space to create array.\n"); return 0; /* No enough free spaces large enough */ } From 6592ce37ee6f32b4886dfca33811c5a0403ed1b6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2009 15:03:31 -0700 Subject: [PATCH 02/13] imsm: honor orom constraints for auto-layout Factor out the orom checking bits to validate_geometry_imsm_orom() and share it between validate_geometry_imsm_volume() and the entry path to reserve_space(). Signed-off-by: Dan Williams --- super-intel.c | 52 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/super-intel.c b/super-intel.c index 6fe5e0d..4372ab4 100644 --- a/super-intel.c +++ b/super-intel.c @@ -3421,6 +3421,34 @@ static int is_raid_level_supported(const struct imsm_orom *orom, int level, int } #define pr_vrb(fmt, arg...) (void) (verbose && fprintf(stderr, Name fmt, ##arg)) +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int chunk, int verbose) +{ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb(": platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? 
"s" : ""); + return 0; + } + if (super->orom && level != 1 && + !imsm_orom_has_chunk(super->orom, chunk)) { + pr_vrb(": platform does not support a chunk size of: %d\n", chunk); + return 0; + } + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb(": imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb(": imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + return 1; +} + /* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd * FIX ME add ahci details */ @@ -3443,26 +3471,8 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, if (!super) return 0; - if (!is_raid_level_supported(super->orom, level, raiddisks)) { - pr_vrb(": platform does not support raid%d with %d disk%s\n", - level, raiddisks, raiddisks > 1 ? "s" : ""); + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, verbose)) return 0; - } - if (super->orom && level != 1 && - !imsm_orom_has_chunk(super->orom, chunk)) { - pr_vrb(": platform does not support a chunk size of: %d\n", chunk); - return 0; - } - if (layout != imsm_level_to_layout(level)) { - if (level == 5) - pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); - else if (level == 10) - pr_vrb(": imsm raid 10 only supports the n2 layout\n"); - else - pr_vrb(": imsm unknown layout %#x for this raid level %d\n", - layout, level); - return 0; - } if (!dev) { /* General test: make sure there is space for @@ -3689,6 +3699,10 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, * created. add_to_super and getinfo_super * detect when autolayout is in progress. 
*/ + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, + verbose)) + return 0; return reserve_space(st, raiddisks, size, chunk, freesize); } return 1; From ac6449bee9b40b0e8131005dc0cbe2d91a070ae2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Dec 2009 15:03:34 -0700 Subject: [PATCH 03/13] imsm: fix spare promotion When associating a spare take on the target's metadata version number to satisfy future compare_super checks. Signed-off-by: Dan Williams --- super-intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/super-intel.c b/super-intel.c index 4372ab4..39a2985 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1566,6 +1566,7 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) first->anchor->num_raid_devs = sec->anchor->num_raid_devs; first->anchor->orig_family_num = sec->anchor->orig_family_num; first->anchor->family_num = sec->anchor->family_num; + memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH); for (i = 0; i < sec->anchor->num_raid_devs; i++) imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i)); } From ecf408e9149dbd6ef61fdcbc92c5816c0eb7fc84 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2009 13:57:25 -0700 Subject: [PATCH 04/13] imsm: fix thunderdome segfault disk_list_get() can return NULL if: 1/ A formerly missing disk is re-added 2/ The original array has not been rebuilt, so the family number of the missing disk still matches 3/ The metadata record of the in-sync disks are read before the missing disk This will result in the missing disk not adding its own serial number to the disk_list, only its truncated value will be present. 
Signed-off-by: Dan Williams --- super-intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/super-intel.c b/super-intel.c index 39a2985..2d5796f 100644 --- a/super-intel.c +++ b/super-intel.c @@ -2429,7 +2429,7 @@ imsm_thunderdome(struct intel_super **super_list, int len) struct intel_disk *idisk; idisk = disk_list_get(dl->serial, disk_list); - if (is_spare(&idisk->disk) && + if (idisk && is_spare(&idisk->disk) && !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) dl->index = -1; else { From 1f0769d76801f928400f5626c6581873adaf7002 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2009 13:57:28 -0700 Subject: [PATCH 05/13] util: fix devnum2devname for devnum == 0 devnum 0 is md0, not md_d-1 Signed-off-by: Dan Williams --- util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util.c b/util.c index a0e4bcf..d49a4ed 100644 --- a/util.c +++ b/util.c @@ -1210,7 +1210,7 @@ int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) char *devnum2devname(int num) { char name[100]; - if (num > 0) + if (num >= 0) sprintf(name, "md%d", num); else sprintf(name, "md_d%d", -1-num); From 8655a7b19477c22c4721ff6c35e0f6dfc5fa403b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2009 13:57:28 -0700 Subject: [PATCH 06/13] imsm: cleanup print_imsm_dev() When printing the migration state there is no need to print "migrating". The fact that the state is non-idle should be enough indication. Signed-off-by: Dan Williams --- super-intel.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/super-intel.c b/super-intel.c index 2d5796f..ab8172d 100644 --- a/super-intel.c +++ b/super-intel.c @@ -669,23 +669,24 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) printf(" Chunk Size : %u KiB\n", __le16_to_cpu(map->blocks_per_strip) / 2); printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); - printf(" Migrate State : %s", dev->vol.migr_state ? 
"migrating" : "idle\n"); + printf(" Migrate State : "); if (dev->vol.migr_state) { if (migr_type(dev) == MIGR_INIT) - printf(": initializing\n"); + printf("initialize\n"); else if (migr_type(dev) == MIGR_REBUILD) - printf(": rebuilding\n"); + printf("rebuild\n"); else if (migr_type(dev) == MIGR_VERIFY) - printf(": check\n"); + printf("check\n"); else if (migr_type(dev) == MIGR_GEN_MIGR) - printf(": general migration\n"); + printf("general migration\n"); else if (migr_type(dev) == MIGR_STATE_CHANGE) - printf(": state change\n"); + printf("state change\n"); else if (migr_type(dev) == MIGR_REPAIR) - printf(": repair\n"); + printf("repair\n"); else - printf(": <unknown:%d>\n", migr_type(dev)); + printf("<unknown:%d>\n", migr_type(dev)); - } + } else + printf("idle\n"); printf(" Map State : %s", map_state_str[map->map_state]); if (dev->vol.migr_state) { struct imsm_map *map = get_imsm_map(dev, 1); From 071cfc42580673b149140339a862f90399dc00b5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 12 Dec 2009 14:10:01 -0700 Subject: [PATCH 07/13] mdmon: cleanup manage_member() leak free() the results of activate_spare(). 
Signed-off-by: Dan Williams --- managemon.c | 81 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/managemon.c b/managemon.c index 5958e18..19effe4 100644 --- a/managemon.c +++ b/managemon.c @@ -209,16 +209,22 @@ struct metadata_update *update_queue = NULL; struct metadata_update *update_queue_handled = NULL; struct metadata_update *update_queue_pending = NULL; -void check_update_queue(struct supertype *container) +static void free_updates(struct metadata_update **update) { - while (update_queue_handled) { - struct metadata_update *this = update_queue_handled; - update_queue_handled = this->next; + while (*update) { + struct metadata_update *this = *update; + + *update = this->next; free(this->buf); - if (this->space) - free(this->space); + free(this->space); free(this); } +} + +void check_update_queue(struct supertype *container) +{ + free_updates(&update_queue_handled); + if (update_queue == NULL && update_queue_pending) { update_queue = update_queue_pending; @@ -376,8 +382,9 @@ static void manage_member(struct mdstat_ent *mdstat, if (a->check_degraded) { struct metadata_update *updates = NULL; - struct mdinfo *newdev; + struct mdinfo *newdev = NULL; struct active_array *newa; + struct mdinfo *d; a->check_degraded = 0; @@ -385,34 +392,46 @@ static void manage_member(struct mdstat_ent *mdstat, * to check. */ newdev = a->container->ss->activate_spare(a, &updates); - if (newdev) { - struct mdinfo *d; - /* Cool, we can add a device or several. */ - newa = duplicate_aa(a); - /* suspend recovery - maybe not needed */ + if (!newdev) + return; - /* Add device to array and set offset/size/slot. 
- * and open files for each newdev */ - for (d = newdev; d ; d = d->next) { - struct mdinfo *newd; - if (sysfs_add_disk(&newa->info, d, 0) < 0) - continue; - newd = malloc(sizeof(*newd)); - *newd = *d; - newd->next = newa->info.devs; - newa->info.devs = newd; + newa = duplicate_aa(a); + if (!newa) + goto out; + /* Cool, we can add a device or several. */ - newd->state_fd = sysfs_open(a->devnum, - newd->sys_name, - "state"); - newd->prev_state - = read_dev_state(newd->state_fd); - newd->curr_state = newd->prev_state; + /* Add device to array and set offset/size/slot. + * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + + newd = malloc(sizeof(*newd)); + if (!newd) + continue; + if (sysfs_add_disk(&newa->info, d, 0) < 0) { + free(newd); + continue; } - queue_metadata_update(updates); - replace_array(a->container, a, newa); - sysfs_set_str(&a->info, NULL, "sync_action", "recover"); + *newd = *d; + newd->next = newa->info.devs; + newa->info.devs = newd; + + newd->state_fd = sysfs_open(a->devnum, newd->sys_name, + "state"); + newd->prev_state = read_dev_state(newd->state_fd); + newd->curr_state = newd->prev_state; } + queue_metadata_update(updates); + updates = NULL; + replace_array(a->container, a, newa); + sysfs_set_str(&a->info, NULL, "sync_action", "recover"); + out: + while (newdev) { + d = newdev->next; + free(newdev); + newdev = d; + } + free_updates(&updates); } } From b7941fd68de046be58de4d53fe08925fdb9008f0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 14 Dec 2009 12:57:55 -0700 Subject: [PATCH 08/13] mdmon: cleanup resync_start We don't need to sprinkle reads of this attribute all over the place, just once at the entry of read_and_act(). Also, the mdinfo structure for the array already has a 'resync_start' member, so just reuse that. Finally, rename get_resync_start() to read_resync_start to make it consistent with the other sysfs accessors in monitor.c. 
Signed-off-by: Dan Williams --- managemon.c | 1 - mdmon.h | 7 ++----- monitor.c | 19 ++++++------------- super-ddf.c | 8 ++++---- super-intel.c | 9 ++++----- 5 files changed, 16 insertions(+), 28 deletions(-) diff --git a/managemon.c b/managemon.c index 19effe4..e77f045 100644 --- a/managemon.c +++ b/managemon.c @@ -541,7 +541,6 @@ static void manage_new(struct mdstat_ent *mdstat, new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state"); new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start"); new->metadata_fd = sysfs_open(new->devnum, NULL, "metadata_version"); - get_resync_start(new); dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst), new->action_fd, new->info.state_fd); diff --git a/mdmon.h b/mdmon.h index 7cfee35..4494085 100644 --- a/mdmon.h +++ b/mdmon.h @@ -39,8 +39,6 @@ struct active_array { int check_degraded; /* flag set by mon, read by manage */ int devnum; - - unsigned long long resync_start; }; /* @@ -73,7 +71,6 @@ extern int socket_hup_requested; extern int sigterm; int read_dev_state(int fd); -int get_resync_start(struct active_array *a); int is_container_member(struct mdstat_ent *mdstat, char *container); struct mdstat_ent *mdstat_read(int hold, int start); @@ -85,9 +82,9 @@ extern int monitor_loop_cnt; /* helper routine to determine resync completion since MaxSector is a * moving target */ -static inline int is_resync_complete(struct active_array *a) +static inline int is_resync_complete(struct mdinfo *array) { - if (a->resync_start >= a->info.component_size) + if (array->resync_start >= array->component_size) return 1; return 0; } diff --git a/monitor.c b/monitor.c index 0cafc3a..a8e0af3 100644 --- a/monitor.c +++ b/monitor.c @@ -66,23 +66,20 @@ static int read_attr(char *buf, int len, int fd) return n; } -int get_resync_start(struct active_array *a) +static unsigned long long read_resync_start(int fd) { char buf[30]; int n; - n = read_attr(buf, 30, a->resync_start_fd); + n = read_attr(buf, 30, fd); if (n <= 
0) - return n; + return 0; if (strncmp(buf, "none", 4) == 0) - a->resync_start = ~0ULL; + return ~0ULL; else - a->resync_start = strtoull(buf, NULL, 10); - - return 1; + return strtoull(buf, NULL, 10); } - static enum array_state read_state(int fd) { char buf[20]; @@ -208,6 +205,7 @@ static int read_and_act(struct active_array *a) a->curr_state = read_state(a->info.state_fd); a->curr_action = read_action(a->action_fd); + a->info.resync_start = read_resync_start(a->resync_start_fd); for (mdi = a->info.devs; mdi ; mdi = mdi->next) { mdi->next_state = 0; if (mdi->state_fd >= 0) @@ -217,13 +215,11 @@ static int read_and_act(struct active_array *a) if (a->curr_state <= inactive && a->prev_state > inactive) { /* array has been stopped */ - get_resync_start(a); a->container->ss->set_array_state(a, 1); a->next_state = clear; deactivate = 1; } if (a->curr_state == write_pending) { - get_resync_start(a); a->container->ss->set_array_state(a, 0); a->next_state = active; dirty = 1; @@ -236,7 +232,6 @@ static int read_and_act(struct active_array *a) dirty = 1; } if (a->curr_state == clean) { - get_resync_start(a); a->container->ss->set_array_state(a, 1); } if (a->curr_state == active || @@ -253,7 +248,6 @@ static int read_and_act(struct active_array *a) /* explicit request for readonly array. Leave it alone */ ; } else { - get_resync_start(a); if (a->container->ss->set_array_state(a, 2)) a->next_state = read_auto; /* array is clean */ else { @@ -271,7 +265,6 @@ static int read_and_act(struct active_array *a) * until the array goes inactive or readonly though. * Just check if we need to fiddle spares. 
*/ - get_resync_start(a); a->container->ss->set_array_state(a, a->curr_state <= clean); check_degraded = 1; } diff --git a/super-ddf.c b/super-ddf.c index fe83642..f5eb816 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -3066,7 +3066,7 @@ static int ddf_set_array_state(struct active_array *a, int consistent) if (consistent == 2) { /* Should check if a recovery should be started FIXME */ consistent = 1; - if (!is_resync_complete(a)) + if (!is_resync_complete(&a->info)) consistent = 0; } if (consistent) @@ -3078,9 +3078,9 @@ static int ddf_set_array_state(struct active_array *a, int consistent) old = ddf->virt->entries[inst].init_state; ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; - if (is_resync_complete(a)) + if (is_resync_complete(&a->info)) ddf->virt->entries[inst].init_state |= DDF_init_full; - else if (a->resync_start == 0) + else if (a->info.resync_start == 0) ddf->virt->entries[inst].init_state |= DDF_init_not; else ddf->virt->entries[inst].init_state |= DDF_init_quick; @@ -3088,7 +3088,7 @@ static int ddf_set_array_state(struct active_array *a, int consistent) ddf->updates_pending = 1; dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty", - a->resync_start); + a->info.resync_start); return consistent; } diff --git a/super-intel.c b/super-intel.c index ab8172d..4072fc8 100644 --- a/super-intel.c +++ b/super-intel.c @@ -4108,12 +4108,12 @@ static int imsm_set_array_state(struct active_array *a, int consistent) } if (consistent == 2 && - (!is_resync_complete(a) || + (!is_resync_complete(&a->info) || map_state != IMSM_T_STATE_NORMAL || dev->vol.migr_state)) consistent = 0; - if (is_resync_complete(a)) { + if (is_resync_complete(&a->info)) { /* complete intialization / resync, * recovery and interrupted recovery is completed in * ->set_disk @@ -4125,7 +4125,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent) } } else if (!is_resyncing(dev) && !failed) { /* mark the start of the init process if nothing is failed 
*/ - dprintf("imsm: mark resync start (%llu)\n", a->resync_start); + dprintf("imsm: mark resync start\n"); if (map->map_state == IMSM_T_STATE_UNINITIALIZED) migrate(dev, IMSM_T_STATE_NORMAL, MIGR_INIT); else @@ -4137,8 +4137,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent) /* mark dirty / clean */ if (dev->vol.dirty != !consistent) { - dprintf("imsm: mark '%s' (%llu)\n", - consistent ? "clean" : "dirty", a->resync_start); + dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty"); if (consistent) dev->vol.dirty = 0; else From e1516be1db121e6f462b41a739106e33461a733a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 21 Dec 2009 10:06:14 -0700 Subject: [PATCH 09/13] Add scaffolding for handling md/dev-XXX/recovery_start Prepare the code to handle saving a recovery checkpoint. Signed-off-by: Dan Williams --- managemon.c | 7 ++++++- mdadm.h | 6 +++++- monitor.c | 4 +++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/managemon.c b/managemon.c index e77f045..e335077 100644 --- a/managemon.c +++ b/managemon.c @@ -112,8 +112,10 @@ static void close_aa(struct active_array *aa) { struct mdinfo *d; - for (d = aa->info.devs; d; d = d->next) + for (d = aa->info.devs; d; d = d->next) { + close(d->recovery_fd); close(d->state_fd); + } close(aa->action_fd); close(aa->info.state_fd); @@ -517,6 +519,9 @@ static void manage_new(struct mdstat_ent *mdstat, newd->state_fd = sysfs_open(new->devnum, newd->sys_name, "state"); + newd->recovery_fd = sysfs_open(new->devnum, + newd->sys_name, + "recovery_start"); newd->prev_state = read_dev_state(newd->state_fd); newd->curr_state = newd->prev_state; diff --git a/mdadm.h b/mdadm.h index c7f864b..7b75540 100644 --- a/mdadm.h +++ b/mdadm.h @@ -146,7 +146,10 @@ struct mdinfo { */ int reshape_active; unsigned long long reshape_progress; - unsigned long long resync_start; + union { + unsigned long long resync_start; /* per-array resync position */ + unsigned long long recovery_start; /* per-device 
rebuild position */ + }; unsigned long safe_mode_delay; /* ms delay to mark clean */ int new_level, delta_disks, new_layout, new_chunk; int errors; @@ -168,6 +171,7 @@ struct mdinfo { struct mdinfo *next; /* Device info for mdmon: */ + int recovery_fd; int state_fd; #define DS_FAULTY 1 #define DS_INSYNC 2 diff --git a/monitor.c b/monitor.c index a8e0af3..58752a8 100644 --- a/monitor.c +++ b/monitor.c @@ -208,8 +208,10 @@ static int read_and_act(struct active_array *a) a->info.resync_start = read_resync_start(a->resync_start_fd); for (mdi = a->info.devs; mdi ; mdi = mdi->next) { mdi->next_state = 0; - if (mdi->state_fd >= 0) + if (mdi->state_fd >= 0) { + mdi->recovery_start = read_resync_start(mdi->recovery_fd); mdi->curr_state = read_dev_state(mdi->state_fd); + } } if (a->curr_state <= inactive && From b7528a20cca58c973771d94d7d2b8ef74bcf582d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 21 Dec 2009 10:23:26 -0700 Subject: [PATCH 10/13] Introduce MaxSector Replace occurrences of ~0ULL to make it clear we are talking about maximal resync/recovery position. 
Signed-off-by: Dan Williams --- Create.c | 2 +- mdadm.h | 1 + monitor.c | 2 +- super-ddf.c | 4 ++-- super-intel.c | 8 ++++---- super1.c | 6 +++--- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Create.c b/Create.c index 5b01b63..1ae7f92 100644 --- a/Create.c +++ b/Create.c @@ -527,7 +527,7 @@ int Create(struct supertype *st, char *mddev, assume_clean ) { info.array.state = 1; /* clean, but one+ drive will be missing*/ - info.resync_start = ~0ULL; + info.resync_start = MaxSector; } else { info.array.state = 0; /* not clean, but no errors */ info.resync_start = 0; diff --git a/mdadm.h b/mdadm.h index 7b75540..9cf15c4 100644 --- a/mdadm.h +++ b/mdadm.h @@ -149,6 +149,7 @@ struct mdinfo { union { unsigned long long resync_start; /* per-array resync position */ unsigned long long recovery_start; /* per-device rebuild position */ + #define MaxSector (~0ULL) /* resync/recovery complete position */ }; unsigned long safe_mode_delay; /* ms delay to mark clean */ int new_level, delta_disks, new_layout, new_chunk; diff --git a/monitor.c b/monitor.c index 58752a8..81fef49 100644 --- a/monitor.c +++ b/monitor.c @@ -75,7 +75,7 @@ static unsigned long long read_resync_start(int fd) if (n <= 0) return 0; if (strncmp(buf, "none", 4) == 0) - return ~0ULL; + return MaxSector; else return strtoull(buf, NULL, 10); } diff --git a/super-ddf.c b/super-ddf.c index f5eb816..8c3f4be 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -1433,7 +1433,7 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info) (ddf->virt->entries[info->container_member].init_state & DDF_initstate_mask) == DDF_init_full) - info->resync_start = ~0ULL; + info->resync_start = MaxSector; uuid_from_super_ddf(st, info->uuid); @@ -2921,7 +2921,7 @@ static struct mdinfo *container_content_ddf(struct supertype *st) this->resync_start = 0; } else { this->array.state = 1; - this->resync_start = ~0ULL; + this->resync_start = MaxSector; } memcpy(this->name, ddf->virt->entries[i].name, 16); 
this->name[16]=0; diff --git a/super-intel.c b/super-intel.c index 4072fc8..4bb1990 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1271,7 +1271,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) /* FIXME add curr_migr_unit to resync_start conversion */ info->resync_start = 0; else - info->resync_start = ~0ULL; + info->resync_start = MaxSector; strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); info->name[MAX_RAID_SERIAL_LEN] = 0; @@ -3482,7 +3482,7 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, * offset */ unsigned long long minsize = size; - unsigned long long start_offset = ~0ULL; + unsigned long long start_offset = MaxSector; int dcnt = 0; if (minsize == 0) minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; @@ -3498,7 +3498,7 @@ static int validate_geometry_imsm_volume(struct supertype *st, int level, esize = e[i].start - pos; if (esize >= minsize) found = 1; - if (found && start_offset == ~0ULL) { + if (found && start_offset == MaxSector) { start_offset = pos; break; } else if (found && pos != start_offset) { @@ -3856,7 +3856,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) * FIXME handle dirty degraded */ if (skip && !dev->vol.dirty) - this->resync_start = ~0ULL; + this->resync_start = MaxSector; if (skip) continue; diff --git a/super1.c b/super1.c index 2c992a4..85bb598 100644 --- a/super1.c +++ b/super1.c @@ -659,9 +659,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info, switch(__le32_to_cpu(sb->level)) { case 5: case 4: case 6: /* need to force clean */ - if (sb->resync_offset != ~0ULL) + if (sb->resync_offset != MaxSector) rv = 1; - sb->resync_offset = ~0ULL; + sb->resync_offset = MaxSector; } } if (strcmp(update, "assemble")==0) { @@ -855,7 +855,7 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info, sb->utime = sb->ctime; sb->events = __cpu_to_le64(1); if (info->state & (1<<MD_SB_CLEAN)) - sb->resync_offset = ~0ULL; + 
sb->resync_offset = MaxSector; else sb->resync_offset = 0; sb->max_dev = __cpu_to_le32((1024- sizeof(struct mdp_superblock_1))/ From d23534e4646313a67296b295666d165a87bb2c92 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 21 Dec 2009 11:26:21 -0700 Subject: [PATCH 11/13] Teach sysfs_add_disk() callers to use ->recovery_start versus 'insync' parameter Also fixup 'in_sync' versus 'insync' typo. Signed-off-by: Dan Williams --- Assemble.c | 2 +- Manage.c | 3 ++- managemon.c | 2 +- mdadm.h | 3 +-- super-ddf.c | 2 ++ super-intel.c | 2 ++ sysfs.c | 6 +++--- util.c | 7 +++++-- 8 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Assemble.c b/Assemble.c index 014d644..560e2fe 100644 --- a/Assemble.c +++ b/Assemble.c @@ -1286,7 +1286,7 @@ int assemble_container_content(struct supertype *st, int mdfd, sysfs_free(sra); for (dev = content->devs; dev; dev = dev->next) - if (sysfs_add_disk(content, dev, 1) == 0) + if (sysfs_add_disk(content, dev) == 0) working++; else if (errno == EEXIST) preexist++; diff --git a/Manage.c b/Manage.c index fb9b972..6f0a6a2 100644 --- a/Manage.c +++ b/Manage.c @@ -696,7 +696,8 @@ int Manage_subdevs(char *devname, int fd, tst->ss->getinfo_super(tst, &new_mdi); new_mdi.disk.major = disc.major; new_mdi.disk.minor = disc.minor; - if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + new_mdi.recovery_start = 0; + if (sysfs_add_disk(sra, &new_mdi) != 0) { fprintf(stderr, Name ": add new device to external metadata" " failed for %s\n", dv->devname); close(container_fd); diff --git a/managemon.c b/managemon.c index e335077..3a20e2b 100644 --- a/managemon.c +++ b/managemon.c @@ -410,7 +410,7 @@ static void manage_member(struct mdstat_ent *mdstat, newd = malloc(sizeof(*newd)); if (!newd) continue; - if (sysfs_add_disk(&newa->info, d, 0) < 0) { + if (sysfs_add_disk(&newa->info, d) < 0) { free(newd); continue; } diff --git a/mdadm.h b/mdadm.h index 9cf15c4..af6d91b 100644 --- a/mdadm.h +++ b/mdadm.h @@ -385,8 +385,7 @@ extern int 
sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val, int size); extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); extern int sysfs_set_array(struct mdinfo *info, int vers); -extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, - int in_sync); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd); extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); extern int sysfs_unique_holder(int devnum, long rdev); extern int load_sys(char *path, char *buf); diff --git a/super-ddf.c b/super-ddf.c index 8c3f4be..14f8330 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -2968,6 +2968,7 @@ static struct mdinfo *container_content_ddf(struct supertype *st) dev->disk.minor = d->minor; dev->disk.raid_disk = i; dev->disk.state = (1<<MD_DISK_SYNC); + dev->recovery_start = MaxSector; dev->events = __be32_to_cpu(ddf->primary.seq); dev->data_offset = __be64_to_cpu(vc->lba_offset[i]); @@ -3547,6 +3548,7 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a, di->disk.major = dl->major; di->disk.minor = dl->minor; di->disk.state = 0; + di->recovery_start = 0; di->data_offset = pos; di->component_size = a->info.component_size; di->container_member = dl->pdnum; diff --git a/super-intel.c b/super-intel.c index 4bb1990..9f879c5 100644 --- a/super-intel.c +++ b/super-intel.c @@ -3876,6 +3876,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) info_d->disk.major = d->major; info_d->disk.minor = d->minor; info_d->disk.raid_disk = slot; + info_d->recovery_start = MaxSector; this->array.working_disks++; @@ -4454,6 +4455,7 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a, di->disk.major = dl->major; di->disk.minor = dl->minor; di->disk.state = 0; + di->recovery_start = 0; di->data_offset = __le32_to_cpu(map->pba_of_lba0); di->component_size = a->info.component_size; di->container_member = inst; diff --git a/sysfs.c b/sysfs.c index 35dfbd4..8fdb529 100644 --- a/sysfs.c +++ b/sysfs.c @@ -572,7 +572,7 @@ int 
sysfs_set_array(struct mdinfo *info, int vers) return rv; } -int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int in_sync) +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) { char dv[100]; char nm[100]; @@ -598,11 +598,11 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int in_sync) rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); if (sra->array.level != LEVEL_CONTAINER) { - if (in_sync) + if (sd->recovery_start == MaxSector) /* This can correctly fail if array isn't started, * yet, so just ignore status for now. */ - sysfs_set_str(sra, sd, "state", "in_sync"); + sysfs_set_str(sra, sd, "state", "insync"); rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); } return rv; diff --git a/util.c b/util.c index d49a4ed..05be64c 100644 --- a/util.c +++ b/util.c @@ -1162,8 +1162,11 @@ int add_disk(int mdfd, struct supertype *st, int rv; #ifndef MDASSEMBLE if (st->ss->external) { - rv = sysfs_add_disk(sra, info, - info->disk.state & (1<<MD_DISK_SYNC)); + if (info->disk.state & (1<<MD_DISK_SYNC)) + info->recovery_start = MaxSector; + else + info->recovery_start = 0; + rv = sysfs_add_disk(sra, info); if (! rv) { struct mdinfo *sd2; for (sd2 = sra->devs; sd2; sd2=sd2->next) From 2904b26f059c5d82d9d631c9987e92e3f9af498c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 21 Dec 2009 12:51:57 -0700 Subject: [PATCH 12/13] Support external metadata recovery-resume Minimal changes needed to permit reassembling partially recovered external metadata arrays. The biggest logical change is that ->container_content() can now surface partially rebuilt members rather than omitting them from the disk list. 
Signed-off-by: Dan Williams --- Assemble.c | 2 +- Manage.c | 2 +- managemon.c | 2 +- mdadm.h | 2 +- sysfs.c | 11 ++++++++++- util.c | 2 +- 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Assemble.c b/Assemble.c index 560e2fe..014d644 100644 --- a/Assemble.c +++ b/Assemble.c @@ -1286,7 +1286,7 @@ int assemble_container_content(struct supertype *st, int mdfd, sysfs_free(sra); for (dev = content->devs; dev; dev = dev->next) - if (sysfs_add_disk(content, dev) == 0) + if (sysfs_add_disk(content, dev, 1) == 0) working++; else if (errno == EEXIST) preexist++; diff --git a/Manage.c b/Manage.c index 6f0a6a2..df6079b 100644 --- a/Manage.c +++ b/Manage.c @@ -697,7 +697,7 @@ int Manage_subdevs(char *devname, int fd, new_mdi.disk.major = disc.major; new_mdi.disk.minor = disc.minor; new_mdi.recovery_start = 0; - if (sysfs_add_disk(sra, &new_mdi) != 0) { + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { fprintf(stderr, Name ": add new device to external metadata" " failed for %s\n", dv->devname); close(container_fd); diff --git a/managemon.c b/managemon.c index 3a20e2b..e335077 100644 --- a/managemon.c +++ b/managemon.c @@ -410,7 +410,7 @@ static void manage_member(struct mdstat_ent *mdstat, newd = malloc(sizeof(*newd)); if (!newd) continue; - if (sysfs_add_disk(&newa->info, d) < 0) { + if (sysfs_add_disk(&newa->info, d, 0) < 0) { free(newd); continue; } diff --git a/mdadm.h b/mdadm.h index af6d91b..27ef693 100644 --- a/mdadm.h +++ b/mdadm.h @@ -385,7 +385,7 @@ extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, char *name, char *val, int size); extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); extern int sysfs_set_array(struct mdinfo *info, int vers); -extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); extern int sysfs_unique_holder(int devnum, long rdev); extern int load_sys(char *path, 
char *buf); diff --git a/sysfs.c b/sysfs.c index 8fdb529..c3bbbe3 100644 --- a/sysfs.c +++ b/sysfs.c @@ -572,7 +572,7 @@ int sysfs_set_array(struct mdinfo *info, int vers) return rv; } -int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) { char dv[100]; char nm[100]; @@ -595,6 +595,13 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) strcpy(sd->sys_name, "dev-"); strcpy(sd->sys_name+4, dname); + /* test write to see if 'recovery_start' is available */ + if (resume && sd->recovery_start < MaxSector && + sysfs_set_num(sra, sd, "recovery_start", 0)) { + sysfs_set_str(sra, sd, "state", "remove"); + return -1; + } + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); if (sra->array.level != LEVEL_CONTAINER) { @@ -604,6 +611,8 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) */ sysfs_set_str(sra, sd, "state", "insync"); rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); + if (resume) + sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); } return rv; } diff --git a/util.c b/util.c index 05be64c..927a0ee 100644 --- a/util.c +++ b/util.c @@ -1166,7 +1166,7 @@ int add_disk(int mdfd, struct supertype *st, info->recovery_start = MaxSector; else info->recovery_start = 0; - rv = sysfs_add_disk(sra, info); + rv = sysfs_add_disk(sra, info, 0); if (! rv) { struct mdinfo *sd2; for (sd2 = sra->devs; sd2; sd2=sd2->next) From 1e5c69836d4d0b6dcaef8fc187e6bf2841eb57f6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 21 Dec 2009 17:54:32 -0700 Subject: [PATCH 13/13] imsm: add support for checkpointing via 'curr_migr_unit' Unlike native md checkpointing some data about the geometry and type of the migration process is coded into curr_migr_unit. Provide logic to convert between md/{resync_start|recovery_start} and imsm/curr_migr_unit. 
Signed-off-by: Dan Williams --- bitmap.c | 2 - mdadm.h | 17 +++ super-intel.c | 320 +++++++++++++++++++++++++++++++++++++++++++++----- util.c | 15 +++ 4 files changed, 320 insertions(+), 34 deletions(-) diff --git a/bitmap.c b/bitmap.c index 850b0ce..088e37d 100644 --- a/bitmap.c +++ b/bitmap.c @@ -20,8 +20,6 @@ #include "mdadm.h" -#define min(a,b) (((a) < (b)) ? (a) : (b)) - inline void sb_le_to_cpu(bitmap_super_t *sb) { sb->magic = __le32_to_cpu(sb->magic); diff --git a/mdadm.h b/mdadm.h index 27ef693..c1c36af 100644 --- a/mdadm.h +++ b/mdadm.h @@ -129,6 +129,22 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #endif /* __KLIBC__ */ +/* + * min()/max()/clamp() macros that also do + * strict type-checking.. See the + * "unnecessary" pointer comparison. + */ +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) /* general information that might be extracted from a superblock */ struct mdinfo { @@ -842,6 +858,7 @@ extern int assemble_container_content(struct supertype *st, int mdfd, extern int add_disk(int mdfd, struct supertype *st, struct mdinfo *sra, struct mdinfo *info); extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info); +unsigned long long min_recovery_start(struct mdinfo *array); extern char *human_size(long long bytes); extern char *human_size_brief(long long bytes); diff --git a/super-intel.c b/super-intel.c index 9f879c5..609aaf5 100644 --- a/super-intel.c +++ b/super-intel.c @@ -635,6 +635,8 @@ static int is_failed(struct imsm_disk *disk) } #ifndef MDASSEMBLE +static __u64 blocks_per_migr_unit(struct imsm_dev *dev); + static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { __u64 sz; @@ -690,7 +692,11 @@ static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) printf(" Map State : %s", map_state_str[map->map_state]); if (dev->vol.migr_state) { struct imsm_map *map = get_imsm_map(dev, 1); + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %u (%llu)", + __le32_to_cpu(dev->vol.curr_migr_unit), + blocks_per_migr_unit(dev)); } printf("\n"); printf(" Dirty State : %s\n", dev->vol.dirty ? 
"dirty" : "clean"); @@ -1216,6 +1222,179 @@ get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) } #endif +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: + return 128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + struct imsm_map *hi = get_imsm_map(dev, 1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, 0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, 1); + + return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static __u8 imsm_num_data_members(struct imsm_dev *dev) +{ + /* named 'imsm_' because raid0, raid1 and raid10 + * counter-intuitively have the same number of data disks + */ + struct imsm_map *map = get_imsm_map(dev, 0); + + switch (get_imsm_raid_level(map)) { + case 0: + case 1: + case 10: + return 
map->num_members; + case 5: + return map->num_members - 1; + default: + dprintf("%s: unsupported raid level\n", __func__); + return 0; + } +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, 1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + } +} + +static __u64 blocks_per_migr_unit(struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, 0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 segment; + __u32 stripe; + __u8 disks; + + /* yes, this is really the translation of migr_units to + * per-member blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(dev); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = 
__le32_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + static int imsm_level_to_layout(int level) { switch (level) { @@ -1265,12 +1444,33 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info) info->component_size = __le32_to_cpu(map->blocks_per_member); memset(info->uuid, 0, sizeof(info->uuid)); - if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) { info->resync_start = 0; - else if (dev->vol.migr_state) - /* FIXME add curr_migr_unit to resync_start conversion */ - info->resync_start = 0; - else + } else if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(dev); + __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_GEN_MIGR: + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... 
*/ + info->resync_start = MaxSector; + } + } else info->resync_start = MaxSector; strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); @@ -3782,6 +3982,46 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, } #endif /* MDASSEMBLE */ +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, 1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +static void update_recovery_start(struct imsm_dev *dev, struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + units = __le32_to_cpu(dev->vol.curr_migr_unit); + rebuild->recovery_start = units * blocks_per_migr_unit(dev); +} + + static struct mdinfo *container_content_imsm(struct supertype *st) { /* Given a container loaded by load_super_imsm_all, @@ -3829,6 +4069,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) super->current_vol = i; getinfo_super_imsm_volume(st, this); for (slot = 0 ; slot < map->num_members; slot++) { + unsigned long long recovery_start; struct mdinfo *info_d; struct dl *d; int idx; @@ -3842,33 +4083,41 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d->index == idx) break; + recovery_start = MaxSector; if (d == NULL) skip = 1; if (d && is_failed(&d->disk)) skip = 1; if (ord & IMSM_ORD_REBUILD) - skip = 1; + recovery_start = 0; /* * if we skip some disks the array will be assmebled degraded; - * reset resync start to avoid a dirty-degraded situation + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync * * FIXME handle dirty 
degraded */ - if (skip && !dev->vol.dirty) + if ((skip || recovery_start == 0) && !dev->vol.dirty) this->resync_start = MaxSector; if (skip) continue; - info_d = malloc(sizeof(*info_d)); + info_d = calloc(1, sizeof(*info_d)); if (!info_d) { fprintf(stderr, Name ": failed to allocate disk" " for volume %.16s\n", dev->volume); + info_d = this->devs; + while (info_d) { + struct mdinfo *d = info_d->next; + + free(info_d); + info_d = d; + } free(this); this = rest; break; } - memset(info_d, 0, sizeof(*info_d)); info_d->next = this->devs; this->devs = info_d; @@ -3876,9 +4125,10 @@ static struct mdinfo *container_content_imsm(struct supertype *st) info_d->disk.major = d->major; info_d->disk.minor = d->minor; info_d->disk.raid_disk = slot; - info_d->recovery_start = MaxSector; + info_d->recovery_start = recovery_start; - this->array.working_disks++; + if (info_d->recovery_start == MaxSector) + this->array.working_disks++; info_d->events = __le32_to_cpu(mpb->generation_num); info_d->data_offset = __le32_to_cpu(map->pba_of_lba0); @@ -3886,6 +4136,8 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d->devname) strcpy(info_d->name, d->devname); } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(dev, this); rest = this; } @@ -4028,24 +4280,6 @@ static int is_resyncing(struct imsm_dev *dev) return 0; } -static int is_rebuilding(struct imsm_dev *dev) -{ - struct imsm_map *migr_map; - - if (!dev->vol.migr_state) - return 0; - - if (migr_type(dev) != MIGR_REBUILD) - return 0; - - migr_map = get_imsm_map(dev, 1); - - if (migr_map->map_state == IMSM_T_STATE_DEGRADED) - return 1; - else - return 0; -} - /* return true if we recorded new information */ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) { @@ -4096,6 +4330,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent) struct imsm_map *map = get_imsm_map(dev, 0); int failed = imsm_count_failed(super, dev); __u8 
map_state = imsm_check_degraded(super, dev, failed); + __u32 blocks_per_unit; /* before we activate this array handle any missing disks */ if (consistent == 2 && super->missing) { @@ -4107,7 +4342,7 @@ static int imsm_set_array_state(struct active_array *a, int consistent) mark_missing(dev, &dl->disk, dl->index); super->updates_pending++; } - + if (consistent == 2 && (!is_resync_complete(&a->info) || map_state != IMSM_T_STATE_NORMAL || @@ -4134,7 +4369,28 @@ static int imsm_set_array_state(struct active_array *a, int consistent) super->updates_pending++; } - /* FIXME check if we can update curr_migr_unit from resync_start */ + /* check if we can update curr_migr_unit from resync_start, recovery_start */ + blocks_per_unit = blocks_per_migr_unit(dev); + if (blocks_per_unit && failed <= 1) { + __u32 units32; + __u64 units; + + if (migr_type(dev) == MIGR_REBUILD) + units = min_recovery_start(&a->info) / blocks_per_unit; + else + units = a->info.resync_start / blocks_per_unit; + units32 = units; + + /* check that we did not overflow 32-bits, and that + * curr_migr_unit needs updating + */ + if (units32 == units && + __le32_to_cpu(dev->vol.curr_migr_unit) != units32) { + dprintf("imsm: mark checkpoint (%u)\n", units32); + dev->vol.curr_migr_unit = __cpu_to_le32(units32); + super->updates_pending++; + } + } /* mark dirty / clean */ if (dev->vol.dirty != !consistent) { diff --git a/util.c b/util.c index 927a0ee..53c21e3 100644 --- a/util.c +++ b/util.c @@ -1210,6 +1210,21 @@ int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) return rv; } +unsigned long long min_recovery_start(struct mdinfo *array) +{ + /* find the minimum recovery_start in an array for metadata + * formats that only record per-array recovery progress instead + * of per-device + */ + unsigned long long recovery_start = MaxSector; + struct mdinfo *d; + + for (d = array->devs; d; d = d->next) + recovery_start = min(recovery_start, d->recovery_start); + + return recovery_start; +} + char 
*devnum2devname(int num) { char name[100];