From 25ed7e5924fd6e97b17831d2b42ecb3975cd71f2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 28 Sep 2009 14:40:59 -0700 Subject: [PATCH 01/26] imsm: cleanup disk status tests Add is_failed(), is_configured(), and is_spare() helpers to clean up disk status flag testing. Signed-off-by: Dan Williams --- super-intel.c | 53 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/super-intel.c b/super-intel.c index 07b0b90..9f57a68 100644 --- a/super-intel.c +++ b/super-intel.c @@ -612,6 +612,21 @@ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) } #ifndef MDASSEMBLE +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { __u64 sz; @@ -676,7 +691,6 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) { struct imsm_disk *disk = __get_imsm_disk(mpb, index); char str[MAX_RAID_SERIAL_LEN + 1]; - __u32 s; __u64 sz; if (index < 0) @@ -685,10 +699,9 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) printf("\n"); snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); printf(" Disk%02d Serial : %s\n", index, str); - s = disk->status; - printf(" State :%s%s%s\n", s&SPARE_DISK ? " spare" : "", - s&CONFIGURED_DISK ? " active" : "", - s&FAILED_DISK ? " failed" : ""); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? 
" failed" : ""); printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); sz = __le32_to_cpu(disk->total_blocks) - reserved; printf(" Usable Size : %llu%s\n", (unsigned long long)sz, @@ -1298,7 +1311,6 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; struct imsm_disk *disk; - __u32 s; if (super->current_vol >= 0) { getinfo_super_imsm_volume(st, info); @@ -1334,14 +1346,13 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) disk = &super->disks->disk; info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved; info->component_size = reserved; - s = disk->status; - info->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; /* we don't change info->disk.raid_disk here because * this state will be finalized in mdmon after we have * found the 'most fresh' version of the metadata */ - info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; - info->disk.state |= s & SPARE_DISK ? 0 : (1 << MD_DISK_SYNC); + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); } /* only call uuid_from_super_imsm when this disk is part of a populated container, @@ -3444,7 +3455,6 @@ static struct mdinfo *container_content_imsm(struct supertype *st) struct dl *d; int idx; int skip; - __u32 s; __u32 ord; skip = 0; @@ -3456,9 +3466,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d == NULL) skip = 1; - - s = d ? 
d->disk.status : 0; - if (s & FAILED_DISK) + if (d && is_failed(&d->disk)) skip = 1; if (ord & IMSM_ORD_REBUILD) skip = 1; @@ -3565,8 +3573,7 @@ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, insync = 2; disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) insync--; /* no in-sync disks left in this mirror the @@ -3616,8 +3623,7 @@ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev) idx = ord_to_idx(ord); disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) failed++; } @@ -3676,7 +3682,7 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) return 0; ord = __le32_to_cpu(map->disk_ord_tbl[slot]); - if ((disk->status & FAILED_DISK) && (ord & IMSM_ORD_REBUILD)) + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) return 0; disk->status |= FAILED_DISK; @@ -3877,7 +3883,7 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a if (dl->index == i) break; - if (dl && dl->disk.status & FAILED_DISK) + if (dl && is_failed(&dl->disk)) dl = NULL; if (dl) @@ -3915,11 +3921,10 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot, continue; /* skip in use or failed drives */ - if (dl->disk.status & FAILED_DISK || idx == dl->index || + if (is_failed(&dl->disk) || idx == dl->index || dl->index == -2) { dprintf("%x:%x status (failed: %d index: %d)\n", - dl->major, dl->minor, - (dl->disk.status & FAILED_DISK) == FAILED_DISK, idx); + dl->major, dl->minor, is_failed(&dl->disk), idx); continue; } @@ -4221,7 +4226,7 @@ static void imsm_process_update(struct supertype *st, if (i == u->slot) continue; disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); - if (!disk || disk->status & FAILED_DISK) + if (!disk || is_failed(disk)) failed++; } 
From 51725a7c2569b764f59f009bc0ef42901a1ec915 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Sep 2009 11:44:38 -0700 Subject: [PATCH 02/26] imsm: kill close() of component device None of the other formats close the passed in fd at load, and this becomes a problem when trying to support --update where we need O_EXCL protection across the entire operation. Signed-off-by: Dan Williams --- super-intel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/super-intel.c b/super-intel.c index 9f57a68..80cd6c5 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1685,10 +1685,8 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) serialcpy(dl->serial, serial); dl->index = -2; dl->e = NULL; - } else if (keep_fd) { - close(dl->fd); + } else if (keep_fd) dl->fd = fd; - } /* look up this disk's index in the current anchor */ for (i = 0; i < super->anchor->num_disks; i++) { From a2b9798159755b6f5e867fae0dd3e25af59fc85e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Sep 2009 11:45:41 -0700 Subject: [PATCH 03/26] imsm: disambiguate family_num This is a result of trawling through the Windows implementation to learn the mechanism of how it disambiguates family_num. It is a continuation of commit 148acb7b "imsm: fix family number handling" which introduced a regression when reassembling a container with stale disks and rebuilt members. When rebuilding, a new family number is assigned to protect against the "prodigal array member" problem. It prevents a former family member from returning to the system and causing a rebuild to go the wrong direction. However, this invalidates looking at the generation number to determine the most up-to-date disk when comparing across family numbers. Instead the assembly logic looks for agreement between a disk's local family membership compared against a global list of all families in the system. 
Whenever a disk's local metadata does not match a family number on the global list that family number is marked offline. It is possible that this logic results in multiple incompatible but valid family numbers existing in a container. In this case mdadm.conf cannot be consulted because it only records the uuid which is generated from static fields in the metadata. The metadata lacks the data needed to disambiguate "local" versus "foreign". The "foreign" array in this case requires updating to change its container-id information (orig_family_num), and possibly the member array names. Signed-off-by: Dan Williams --- super-intel.c | 580 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 448 insertions(+), 132 deletions(-) diff --git a/super-intel.c b/super-intel.c index 80cd6c5..e53afbb 100644 --- a/super-intel.c +++ b/super-intel.c @@ -265,6 +265,14 @@ struct intel_super { struct bbm_log *bbm_log; const char *hba; /* device path of the raid controller for this metadata */ const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; }; struct extent { @@ -1477,8 +1485,19 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) */ if (first->anchor->num_raid_devs > 0 && sec->anchor->num_raid_devs > 0) { - if (first->anchor->orig_family_num != sec->anchor->orig_family_num || - first->anchor->family_num != sec->anchor->family_num) + /* Determine if these disks might ever have been + * related. 
Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) return 3; } @@ -1548,7 +1567,6 @@ static void fd2devname(int fd, char *name) snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); } - extern int scsi_get_serial(int fd, void *buf, size_t buf_len); static int imsm_read_serial(int fd, char *devname, @@ -1642,14 +1660,32 @@ static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) return dl; } +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + static int load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) { + struct imsm_disk *disk; struct dl *dl; struct stat stb; int rv; - int i; - int alloc = 1; + char name[40]; __u8 serial[MAX_RAID_SERIAL_LEN]; rv = imsm_read_serial(fd, devname, serial); @@ -1657,16 +1693,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) if (rv != 0) return 2; - /* check if this is a disk we have seen before. 
it may be a spare in - * super->disks while the current anchor believes it is a raid member, - * check if we need to update dl->index - */ - dl = serial_to_dl(serial, super); - if (!dl) - dl = malloc(sizeof(*dl)); - else - alloc = 0; - + dl = calloc(1, sizeof(*dl)); if (!dl) { if (devname) fprintf(stderr, @@ -1675,51 +1702,35 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) return 2; } - if (alloc) { - fstat(fd, &stb); - dl->major = major(stb.st_rdev); - dl->minor = minor(stb.st_rdev); - dl->next = super->disks; - dl->fd = keep_fd ? fd : -1; - dl->devname = devname ? strdup(devname) : NULL; - serialcpy(dl->serial, serial); - dl->index = -2; - dl->e = NULL; - } else if (keep_fd) - dl->fd = fd; + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = strdup(devname); + else + dl->devname = strdup(name); /* look up this disk's index in the current anchor */ - for (i = 0; i < super->anchor->num_disks; i++) { - struct imsm_disk *disk_iter; - - disk_iter = __get_imsm_disk(super->anchor, i); - - if (serialcmp(disk_iter->serial, dl->serial) == 0) { - dl->disk = *disk_iter; - /* only set index on disks that are a member of a - * populated contianer, i.e. 
one with raid_devs - */ - if (dl->disk.status & FAILED_DISK) - dl->index = -2; - else if (dl->disk.status & SPARE_DISK) - dl->index = -1; - else - dl->index = i; - - break; - } - } - - /* no match, maybe a stale failed drive */ - if (i == super->anchor->num_disks && dl->index >= 0) { - dl->disk = *__get_imsm_disk(super->anchor, dl->index); - if (dl->disk.status & FAILED_DISK) + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; } - if (alloc) - super->disks = dl; - return 0; } @@ -1861,7 +1872,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) struct stat; struct imsm_super *anchor; __u32 check_sum; - int rc; get_dev_size(fd, NULL, &dsize); @@ -1923,10 +1933,7 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 2; } - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); - return rc; + return 0; } /* read the extended mpb */ @@ -1962,11 +1969,23 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) */ super->bbm_log = __get_imsm_bbm_log(super->anchor); - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); + return 0; +} - return rc; +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + + return err; } static void __free_imsm_disk(struct dl *d) @@ -2096,19 +2115,333 @@ static int find_missing(struct intel_super *super) return 0; } +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct 
intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("%s: mpb from %d:%d matches %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("%s: mpb from %d:%d replaces %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("%s: mpb from %d:%d prefer %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + 
table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = calloc(1, sizeof(*idisk)); + if (!idisk) + return -1; + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("%s: '%.16s' owner %d != %d\n", + __func__, disk->serial, idisk->owner, + owner); + } else { + dprintf("%s: unknown disk %x [%d]: %.16s\n", + __func__, __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super 
**super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("%s: marking family: %#x from %d:%d offline\n", + __func__, mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. 
+ */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + fprintf(stderr, "Chose family %#x on '%s', " + "assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. 
one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, char *devname, int keep_fd) { struct mdinfo *sra; - struct intel_super *super; - struct mdinfo *sd, *best = NULL; - __u32 bestgen = 0; - __u32 gen; - char nm[20]; - int dfd; - int rv; + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; int devnum = fd2devnum(fd); + struct mdinfo *sd; int retry; + int err = 0; + int i; enum sysfs_read_flags flags; flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE; @@ -2125,81 +2458,51 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, strcmp(sra->text_version, "imsm") != 0) return 1; - super = alloc_super(0); - if (!super) - return 1; + /* load all mpbs */ + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + struct intel_super *s = alloc_super(0); + char nm[20]; + int dfd; - /* find the most up to date disk in this array, skipping spares */ - for (sd = sra->devs; sd; sd = sd->next) { + err = 1; + if (!s) + goto error; + s->next = super_list; + super_list = s; + + err = 2; sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd ? 
O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - rv = load_imsm_mpb(dfd, super, NULL); + if (dfd < 0) + goto error; + + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); /* retry the load if we might have raced against mdmon */ - if (rv == 3 && mdmon_running(devnum)) + if (err == 3 && mdmon_running(devnum)) for (retry = 0; retry < 3; retry++) { usleep(3000); - rv = load_imsm_mpb(dfd, super, NULL); - if (rv != 3) + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) break; } if (!keep_fd) close(dfd); - if (rv == 0) { - if (super->anchor->num_raid_devs == 0) - gen = 0; - else - gen = __le32_to_cpu(super->anchor->generation_num); - if (!best || gen > bestgen) { - bestgen = gen; - best = sd; - } - } else { - free_imsm(super); - return rv; - } + if (err) + goto error; } - if (!best) { - free_imsm(super); - return 1; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; } - /* load the most up to date anchor */ - sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); - dfd = dev_open(nm, O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 1; - } - rv = load_imsm_mpb(dfd, super, NULL); - close(dfd); - if (rv != 0) { - free_imsm(super); - return 2; - } - - /* re-parse the disk list with the current anchor */ - for (sd = sra->devs ; sd ; sd = sd->next) { - sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(nm, keep_fd? 
O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - load_imsm_disk(dfd, super, NULL, keep_fd); - if (!keep_fd) - close(dfd); - } - - if (find_missing(super) != 0) { free_imsm(super); - return 2; + err = 2; + goto error; } if (st->subarray[0]) { @@ -2207,13 +2510,26 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, super->current_vol = atoi(st->subarray); else { free_imsm(super); - return 1; + err = 1; + goto error; } } + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; *sbp = super; st->container_dev = devnum; - if (st->ss == NULL) { + if (err == 0 && st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; st->max_devs = IMSM_MAX_DEVICES; @@ -2244,7 +2560,7 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) return 1; } - rv = load_imsm_mpb(fd, super, devname); + rv = load_and_parse_mpb(fd, super, devname, 0); if (rv) { if (devname) From f796af5d5ea603085ce6bcf3c171b89a1f84f37a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: [PATCH 04/26] imsm: fix spare record writeout race imsm_activate_spare() in the manager thread may race against write_super_imsm_spares() in the monitor thread. Give write_super_imsm_spares() its own private mpb buffer to prevent confusing the manager. This change uncovered cases where spares were not being assembled due to a failed metadata version number check. Spares can freely associate across metadata version number, so reduce the scope of the version check in the spare assembly case. 
Signed-off-by: Dan Williams --- super-intel.c | 59 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/super-intel.c b/super-intel.c index e53afbb..0e3ed89 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1477,9 +1477,6 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) return 0; } - if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0) - return 3; - /* if an anchor does not have num_raid_devs set then it is a free * floating spare */ @@ -1492,6 +1489,10 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) __u32 first_family = first->anchor->orig_family_num; __u32 sec_family = sec->anchor->orig_family_num; + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + if (first_family == 0) first_family = first->anchor->family_num; if (sec_family == 0) @@ -1499,8 +1500,10 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) if (first_family != sec_family) return 3; + } + /* if 'first' is a spare promote it to a populated mpb with sec's * family number */ @@ -2976,39 +2979,48 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, return 0; } -static int store_imsm_mpb(int fd, struct intel_super *super); +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); /* spare records have their own family number and do not have any defined raid * devices */ static int write_super_imsm_spares(struct intel_super *super, int doclose) { - struct imsm_super mpb_save; struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; __u32 sum; struct dl *d; - mpb_save = *mpb; - mpb->num_raid_devs = 0; - mpb->num_disks = 1; - mpb->mpb_size = sizeof(struct imsm_super); - mpb->generation_num = __cpu_to_le32(1UL); + spare->mpb_size = 
__cpu_to_le32(sizeof(struct imsm_super)), + spare->generation_num = __cpu_to_le32(1UL), + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1, + spare->num_raid_devs = 0, + spare->cache_size = mpb->cache_size, + spare->pwr_cycle_count = __cpu_to_le32(1), + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); for (d = super->disks; d; d = d->next) { if (d->index != -1) continue; - mpb->disk[0] = d->disk; - sum = __gen_imsm_checksum(mpb); - mpb->family_num = __cpu_to_le32(sum); - mpb->orig_family_num = 0; - sum = __gen_imsm_checksum(mpb); - mpb->check_sum = __cpu_to_le32(sum); + spare->disk[0] = d->disk; + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); - if (store_imsm_mpb(d->fd, super)) { + if (store_imsm_mpb(d->fd, spare)) { fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); - *mpb = mpb_save; return 1; } if (doclose) { @@ -3017,7 +3029,6 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose) } } - *mpb = mpb_save; return 0; } @@ -3069,7 +3080,7 @@ static int write_super_imsm(struct intel_super *super, int doclose) for (d = super->disks; d ; d = d->next) { if (d->index < 0) continue; - if (store_imsm_mpb(d->fd, super)) + if (store_imsm_mpb(d->fd, mpb)) fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); if (doclose) { @@ -4144,9 +4155,9 @@ static void imsm_set_disk(struct active_array *a, int n, int state) } } -static int store_imsm_mpb(int fd, struct intel_super *super) +static int store_imsm_mpb(int fd, struct imsm_super *mpb) { - struct imsm_super *mpb = super->anchor; + void *buf = mpb; __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); unsigned long long dsize; unsigned long long sectors; @@ -4161,7 +4172,7 @@ static int store_imsm_mpb(int fd, struct 
intel_super *super) if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) return 1; - if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors) + if (write(fd, buf + 512, 512 * sectors) != 512 * sectors) return 1; } @@ -4169,7 +4180,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) return 1; - if (write(fd, super->buf, 512) != 512) + if (write(fd, buf, 512) != 512) return 1; return 0; From e683ca88ac4c2f55059e8c82aff7a487a0884ef7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: [PATCH 05/26] imsm: fix/support --update Fix init_super_imsm() to return an empty mpb when info == NULL, and teach store_super_imsm() to simply write out the passed in mpb. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=523320 Reported-by: Hans de Goede Signed-off-by: Dan Williams --- super-intel.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/super-intel.c b/super-intel.c index 0e3ed89..eaf5b0b 100644 --- a/super-intel.c +++ b/super-intel.c @@ -2819,24 +2819,33 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, size_t mpb_size; char *version; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, - uuid); + return init_super_imsm_volume(st, info, size, name, homehost, uuid); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; super = alloc_super(1); - if (!super) - return 0; - mpb_size = disks_to_mpb_size(info->nr_disks); - if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { free(super); + super = NULL; + } + if (!super) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); return 0; } + memset(super->buf, 0, mpb_size); mpb = super->buf; - memset(mpb, 0, mpb_size); + 
mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; @@ -2844,9 +2853,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, strcpy(version, MPB_SIGNATURE); version += strlen(MPB_SIGNATURE); strcpy(version, MPB_VERSION_RAID0); - mpb->mpb_size = mpb_size; - st->sb = super; return 1; } @@ -3188,24 +3195,15 @@ static int write_init_super_imsm(struct supertype *st) } #endif -static int store_zero_imsm(struct supertype *st, int fd) +static int store_super_imsm(struct supertype *st, int fd) { - unsigned long long dsize; - void *buf; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; - get_dev_size(fd, NULL, &dsize); - - /* first block is stored on second to last sector of the disk */ - if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) + if (!mpb) return 1; - if (posix_memalign(&buf, 512, 512) != 0) - return 1; - - memset(buf, 0, 512); - if (write(fd, buf, 512) != 512) - return 1; - return 0; + return store_imsm_mpb(fd, mpb); } static int imsm_bbm_log_size(struct imsm_super *mpb) @@ -4914,7 +4912,7 @@ struct superswitch super_imsm = { .load_super = load_super_imsm, .init_super = init_super_imsm, - .store_super = store_zero_imsm, + .store_super = store_super_imsm, .free_super = free_super_imsm, .match_metadata_desc = match_metadata_desc_imsm, .container_content = container_content_imsm, From 955e9ea1394662f097a88bb3d62c56ab50448597 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: [PATCH 06/26] ddf: prevent superblock being zeroed on --update The full fix would be to support updating ddf metadata, but this minimal fix just prevents the superblock from being zeroed when someone inadvertently passes an unsupported --update option during assembly. 
Reported-by: Hans de Goede Signed-off-by: Dan Williams --- super-ddf.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/super-ddf.c b/super-ddf.c index 9bf08c2..06858e2 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -1589,13 +1589,8 @@ static int init_super_ddf(struct supertype *st, struct phys_disk *pd; struct virtual_disk *vd; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, - uuid); + return init_super_ddf_bvd(st, info, size, name, homehost, uuid); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); @@ -1604,6 +1599,12 @@ static int init_super_ddf(struct supertype *st, memset(ddf, 0, sizeof(*ddf)); ddf->dlist = NULL; /* no physical disks yet */ ddf->conflist = NULL; /* No virtual disks yet */ + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } /* At least 32MB *must* be reserved for the ddf. So let's just * start 32MB from the end, and put the primary header there. 
@@ -2971,12 +2972,22 @@ static struct mdinfo *container_content_ddf(struct supertype *st) return rest; } -static int store_zero_ddf(struct supertype *st, int fd) +static int store_super_ddf(struct supertype *st, int fd) { + struct ddf_super *ddf = st->sb; unsigned long long dsize; void *buf; int rc; + if (!ddf) + return 1; + + /* ->dlist and ->conflist will be set for updates, currently not + * supported + */ + if (ddf->dlist || ddf->conflist) + return 1; + if (!get_dev_size(fd, NULL, &dsize)) return 1; @@ -3627,7 +3638,7 @@ struct superswitch super_ddf = { .load_super = load_super_ddf, .init_super = init_super_ddf, - .store_super = store_zero_ddf, + .store_super = store_super_ddf, .free_super = free_super_ddf, .match_metadata_desc = match_metadata_desc_ddf, .container_content = container_content_ddf, From 6e46bf344bf34a688696e240596f8259e328eea9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: [PATCH 07/26] imsm: add --update=uuid support When disks have conflicting container memberships (same container ids but incompatible member arrays) --update=uuid can be used to move offenders to a new container id by changing 'orig_family_num'. Note that this only supports random updates of the uuid as the actual uuid is synthesized. We also need to communicate the new 'orig_family_num' value to all disks involved in the update. A new field 'update_private' is added to struct mdinfo to allow this information to be transmitted. 
Signed-off-by: Dan Williams --- Assemble.c | 3 +++ mdadm.h | 5 +++++ super-intel.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/Assemble.c b/Assemble.c index 4578906..7da0905 100644 --- a/Assemble.c +++ b/Assemble.c @@ -565,6 +565,7 @@ int Assemble(struct supertype *st, char *mddev, #endif /* Ok, no bad inconsistancy, we can try updating etc */ bitmap_done = 0; + content->update_private = NULL; for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) { char *devname = tmpdev->devname; struct stat stb; @@ -717,6 +718,8 @@ int Assemble(struct supertype *st, char *mddev, } devcnt++; } + free(content->update_private); + content->update_private = NULL; if (devcnt == 0) { fprintf(stderr, Name ": no devices found for %s\n", diff --git a/mdadm.h b/mdadm.h index 91ba624..04b87b8 100644 --- a/mdadm.h +++ b/mdadm.h @@ -153,6 +153,11 @@ struct mdinfo { int cache_size; /* size of raid456 stripe cache*/ int mismatch_cnt; char text_version[50]; + void *update_private; /* for passing metadata-format + * specific update data + * between successive calls to + * update_super() + */ int container_member; /* for assembling external-metatdata arrays * This is to be used internally by metadata diff --git a/super-intel.c b/super-intel.c index eaf5b0b..110c4a8 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1378,8 +1378,6 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost) { - /* FIXME */ - /* For 'assemble' and 'force' we need to return non-zero if any * change was made. For others, the return value is ignored. * Update options are: @@ -1395,26 +1393,55 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, * linear only * resync: mark as dirty so a resync will happen. 
* name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given * * Following are not relevant for this imsm: * sparc2.2 : update from old dodgey metadata * super-minor: change the preferred_minor number * summaries: update redundant counters. - * uuid: Change the uuid of the array to match watch is given * homehost: update the recorded homehost * _reshape_progress: record new reshape_progress position. */ - int rv = 0; - //struct intel_super *super = st->sb; - //struct imsm_super *mpb = super->mpb; + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; - if (strcmp(update, "grow") == 0) { - } - if (strcmp(update, "resync") == 0) { - /* dev->vol.dirty = 1; */ - } + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; - /* IMSM has no concept of UUID or homehost */ + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private) + fprintf(stderr, + Name ": '--uuid' not supported for imsm metadata\n"); + else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) { + mpb->orig_family_num = *((__u32 *) info->update_private); + rv = 0; + } else if (strcmp(update, "uuid") == 0) { + __u32 *new_family = malloc(sizeof(*new_family)); + + /* update orig_family_number with the incoming random + * data, report the new effective uuid, and store the + * new orig_family_num for future updates. + */ + if (new_family) { + memcpy(&mpb->orig_family_num, info->uuid, sizeof(__u32)); + uuid_from_super_imsm(st, info->uuid); + *new_family = mpb->orig_family_num; + info->update_private = new_family; + rv = 0; + } + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + fprintf(stderr, + Name ": '--update=%s' not supported for imsm metadata\n", + update); + + /* successful update? 
recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); return rv; } From d2b9eb5993b6c36bf1d66980811bda1b6eefb19f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: [PATCH 08/26] imsm: regression test for prodigal array member scenario Provide a test to sanity check assembly and reassembly in the presence of conflicting family number information. Signed-off-by: Dan Williams --- tests/09imsm-assemble | 46 +++++++++++++++++++++++++++++++++++++++ tests/env-09imsm-assemble | 32 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/09imsm-assemble create mode 100644 tests/env-09imsm-assemble diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 0000000..7389b0e --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,46 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -I $container +testdev ${member}_0 1 $size 1 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the 
dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -I ${container}2 + +testdev ${member}_0 1 $size 1 +testdev ${member}_1 1 $size 1 diff --git a/tests/env-09imsm-assemble b/tests/env-09imsm-assemble new file mode 100644 index 0000000..b12954b --- /dev/null +++ b/tests/env-09imsm-assemble @@ -0,0 +1,32 @@ +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_TEST_OROM=1 + container=/dev/md/container + member=/dev/md/vol0 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_TEST_OROM + unset imsm_check + unset container + unset member +} From aae5a11207cf6da1682e6a76e116a19e21473f03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:57 -0700 Subject: [PATCH 09/26] Detail: export MD_UUID from mapfile The load_super() from an mdadm --detail call may race against an mdmon update. When this happens the load_super sees an inconsistent metadata block and returns an error. The fallback path to use the map file contents lacks uuid reporting, so provide __fname_from_uuid for generically printing a uuid. 
Reported-by: Hans de Goede Signed-off-by: Dan Williams --- Detail.c | 5 +++++ mdadm.h | 1 + util.c | 12 ++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Detail.c b/Detail.c index 001012a..1598a42 100644 --- a/Detail.c +++ b/Detail.c @@ -194,7 +194,12 @@ int Detail(char *dev, int brief, int export, int test, char *homehost) st->ss->export_detail_super(st); } else { struct map_ent *mp, *map = NULL; + char nbuf[64]; mp = map_by_devnum(&map, fd2devnum(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } if (mp && mp->path && strncmp(mp->path, "/dev/md/", 8) == 0) printf("MD_DEVNAME=%s\n", mp->path+8); diff --git a/mdadm.h b/mdadm.h index 04b87b8..8212a2c 100644 --- a/mdadm.h +++ b/mdadm.h @@ -810,6 +810,7 @@ extern void uuid_from_super(int uuid[4], mdp_super_t *super); extern const int uuid_match_any[4]; extern int same_uuid(int a[4], int b[4], int swapuuid); extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); extern char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep); extern unsigned long calc_csum(void *super, int bytes); diff --git a/util.c b/util.c index 4ccb1bb..98aedd0 100644 --- a/util.c +++ b/util.c @@ -269,17 +269,15 @@ void copy_uuid(void *a, int b[4], int swapuuid) memcpy(a, b, 16); } -char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) { int i, j; - int id; char uuid[16]; char *c = buf; strcpy(c, "UUID-"); c += strlen(c); - copy_uuid(uuid, info->uuid, st->ss->swapuuid); + copy_uuid(uuid, id, swap); for (i = 0; i < 4; i++) { - id = uuid[i]; if (i) *c++ = sep; for (j = 3; j >= 0; j--) { @@ -288,6 +286,12 @@ char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char } } return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, 
char *buf, char sep) +{ + return __fname_from_uuid(info->uuid, st->ss->swapuuid, buf, sep); } #ifndef MDASSEMBLE From 96a8270d46faab599b41f1cf78b4331b44c5a6be Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:57 -0700 Subject: [PATCH 10/26] mdmon: avoid writes in the startup path for mdmon on root arrays When killing a previous monitor be careful not to cause writes to the filesystem until the reads necessary to get the monitor operational have completed. The code is already prepared for errors creating the pid and socket files, so simply defer creation of these files until after the first call to manage(). Cc: Hans de Goede Signed-off-by: Dan Williams --- managemon.c | 6 ++++ mdmon.c | 86 ++++++++++++++++++++++------------------------------- 2 files changed, 42 insertions(+), 50 deletions(-) diff --git a/managemon.c b/managemon.c index f9d545d..5958e18 100644 --- a/managemon.c +++ b/managemon.c @@ -680,6 +680,12 @@ void do_manager(struct supertype *container) read_sock(container); if (container->sock < 0 || socket_hup_requested) { + /* If this fails, we hope it already exists + * pid file lives in /var/run/mdadm/mdXX.pid + */ + mkdir("/var", 0600); + mkdir("/var/run", 0600); + mkdir("/var/run/mdadm", 0600); close(container->sock); container->sock = make_control_sock(container->devname); make_pidfile(container->devname, 0); diff --git a/mdmon.c b/mdmon.c index 31994d8..5f87e78 100644 --- a/mdmon.c +++ b/mdmon.c @@ -113,6 +113,14 @@ static struct superswitch *find_metadata_methods(char *vers) return NULL; } +static int test_pidfile(char *devname) +{ + char path[100]; + struct stat st; + + sprintf(path, "/var/run/mdadm/%s.pid", devname); + return stat(path, &st); +} int make_pidfile(char *devname, int o_excl) { @@ -149,27 +157,30 @@ int is_container_member(struct mdstat_ent *mdstat, char *container) return 1; } -void remove_pidfile(char *devname); -static void try_kill_monitor(char *devname) +pid_t devname2mdmon(char *devname) +{ + char 
buf[100]; + pid_t pid = -1; + int fd; + + sprintf(buf, "/var/run/mdadm/%s.pid", devname); + fd = open(buf, O_RDONLY|O_NOATIME); + if (fd < 0) + return -1; + + if (read(fd, buf, sizeof(buf)) > 0) + sscanf(buf, "%d\n", &pid); + close(fd); + + return pid; +} + +static void try_kill_monitor(pid_t pid, char *devname) { char buf[100]; int fd; - pid_t pid; struct mdstat_ent *mdstat; - sprintf(buf, "/var/run/mdadm/%s.pid", devname); - fd = open(buf, O_RDONLY); - if (fd < 0) - return; - - if (read(fd, buf, sizeof(buf)) < 0) { - close(fd); - return; - } - - close(fd); - pid = strtoul(buf, NULL, 10); - /* first rule of survival... don't off yourself */ if (pid == getpid()) return; @@ -197,7 +208,6 @@ static void try_kill_monitor(char *devname) WaitClean(buf, 0); } free_mdstat(mdstat); - remove_pidfile(devname); } void remove_pidfile(char *devname) @@ -355,6 +365,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) int pfd[2]; int status; int ignore; + pid_t victim = -1; dprintf("starting mdmon for %s in %s\n", devname, switchroot ? 
: "/"); @@ -400,6 +411,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) container->devname = devname; container->arrays = NULL; container->subarray[0] = 0; + container->sock = -1; if (!container->devname) { fprintf(stderr, "mdmon: failed to allocate container name string\n"); @@ -464,12 +476,9 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) if (switchroot) { /* we assume we assume that /sys /proc /dev are available in - * the new root (see nash:setuproot) - * - * kill any monitors in the current namespace and change - * to the new one + * the new root */ - try_kill_monitor(container->devname); + victim = devname2mdmon(container->devname); if (chroot(switchroot) != 0) { fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n", switchroot, strerror(errno)); @@ -477,40 +486,15 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) } } - /* If this fails, we hope it already exists - * pid file lives in /var/run/mdadm/mdXX.pid - */ - mkdir("/var", 0600); - mkdir("/var/run", 0600); - mkdir("/var/run/mdadm", 0600); ignore = chdir("/"); - if (make_pidfile(container->devname, O_EXCL) < 0) { + if (victim < 0 && test_pidfile(container->devname) == 0) { if (ping_monitor(container->devname) == 0) { fprintf(stderr, "mdmon: %s already managed\n", container->devname); exit(3); - } else { - int err; - - /* cleanup the old monitor, this one is taking over */ - try_kill_monitor(container->devname); - err = make_pidfile(container->devname, 0); - if (err < 0) { - fprintf(stderr, "mdmon: %s Cannot create pidfile\n", - container->devname); - if (err == -EROFS) { - /* FIXME implement a mechanism to - * prevent duplicate monitor instances - */ - fprintf(stderr, - "mdmon: continuing on read-only file system\n"); - } else - exit(3); - } - } + } else if (victim < 0) + victim = devname2mdmon(container->devname); } - container->sock = make_control_sock(container->devname); - if (container->ss->load_super(container, mdfd, devname)) { 
fprintf(stderr, "mdmon: Cannot load metadata for %s\n", devname); @@ -544,6 +528,8 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) exit(2); } + if (victim > -1) + try_kill_monitor(victim, container->devname); do_manager(container); exit(0); From b928b5a0384e7181425a282a0586cbbb3c85fbc3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:08:33 -0700 Subject: [PATCH 11/26] mdmon: exec(2) when the switchroot argument is not "/" Try to execute mdmon from the target namespace. When used for initramfs handovers we need to drop all references to the initramfs filesystem for that memory to be freed. Cc: Hans de Goede Signed-off-by: Dan Williams --- mdmon.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mdmon.c b/mdmon.c index 5f87e78..d3e8be5 100644 --- a/mdmon.c +++ b/mdmon.c @@ -369,6 +369,29 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) dprintf("starting mdmon for %s in %s\n", devname, switchroot ? : "/"); + + /* try to spawn mdmon instances from the target file system */ + if (switchroot && strcmp(switchroot, "/") != 0) { + char path[1024]; + pid_t pid; + + sprintf(path, "%s/sbin/mdmon", switchroot); + switch (fork()) { + case 0: + execl(path, "mdmon", devname, NULL); + exit(1); + case -1: + return 1; + default: + pid = wait(&status); + if (pid > -1 && WIFEXITED(status) && + WEXITSTATUS(status) == 0) + return 0; + else + return 1; + } + } + mdfd = open_dev(devnum); if (mdfd < 0) { fprintf(stderr, "mdmon: %s: %s\n", devname, From 9f1da8242161ba684f2867f211eb7e9d4baa84bb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:37:02 -0700 Subject: [PATCH 12/26] mdmon: preserve socket over chroot Connect to the monitor in the old namespace and use that connection for WaitClean requests when stopping the victim mdmon instance. This allows ping_monitor() to work post chroot(). 
Cc: Hans de Goede Signed-off-by: Dan Williams --- mdadm.c | 4 ++-- mdadm.h | 2 +- mdmon.c | 12 ++++++++---- msg.c | 14 +++++++++++--- msg.h | 1 + sysfs.c | 5 +++-- 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/mdadm.c b/mdadm.c index bb3e5bb..6f43dc3 100644 --- a/mdadm.c +++ b/mdadm.c @@ -1276,7 +1276,7 @@ int main(int argc, char *argv[]) export, test, homehost); else - rv |= WaitClean(name, v); + rv |= WaitClean(name, -1, v); put_md_name(name); } free_mdstat(ms); @@ -1337,7 +1337,7 @@ int main(int argc, char *argv[]) case 'W': rv |= Wait(dv->devname); continue; case Waitclean: - rv |= WaitClean(dv->devname, verbose-quiet); continue; + rv |= WaitClean(dv->devname, -1, verbose-quiet); continue; } mdfd = open_mddev(dv->devname, 1); if (mdfd>=0) { diff --git a/mdadm.h b/mdadm.h index 8212a2c..ffa5f53 100644 --- a/mdadm.h +++ b/mdadm.h @@ -753,7 +753,7 @@ extern int Monitor(mddev_dev_t devlist, extern int Kill(char *dev, int force, int quiet, int noexcl); extern int Wait(char *dev); -extern int WaitClean(char *dev, int verbose); +extern int WaitClean(char *dev, int sock, int verbose); extern int Incremental(char *devname, int verbose, int runstop, struct supertype *st, char *homehost, int require_homehost, diff --git a/mdmon.c b/mdmon.c index d3e8be5..50c7be6 100644 --- a/mdmon.c +++ b/mdmon.c @@ -175,7 +175,7 @@ pid_t devname2mdmon(char *devname) return pid; } -static void try_kill_monitor(pid_t pid, char *devname) +static void try_kill_monitor(pid_t pid, char *devname, int sock) { char buf[100]; int fd; @@ -205,7 +205,7 @@ static void try_kill_monitor(pid_t pid, char *devname) for ( ; mdstat; mdstat = mdstat->next) if (is_container_member(mdstat, devname)) { sprintf(buf, "/dev/%s", mdstat->dev); - WaitClean(buf, 0); + WaitClean(buf, sock, 0); } free_mdstat(mdstat); } @@ -366,6 +366,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) int status; int ignore; pid_t victim = -1; + int victim_sock = -1; dprintf("starting mdmon for %s in 
%s\n", devname, switchroot ? : "/"); @@ -502,6 +503,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) * the new root */ victim = devname2mdmon(container->devname); + victim_sock = connect_monitor(container->devname); if (chroot(switchroot) != 0) { fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n", switchroot, strerror(errno)); @@ -551,8 +553,10 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) exit(2); } - if (victim > -1) - try_kill_monitor(victim, container->devname); + if (victim > -1) { + try_kill_monitor(victim, container->devname, victim_sock); + close(victim_sock); + } do_manager(container); exit(0); diff --git a/msg.c b/msg.c index 5a4839f..8d52b94 100644 --- a/msg.c +++ b/msg.c @@ -177,10 +177,8 @@ int connect_monitor(char *devname) return sfd; } -/* give the monitor a chance to update the metadata */ -int ping_monitor(char *devname) +int fping_monitor(int sfd) { - int sfd = connect_monitor(devname); int err = 0; if (sfd < 0) @@ -194,6 +192,16 @@ int ping_monitor(char *devname) if (!err && wait_reply(sfd, 20) != 0) err = -1; + return err; +} + + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err = fping_monitor(sfd); + close(sfd); return err; } diff --git a/msg.h b/msg.h index b9bd205..f8e89fd 100644 --- a/msg.h +++ b/msg.h @@ -27,6 +27,7 @@ extern int ack(int fd, int tmo); extern int wait_reply(int fd, int tmo); extern int connect_monitor(char *devname); extern int ping_monitor(char *devname); +extern int fping_monitor(int sock); extern int ping_manager(char *devname); #define MSG_MAX_LEN (4*1024*1024) diff --git a/sysfs.c b/sysfs.c index 81ccb53..d327e3d 100644 --- a/sysfs.c +++ b/sysfs.c @@ -764,7 +764,7 @@ int sysfs_unique_holder(int devnum, long rdev) static char *clean_states[] = { "clear", "inactive", "readonly", "read-auto", "clean", NULL }; -int WaitClean(char *dev, int verbose) +int WaitClean(char *dev, int sock, 
int verbose) { int fd; struct mdinfo *mdi; @@ -840,7 +840,8 @@ int WaitClean(char *dev, int verbose) } if (rv < 0) rv = 1; - else if (ping_monitor(mdi->text_version) == 0) { + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { /* we need to ping to close the window between array * state transitioning to clean and the metadata being * marked clean From 1373b07d758213b643f72a09384b840e4f08057a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 13:04:16 +1100 Subject: [PATCH 13/26] mdmon: lock current memory as well as future memory. mlockall(MCL_FUTURE) only locks mappings that have not yet been created. To lock all memory used by the process, we need MCL_CURRENT | MCL_FUTURE Signed-off-by: NeilBrown --- mdmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdmon.c b/mdmon.c index 50c7be6..0ec4259 100644 --- a/mdmon.c +++ b/mdmon.c @@ -545,7 +545,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) ignore = dup(0); #endif - mlockall(MCL_FUTURE); + mlockall(MCL_CURRENT | MCL_FUTURE); if (clone_monitor(container) < 0) { fprintf(stderr, "mdmon: failed to start monitor process: %s\n", From 9a36a9b713a6c789f268251a81de67bb8fd9c7f8 Mon Sep 17 00:00:00 2001 From: Zdenek Behan Date: Mon, 19 Oct 2009 13:13:58 +1100 Subject: [PATCH 14/26] Monitor: add option to specify rebuild increments i.e. the percent increments after which RebuildNN event is generated This is particularly useful when using --program option, rather than (only) syslog for alerts.
Signed-off-by: Zdenek Behan Signed-off-by: NeilBrown --- Monitor.c | 26 +++++++++++++------------- ReadMe.c | 2 ++ mdadm.8 | 12 ++++++++++-- mdadm.c | 11 ++++++++++- mdadm.h | 2 +- 5 files changed, 36 insertions(+), 17 deletions(-) diff --git a/Monitor.c b/Monitor.c index af486d7..b0802f8 100644 --- a/Monitor.c +++ b/Monitor.c @@ -33,14 +33,6 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd, int dosyslog); -static char *percentalerts[] = { - "RebuildStarted", - "Rebuild20", - "Rebuild40", - "Rebuild60", - "Rebuild80", -}; - /* The largest number of disks current arrays can manage is 384 * This really should be dynamically, but that will have to wait * At least it isn't MD_SB_DISKS. @@ -49,7 +41,7 @@ static char *percentalerts[] = { int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char* pidfile) + int dosyslog, int test, char* pidfile, int increments) { /* * Every few seconds, scan every md device looking for changes @@ -77,8 +69,8 @@ int Monitor(mddev_dev_t devlist, * An active device had a reverse transition * RebuildStarted * percent went from -1 to +ve - * Rebuild20 Rebuild40 Rebuild60 Rebuild80 - * percent went from below to not-below that number + * RebuildNN + * percent went from below to not-below NN% * DeviceDisappeared * Couldn't access a device which was previously visible * @@ -311,9 +303,17 @@ int Monitor(mddev_dev_t devlist, if (mse && st->percent >= 0 && mse->percent >= 0 && - (mse->percent / 20) > (st->percent / 20)) - alert(percentalerts[mse->percent/20], + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent); + + alert(percentalert, 
dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); + } if (mse && mse->percent == -1 && diff --git a/ReadMe.c b/ReadMe.c index 90b4daf..3e53e57 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -176,6 +176,7 @@ struct option long_options[] = { {"mail", 1, 0, 'm'}, {"program", 1, 0, 'p'}, {"alert", 1, 0, 'p'}, + {"increment", 1, 0, 'r'}, {"delay", 1, 0, 'd'}, {"daemonise", 0, 0, 'f'}, {"daemonize", 0, 0, 'f'}, @@ -495,6 +496,7 @@ char Help_monitor[] = " --mail= -m : Address to mail alerts of failure to\n" " --program= -p : Program to run when an event is detected\n" " --alert= : same as --program\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" " --delay= -d : seconds of delay between polling state. default=60\n" " --config= -c : specify a different config file\n" " --scan -s : find mail-address/program in config file\n" diff --git a/mdadm.8 b/mdadm.8 index 7f19918..36dbf90 100644 --- a/mdadm.8 +++ b/mdadm.8 @@ -1220,6 +1220,12 @@ reduce this as the kernel alerts .I mdadm immediately when there is any change. +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. + .TP .BR \-f ", " \-\-daemonise Tell @@ -1818,8 +1824,10 @@ An md array started reconstruction. (syslog priority: Warning) .BI Rebuild NN Where .I NN -is 20, 40, 60, or 80, this indicates that rebuild has passed that many -percentage of the total. (syslog priority: Warning) +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). 
(syslog priority: Warning) .TP .B RebuildFinished diff --git a/mdadm.c b/mdadm.c index 6f43dc3..df48117 100644 --- a/mdadm.c +++ b/mdadm.c @@ -89,6 +89,7 @@ int main(int argc, char *argv[]) int require_homehost = 1; char *mailaddr = NULL; char *program = NULL; + int increments = 20; int delay = 0; int daemonise = 0; char *pidfile = NULL; @@ -698,6 +699,14 @@ int main(int argc, char *argv[]) program = optarg; continue; + case O(MONITOR,'r'): /* rebuild increments */ + increments = atoi(optarg); + if (increments>99 || increments<1) { + fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + case O(MONITOR,'d'): /* delay in seconds */ case O(GROW, 'd'): case O(BUILD,'d'): /* delay for bitmap updates */ @@ -1377,7 +1386,7 @@ int main(int argc, char *argv[]) } rv= Monitor(devlist, mailaddr, program, delay?delay:60, daemonise, scan, oneshot, - dosyslog, test, pidfile); + dosyslog, test, pidfile, increments); break; case GROW: diff --git a/mdadm.h b/mdadm.h index ffa5f53..2e2275c 100644 --- a/mdadm.h +++ b/mdadm.h @@ -749,7 +749,7 @@ extern int Examine(mddev_dev_t devlist, int brief, int export, int scan, extern int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char *pidfile); + int dosyslog, int test, char *pidfile, int increments); extern int Kill(char *dev, int force, int quiet, int noexcl); extern int Wait(char *dev); From d16c7af6d8fc271c7713cb9817ef88c09d541f61 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 3 Oct 2009 20:34:55 -0400 Subject: [PATCH 15/26] mdadm(8): fix spurious space after -e header Signed-off-by: Mike Frysinger Signed-off-by: NeilBrown --- mdadm.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdadm.8 b/mdadm.8 index 36dbf90..b15bfb0 100644 --- a/mdadm.8 +++ b/mdadm.8 @@ -307,7 +307,7 @@ says to get a list of array devices from .BR /proc/mdstat . 
.TP -.B \-e ", " \-\-metadata= +.BR \-e ", " \-\-metadata= Declare the style of RAID metadata (superblock) to be used. The default is 0.90 for .BR \-\-create , From 2e48e349452cccefe4286449483abd8a17f2cb15 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 16:56:13 +1100 Subject: [PATCH 16/26] test: udev-settle before testing device. I think we sometimes get way ahead of udev and devices disappear and appear almost at random. So add some settling. Signed-off-by: NeilBrown --- test | 1 + 1 file changed, 1 insertion(+) diff --git a/test b/test index 133f8ff..2ceea46 100644 --- a/test +++ b/test @@ -161,6 +161,7 @@ no_errors() { # basic device test testdev() { + udevadm settle dev=$1 cnt=$2 dvsize=$3 From 453e3b41d0bd823bfc3137ce5c2738d02fc2ffe2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 16:57:16 +1100 Subject: [PATCH 17/26] test/raid6integ: correct typo ddf-zero-restart was misspelled. Signed-off-by: NeilBrown --- tests/01raid6integ | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/01raid6integ b/tests/01raid6integ index ed7cec5..245b0da 100644 --- a/tests/01raid6integ +++ b/tests/01raid6integ @@ -7,10 +7,10 @@ layouts='ls rs la ra' lv=`uname -r` if expr $lv '>=' 2.6.30 > /dev/null then - layouts="$layouts parity-first dd-zero-restart ddf-N-restart ddf-N-continue \ + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" fi -echo $layouts + for layout in $layouts do mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4 From 00eb571675b431b3845c67abba891a198cef06f3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 16:58:38 +1100 Subject: [PATCH 18/26] test/ddf: don't insist that mdadm.conf is always in the same order. When created by different processes, the order could reasonably be different.
So sort before compare Signed-off-by: NeilBrown --- tests/10ddf-create | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/10ddf-create b/tests/10ddf-create index db22b64..a32dc0f 100644 --- a/tests/10ddf-create +++ b/tests/10ddf-create @@ -55,8 +55,8 @@ mdadm -Ss mdadm -Asc /var/tmp/mdadm.conf check nosync # This failed once. The raid5 was resyncing. -mdadm -Dbs > /tmp/mdadm.conf -diff /tmp/mdadm.conf /var/tmp/mdadm.conf +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - mdadm -Ss # and now assemble fully incrementally. @@ -70,7 +70,7 @@ do done check nosync -mdadm -Dbs > /tmp/mdadm.conf -diff /tmp/mdadm.conf /var/tmp/mdadm.conf +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - mdadm -Ss rm /tmp/mdadm.conf /var/tmp/mdadm.conf From 8f1b2bbbb9cab959ecab0c474dc81afdc7b6dffd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 17:00:52 +1100 Subject: [PATCH 19/26] Detail: list containers before members. To allow "--assemble --scan" to have a chance, list containers before members in --detail --scan output. 
Signed-off-by: NeilBrown --- mdadm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mdadm.c b/mdadm.c index df48117..4651e73 100644 --- a/mdadm.c +++ b/mdadm.c @@ -1263,11 +1263,18 @@ int main(int argc, char *argv[]) struct mdstat_ent *ms = mdstat_read(0, 1); struct mdstat_ent *e; struct map_ent *map = NULL; + int members; int v = verbose>1?0:verbose+1; + for (members = 0; members <= 1; members++) { for (e=ms ; e ; e=e->next) { char *name; struct map_ent *me; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; me = map_by_devnum(&map, e->devnum); if (me && me->path && strcmp(me->path, "/unknown") != 0) @@ -1288,6 +1295,7 @@ int main(int argc, char *argv[]) rv |= WaitClean(name, -1, v); put_md_name(name); } + } free_mdstat(ms); } else if (devmode == 'S' && scan) { /* apply --stop to all devices in /proc/mdstat */ From 7636b5a8bb4f5bb3451d355ed0bb726e0615d631 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 17:04:12 +1100 Subject: [PATCH 20/26] Assemble: print verbose messages when finding members in containers .. so that "-Av" gives more hints at what is going on. Signed-off-by: NeilBrown --- Assemble.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Assemble.c b/Assemble.c index 7da0905..54e725a 100644 --- a/Assemble.c +++ b/Assemble.c @@ -315,6 +315,9 @@ int Assemble(struct supertype *st, char *mddev, } /* It is worth looking inside this container. 
*/ + if (verbose > 0) + fprintf(stderr, Name ": looking in container %s\n", + devname); next_member: if (tmpdev->content) content = tmpdev->content; @@ -420,6 +423,9 @@ int Assemble(struct supertype *st, char *mddev, st->ss->free_super(st); return 1; } + if (verbose > 0) + fprintf(stderr, Name ": found match on member %s in %s\n", + content->text_version, devname); break; } if (st == NULL) From 8a0a0ded4af12be6db97abba7551923da3bcf8c9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 17:08:04 +1100 Subject: [PATCH 21/26] Assemble: handle container members better When looking for a specific member, don't accept a different member, but step on to the next one. Signed-off-by: NeilBrown --- Assemble.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Assemble.c b/Assemble.c index 54e725a..311666c 100644 --- a/Assemble.c +++ b/Assemble.c @@ -408,6 +408,9 @@ int Assemble(struct supertype *st, char *mddev, fprintf(stderr, Name ": member %s in %s is already assembled\n", content->text_version, devname); + skip: + if (tmpdev->content) + goto next_member; tst->ss->free_super(tst); tst = NULL; content = NULL; @@ -415,6 +418,21 @@ int Assemble(struct supertype *st, char *mddev, goto loop; return 1; } + if (ident->member && ident->member[0]) { + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + fprintf(stderr, Name ": badly formatted version: %s\n", + content->text_version); + goto skip; + } + if (strcmp(ident->member, s+1) != 0) { + if (report_missmatch) + fprintf(stderr, + Name ": skipping wrong member %s\n", + content->text_version); + goto skip; + } + } st = tst; tst = NULL; if (!auto_assem && tmpdev->next != NULL) { fprintf(stderr, Name ": %s is a container, but is not " From 5ac6db12f9f970f26b7017517789bac200631b11 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Oct 2009 17:11:15 +1100 Subject: [PATCH 22/26] mdopen: only use 'dev' as chosen name if it is a full path. 
Otherwise using names like "r0" causes problems. They are handled sufficiently by other paths in the code. Signed-off-by: NeilBrown --- mdopen.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdopen.c b/mdopen.c index d322cf4..ed53d6f 100644 --- a/mdopen.c +++ b/mdopen.c @@ -156,7 +156,6 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, if (dev) { - if (strncmp(dev, "/dev/md/", 8) == 0) { strcpy(cname, dev+8); } else if (strncmp(dev, "/dev/", 5) == 0) { @@ -307,7 +306,7 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, } } - if (dev) + if (dev && dev[0] == '/') strcpy(chosen, dev); else if (cname[0] == 0) strcpy(chosen, devname); From 151ea1a33d324e27a1c7ca3cb9c95e80b968353f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 16 Oct 2009 17:57:28 +1100 Subject: [PATCH 23/26] tests/imsm: allow for rounding of array size. IMSM rounds array size to a multiple of 1024K, so our tests must assume this. Signed-off-by: NeilBrown --- test | 5 +++++ tests/09imsm-create-fail-rebuild | 3 +++ 2 files changed, 8 insertions(+) diff --git a/test b/test index 2ceea46..3acb6c1 100644 --- a/test +++ b/test @@ -170,6 +170,11 @@ testdev() { dsize=$[dvsize/chunk] dsize=$[dsize*chunk] rasize=$[dsize*2*cnt] + # rasize is in sectors + if [ -n "$DEV_ROUND_K" ]; then + rasize=$[rasize/DEV_ROUND_K/2] + rasize=$[rasize*DEV_ROUND_K*2] + fi if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi if [ $rasize -ne `/sbin/blockdev --getsize $dev` ] then diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild index 8069576..68fdd09 100644 --- a/tests/09imsm-create-fail-rebuild +++ b/tests/09imsm-create-fail-rebuild @@ -1,5 +1,8 @@ # sanity check array creation +# IMSM rounds to multiples of one mebibyte - 1024K +DEV_ROUND_K=1024 + num_disks=2 mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 imsm_check container $num_disks From 1dfcc211b1c6c319c329e41720d61a2582319d21 Mon Sep 17 00:00:00 2001 From:
NeilBrown Date: Tue, 20 Oct 2009 08:02:53 +1100 Subject: [PATCH 24/26] testreshape5 fixes. We seem to need a 'udevadm settle', and possibly the 'sync'.. Signed-off-by: NeilBrown --- tests/07testreshape5 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/07testreshape5 b/tests/07testreshape5 index 0360988..4a9c0df 100644 --- a/tests/07testreshape5 +++ b/tests/07testreshape5 @@ -27,11 +27,12 @@ do # test save dd if=/dev/urandom of=$md0 bs=1024 count=$size - blockdev --flushbufs $md0 $devs + blockdev --flushbufs $md0 $devs; sync > /tmp/NewRand $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] 5 $nlayout 0 $[size*1024] $devs cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; } mdadm -S $md0 + udevadm settle done done done From 1799c9e8f8465fdbd583dfe6381400e1d01d4954 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 20 Oct 2009 13:50:23 +1100 Subject: [PATCH 25/26] super-intel: Fix compilation of mdassemble. Signed-off-by: NeilBrown --- super-intel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/super-intel.c b/super-intel.c index 110c4a8..9a99d60 100644 --- a/super-intel.c +++ b/super-intel.c @@ -619,7 +619,6 @@ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) return rv; } -#ifndef MDASSEMBLE static int is_spare(struct imsm_disk *disk) { return (disk->status & SPARE_DISK) == SPARE_DISK; @@ -635,6 +634,7 @@ static int is_failed(struct imsm_disk *disk) return (disk->status & FAILED_DISK) == FAILED_DISK; } +#ifndef MDASSEMBLE static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { __u64 sz; @@ -1679,6 +1679,7 @@ static void serialcpy(__u8 *dest, __u8 *src) strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); } +#ifndef MDASSEMBLE static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) { struct dl *dl; @@ -1689,6 +1690,7 @@ static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) return dl; } +#endif static 
struct imsm_disk * __serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) @@ -3230,7 +3232,11 @@ static int store_super_imsm(struct supertype *st, int fd) if (!mpb) return 1; +#ifndef MDASSEMBLE return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif } static int imsm_bbm_log_size(struct imsm_super *mpb) From 0eb26465c0a14d707ca00f4f7bcdb67bde36f706 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Oct 2009 11:00:56 +1100 Subject: [PATCH 26/26] Free some malloced memory that wasn't being freed. As mdadm is normally a short-lived program it isn't always necessary to free memory that was allocated, as the 'exit()' call will automatically free everything. But it is more obviously correct if the 'free' is there. So this patch adds a few calls to 'free' Signed-off-by: NeilBrown --- Detail.c | 1 + Manage.c | 4 +++- mdopen.c | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Detail.c b/Detail.c index e41ad1b..544cfdb 100644 --- a/Detail.c +++ b/Detail.c @@ -541,6 +541,7 @@ This is pretty boring 1, avail, avail_disks)) rv = 2; + free(disks); out: close(fd); return rv; diff --git a/Manage.c b/Manage.c index 3aa09bc..84eb3ab 100644 --- a/Manage.c +++ b/Manage.c @@ -140,7 +140,7 @@ static void remove_devices(int devnum, char *path) strcpy(path2, path); pe = path2 + strlen(path2); } else - path = NULL; + path2 = path = NULL; for (part = 0; part < 16; part++) { if (part) { @@ -161,6 +161,7 @@ static void remove_devices(int devnum, char *path) unlink(path2); } } + free(path2); } @@ -667,6 +668,7 @@ int Manage_subdevs(char *devname, int fd, disc.state |= (1<<MD_DISK_SYNC); break; } + free(used); } if (dv->writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); diff --git a/mdopen.c b/mdopen.c index ed53d6f..21baf5d 100644 --- a/mdopen.c +++ b/mdopen.c @@ -43,7 +43,7 @@ void make_parts(char *dev, int cnt) int odig = odig; /* quiet gcc -Os unitialized warning */ int i; int nlen = strlen(dev) + 20; - char *name = malloc(nlen); + char *name; int dig = isdigit(dev[strlen(dev)-1]); char
orig[1024]; char sym[1024]; @@ -52,6 +52,7 @@ void make_parts(char *dev, int cnt) if (cnt==0) cnt=4; if (lstat(dev, &stb)!= 0) return; + if (S_ISLNK(stb.st_mode)) { int len = readlink(dev, orig, sizeof(orig)); if (len < 0 || len > 1000) @@ -63,6 +64,7 @@ void make_parts(char *dev, int cnt) minor_num = minor(stb.st_rdev); } else return; + name = malloc(nlen); for (i=1; i <= cnt ; i++) { struct stat stb2; snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); @@ -92,6 +94,7 @@ void make_parts(char *dev, int cnt) if (err == 0 && stat(name, &stb2) == 0) add_dev(name, &stb2, 0, NULL); } + free(name); }