diff --git a/Assemble.c b/Assemble.c index 28de83e..a52dc50 100644 --- a/Assemble.c +++ b/Assemble.c @@ -315,6 +315,9 @@ int Assemble(struct supertype *st, char *mddev, } /* It is worth looking inside this container. */ + if (verbose > 0) + fprintf(stderr, Name ": looking in container %s\n", + devname); next_member: if (tmpdev->content) content = tmpdev->content; @@ -405,6 +408,9 @@ int Assemble(struct supertype *st, char *mddev, fprintf(stderr, Name ": member %s in %s is already assembled\n", content->text_version, devname); + skip: + if (tmpdev->content) + goto next_member; tst->ss->free_super(tst); tst = NULL; content = NULL; @@ -412,6 +418,21 @@ int Assemble(struct supertype *st, char *mddev, goto loop; return 1; } + if (ident->member && ident->member[0]) { + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + fprintf(stderr, Name ": badly formatted version: %s\n", + content->text_version); + goto skip; + } + if (strcmp(ident->member, s+1) != 0) { + if (report_missmatch) + fprintf(stderr, + Name ": skipping wrong member %s\n", + content->text_version); + goto skip; + } + } st = tst; tst = NULL; if (!auto_assem && tmpdev->next != NULL) { fprintf(stderr, Name ": %s is a container, but is not " @@ -420,6 +441,9 @@ int Assemble(struct supertype *st, char *mddev, st->ss->free_super(st); return 1; } + if (verbose > 0) + fprintf(stderr, Name ": found match on member %s in %s\n", + content->text_version, devname); break; } if (st == NULL) @@ -565,6 +589,7 @@ int Assemble(struct supertype *st, char *mddev, #endif /* Ok, no bad inconsistancy, we can try updating etc */ bitmap_done = 0; + content->update_private = NULL; for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) { char *devname = tmpdev->devname; struct stat stb; @@ -717,6 +742,8 @@ int Assemble(struct supertype *st, char *mddev, } devcnt++; } + free(content->update_private); + content->update_private = NULL; if (devcnt == 0) { fprintf(stderr, Name ": no devices found for %s\n", diff --git a/Detail.c b/Detail.c index 55d5481..e2cf028 100644 --- a/Detail.c +++ b/Detail.c @@ -194,7 +194,12 @@ int Detail(char *dev, int brief, int export, int test, char *homehost) st->ss->export_detail_super(st); } else { struct map_ent *mp, *map = NULL; + char nbuf[64]; mp = map_by_devnum(&map, fd2devnum(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } if (mp && mp->path && strncmp(mp->path, "/dev/md/", 8) == 0) printf("MD_DEVNAME=%s\n", mp->path+8); @@ -540,6 +545,7 @@ This is pretty boring 1, avail, avail_disks)) rv = 2; + free(disks); out: close(fd); return rv; diff --git a/Manage.c b/Manage.c index 9217139..6b8cff4 100644 --- a/Manage.c +++ b/Manage.c @@ -140,7 +140,7 @@ static void remove_devices(int devnum, char *path) strcpy(path2, path); pe = path2 + strlen(path2); } else - path = NULL; + path2 = path = NULL; for (part = 0; part < 16; part++) { if (part) { @@ -161,6 +161,7 @@ static void remove_devices(int devnum, char *path) unlink(path2); } } + free(path2); } @@ -649,6 +650,7 @@ int Manage_subdevs(char *devname, int fd, disc.state |= (1<writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); diff --git a/Monitor.c b/Monitor.c index af486d7..b0802f8 100644 --- a/Monitor.c +++ b/Monitor.c @@ -33,14 +33,6 @@ static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd, int dosyslog); -static char *percentalerts[] = { - "RebuildStarted", - "Rebuild20", - "Rebuild40", - "Rebuild60", - "Rebuild80", -}; - /* The largest number of disks current arrays can manage is 384 * This really should be dynamically, but that will have to wait * At least it isn't MD_SB_DISKS. @@ -49,7 +41,7 @@ static char *percentalerts[] = { int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char* pidfile) + int dosyslog, int test, char* pidfile, int increments) { /* * Every few seconds, scan every md device looking for changes @@ -77,8 +69,8 @@ int Monitor(mddev_dev_t devlist, * An active device had a reverse transition * RebuildStarted * percent went from -1 to +ve - * Rebuild20 Rebuild40 Rebuild60 Rebuild80 - * percent went from below to not-below that number + * RebuildNN + * percent went from below to not-below NN% * DeviceDisappeared * Couldn't access a device which was previously visible * @@ -311,9 +303,17 @@ int Monitor(mddev_dev_t devlist, if (mse && st->percent >= 0 && mse->percent >= 0 && - (mse->percent / 20) > (st->percent / 20)) - alert(percentalerts[mse->percent/20], + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, mailaddr, mailfrom, alert_cmd, dosyslog); + } if (mse && mse->percent == -1 && diff --git a/ReadMe.c b/ReadMe.c index 0a50acb..f5f15c7 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -176,6 +176,7 @@ struct option long_options[] = { {"mail", 1, 0, 'm'}, {"program", 1, 0, 'p'}, {"alert", 1, 0, 'p'}, + {"increment", 1, 0, 'r'}, {"delay", 1, 0, 'd'}, {"daemonise", 0, 0, 'f'}, {"daemonize", 0, 0, 'f'}, @@ -496,6 +497,7 @@ char Help_monitor[] = " --mail= -m : Address to mail alerts of failure to\n" " --program= -p : Program to run when an event is detected\n" " --alert= : same as --program\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" " --delay= -d : seconds of delay between polling state. default=60\n" " --config= -c : specify a different config file\n" " --scan -s : find mail-address/program in config file\n" diff --git a/managemon.c b/managemon.c index f9d545d..5958e18 100644 --- a/managemon.c +++ b/managemon.c @@ -680,6 +680,12 @@ void do_manager(struct supertype *container) read_sock(container); if (container->sock < 0 || socket_hup_requested) { + /* If this fails, we hope it already exists + * pid file lives in /var/run/mdadm/mdXX.pid + */ + mkdir("/var", 0600); + mkdir("/var/run", 0600); + mkdir("/var/run/mdadm", 0600); close(container->sock); container->sock = make_control_sock(container->devname); make_pidfile(container->devname, 0); diff --git a/mdadm.8 b/mdadm.8 index 8022014..ab02558 100644 --- a/mdadm.8 +++ b/mdadm.8 @@ -309,7 +309,7 @@ says to get a list of array devices from .BR /proc/mdstat . .TP -.B \-e ", " \-\-metadata= +.BR \-e ", " \-\-metadata= Declare the style of RAID metadata (superblock) to be used. The default is 0.90 for .BR \-\-create , @@ -1261,6 +1261,12 @@ reduce this as the kernel alerts .I mdadm immediately when there is any change. +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. + .TP .BR \-f ", " \-\-daemonise Tell @@ -1859,8 +1865,10 @@ An md array started reconstruction. (syslog priority: Warning) .BI Rebuild NN Where .I NN -is 20, 40, 60, or 80, this indicates that rebuild has passed that many -percentage of the total. (syslog priority: Warning) +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). (syslog priority: Warning) .TP .B RebuildFinished diff --git a/mdadm.c b/mdadm.c index a4f2d90..0c51d63 100644 --- a/mdadm.c +++ b/mdadm.c @@ -91,6 +91,7 @@ int main(int argc, char *argv[]) int require_homehost = 1; char *mailaddr = NULL; char *program = NULL; + int increments = 20; int delay = 0; int daemonise = 0; char *pidfile = NULL; @@ -714,6 +715,14 @@ int main(int argc, char *argv[]) program = optarg; continue; + case O(MONITOR,'r'): /* rebuild increments */ + increments = atoi(optarg); + if (increments>99 || increments<1) { + fprintf(stderr, Name ": please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + case O(MONITOR,'d'): /* delay in seconds */ case O(GROW, 'd'): case O(BUILD,'d'): /* delay for bitmap updates */ @@ -1270,11 +1279,18 @@ int main(int argc, char *argv[]) struct mdstat_ent *ms = mdstat_read(0, 1); struct mdstat_ent *e; struct map_ent *map = NULL; + int members; int v = verbose>1?0:verbose+1; + for (members = 0; members <= 1; members++) { for (e=ms ; e ; e=e->next) { char *name; struct map_ent *me; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; me = map_by_devnum(&map, e->devnum); if (me && me->path && strcmp(me->path, "/unknown") != 0) @@ -1292,9 +1308,10 @@ int main(int argc, char *argv[]) export, test, homehost); else - rv |= WaitClean(name, v); + rv |= WaitClean(name, -1, v); put_md_name(name); } + } free_mdstat(ms); } else if (devmode == 'S' && scan) { /* apply --stop to all devices in /proc/mdstat */ @@ -1353,7 +1370,7 @@ int main(int argc, char *argv[]) case 'W': rv |= Wait(dv->devname); continue; case Waitclean: - rv |= WaitClean(dv->devname, verbose-quiet); continue; + rv |= WaitClean(dv->devname, -1, verbose-quiet); continue; } mdfd = open_mddev(dv->devname, 1); if (mdfd>=0) { @@ -1393,7 +1410,7 @@ int main(int argc, char *argv[]) } rv= Monitor(devlist, mailaddr, program, delay?delay:60, daemonise, scan, oneshot, - dosyslog, test, pidfile); + dosyslog, test, pidfile, increments); break; case GROW: diff --git a/mdadm.h b/mdadm.h index 2b8370c..261cdb7 100644 --- a/mdadm.h +++ b/mdadm.h @@ -153,6 +153,11 @@ struct mdinfo { int cache_size; /* size of raid456 stripe cache*/ int mismatch_cnt; char text_version[50]; + void *update_private; /* for passing metadata-format + * specific update data + * between successive calls to + * update_super() + */ int container_member; /* for assembling external-metatdata arrays * This is to be used internally by metadata @@ -749,11 +754,11 @@ extern int Examine(mddev_dev_t devlist, int brief, int export, int scan, extern int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, - int dosyslog, int test, char *pidfile); + int dosyslog, int test, char *pidfile, int increments); extern int Kill(char *dev, int force, int quiet, int noexcl); extern int Wait(char *dev); -extern int WaitClean(char *dev, int verbose); +extern int WaitClean(char *dev, int sock, int verbose); extern int Incremental(char *devname, int verbose, int runstop, struct supertype *st, char *homehost, int require_homehost, @@ -813,6 +818,7 @@ extern void uuid_from_super(int uuid[4], mdp_super_t *super); extern const int uuid_match_any[4]; extern int same_uuid(int a[4], int b[4], int swapuuid); extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); extern char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep); extern unsigned long calc_csum(void *super, int bytes); diff --git a/mdmon.c b/mdmon.c index 31994d8..0ec4259 100644 --- a/mdmon.c +++ b/mdmon.c @@ -113,6 +113,14 @@ static struct superswitch *find_metadata_methods(char *vers) return NULL; } +static int test_pidfile(char *devname) +{ + char path[100]; + struct stat st; + + sprintf(path, "/var/run/mdadm/%s.pid", devname); + return stat(path, &st); +} int make_pidfile(char *devname, int o_excl) { @@ -149,27 +157,30 @@ int is_container_member(struct mdstat_ent *mdstat, char *container) return 1; } -void remove_pidfile(char *devname); -static void try_kill_monitor(char *devname) +pid_t devname2mdmon(char *devname) +{ + char buf[100]; + pid_t pid = -1; + int fd; + + sprintf(buf, "/var/run/mdadm/%s.pid", devname); + fd = open(buf, O_RDONLY|O_NOATIME); + if (fd < 0) + return -1; + + if (read(fd, buf, sizeof(buf)) > 0) + sscanf(buf, "%d\n", &pid); + close(fd); + + return pid; +} + +static void try_kill_monitor(pid_t pid, char *devname, int sock) { char buf[100]; int fd; - pid_t pid; struct mdstat_ent *mdstat; - sprintf(buf, "/var/run/mdadm/%s.pid", devname); - fd = open(buf, O_RDONLY); - if (fd < 0) - return; - - if (read(fd, buf, sizeof(buf)) < 0) { - close(fd); - return; - } - - close(fd); - pid = strtoul(buf, NULL, 10); - /* first rule of survival... don't off yourself */ if (pid == getpid()) return; @@ -194,10 +205,9 @@ static void try_kill_monitor(char *devname) for ( ; mdstat; mdstat = mdstat->next) if (is_container_member(mdstat, devname)) { sprintf(buf, "/dev/%s", mdstat->dev); - WaitClean(buf, 0); + WaitClean(buf, sock, 0); } free_mdstat(mdstat); - remove_pidfile(devname); } void remove_pidfile(char *devname) @@ -355,9 +365,34 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) int pfd[2]; int status; int ignore; + pid_t victim = -1; + int victim_sock = -1; dprintf("starting mdmon for %s in %s\n", devname, switchroot ? : "/"); + + /* try to spawn mdmon instances from the target file system */ + if (switchroot && strcmp(switchroot, "/") != 0) { + char path[1024]; + pid_t pid; + + sprintf(path, "%s/sbin/mdmon", switchroot); + switch (fork()) { + case 0: + execl(path, "mdmon", devname, NULL); + exit(1); + case -1: + return 1; + default: + pid = wait(&status); + if (pid > -1 && WIFEXITED(status) && + WEXITSTATUS(status) == 0) + return 0; + else + return 1; + } + } + mdfd = open_dev(devnum); if (mdfd < 0) { fprintf(stderr, "mdmon: %s: %s\n", devname, @@ -400,6 +435,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) container->devname = devname; container->arrays = NULL; container->subarray[0] = 0; + container->sock = -1; if (!container->devname) { fprintf(stderr, "mdmon: failed to allocate container name string\n"); @@ -464,12 +500,10 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) if (switchroot) { /* we assume we assume that /sys /proc /dev are available in - * the new root (see nash:setuproot) - * - * kill any monitors in the current namespace and change - * to the new one + * the new root */ - try_kill_monitor(container->devname); + victim = devname2mdmon(container->devname); + victim_sock = connect_monitor(container->devname); if (chroot(switchroot) != 0) { fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n", switchroot, strerror(errno)); @@ -477,40 +511,15 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) } } - /* If this fails, we hope it already exists - * pid file lives in /var/run/mdadm/mdXX.pid - */ - mkdir("/var", 0600); - mkdir("/var/run", 0600); - mkdir("/var/run/mdadm", 0600); ignore = chdir("/"); - if (make_pidfile(container->devname, O_EXCL) < 0) { + if (victim < 0 && test_pidfile(container->devname) == 0) { if (ping_monitor(container->devname) == 0) { fprintf(stderr, "mdmon: %s already managed\n", container->devname); exit(3); - } else { - int err; - - /* cleanup the old monitor, this one is taking over */ - try_kill_monitor(container->devname); - err = make_pidfile(container->devname, 0); - if (err < 0) { - fprintf(stderr, "mdmon: %s Cannot create pidfile\n", - container->devname); - if (err == -EROFS) { - /* FIXME implement a mechanism to - * prevent duplicate monitor instances - */ - fprintf(stderr, - "mdmon: continuing on read-only file system\n"); - } else - exit(3); - } - } + } else if (victim < 0) + victim = devname2mdmon(container->devname); } - container->sock = make_control_sock(container->devname); - if (container->ss->load_super(container, mdfd, devname)) { fprintf(stderr, "mdmon: Cannot load metadata for %s\n", devname); @@ -536,7 +545,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) ignore = dup(0); #endif - mlockall(MCL_FUTURE); + mlockall(MCL_CURRENT | MCL_FUTURE); if (clone_monitor(container) < 0) { fprintf(stderr, "mdmon: failed to start monitor process: %s\n", @@ -544,6 +553,10 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) exit(2); } + if (victim > -1) { + try_kill_monitor(victim, container->devname, victim_sock); + close(victim_sock); + } do_manager(container); exit(0); diff --git a/mdopen.c b/mdopen.c index d322cf4..21baf5d 100644 --- a/mdopen.c +++ b/mdopen.c @@ -43,7 +43,7 @@ void make_parts(char *dev, int cnt) int odig = odig; /* quiet gcc -Os unitialized warning */ int i; int nlen = strlen(dev) + 20; - char *name = malloc(nlen); + char *name; int dig = isdigit(dev[strlen(dev)-1]); char orig[1024]; char sym[1024]; @@ -52,6 +52,7 @@ void make_parts(char *dev, int cnt) if (cnt==0) cnt=4; if (lstat(dev, &stb)!= 0) return; + if (S_ISLNK(stb.st_mode)) { int len = readlink(dev, orig, sizeof(orig)); if (len < 0 || len > 1000) @@ -63,6 +64,7 @@ void make_parts(char *dev, int cnt) minor_num = minor(stb.st_rdev); } else return; + name = malloc(nlen); for (i=1; i <= cnt ; i++) { struct stat stb2; snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); @@ -92,6 +94,7 @@ void make_parts(char *dev, int cnt) if (err == 0 && stat(name, &stb2) == 0) add_dev(name, &stb2, 0, NULL); } + free(name); } @@ -156,7 +159,6 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, if (dev) { - if (strncmp(dev, "/dev/md/", 8) == 0) { strcpy(cname, dev+8); } else if (strncmp(dev, "/dev/", 5) == 0) { @@ -307,7 +309,7 @@ int create_mddev(char *dev, char *name, int autof, int trustworthy, } } - if (dev) + if (dev && dev[0] == '/') strcpy(chosen, dev); else if (cname[0] == 0) strcpy(chosen, devname); diff --git a/msg.c b/msg.c index 5a4839f..8d52b94 100644 --- a/msg.c +++ b/msg.c @@ -177,10 +177,8 @@ int connect_monitor(char *devname) return sfd; } -/* give the monitor a chance to update the metadata */ -int ping_monitor(char *devname) +int fping_monitor(int sfd) { - int sfd = connect_monitor(devname); int err = 0; if (sfd < 0) @@ -194,6 +192,16 @@ int ping_monitor(char *devname) if (!err && wait_reply(sfd, 20) != 0) err = -1; + return err; +} + + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err = fping_monitor(sfd); + close(sfd); return err; } diff --git a/msg.h b/msg.h index b9bd205..f8e89fd 100644 --- a/msg.h +++ b/msg.h @@ -27,6 +27,7 @@ extern int ack(int fd, int tmo); extern int wait_reply(int fd, int tmo); extern int connect_monitor(char *devname); extern int ping_monitor(char *devname); +extern int fping_monitor(int sock); extern int ping_manager(char *devname); #define MSG_MAX_LEN (4*1024*1024) diff --git a/super-ddf.c b/super-ddf.c index 9bf08c2..06858e2 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -1589,13 +1589,8 @@ static int init_super_ddf(struct supertype *st, struct phys_disk *pd; struct virtual_disk *vd; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, - uuid); + return init_super_ddf_bvd(st, info, size, name, homehost, uuid); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); @@ -1604,6 +1599,12 @@ static int init_super_ddf(struct supertype *st, memset(ddf, 0, sizeof(*ddf)); ddf->dlist = NULL; /* no physical disks yet */ ddf->conflist = NULL; /* No virtual disks yet */ + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } /* At least 32MB *must* be reserved for the ddf. So let's just * start 32MB from the end, and put the primary header there. @@ -2971,12 +2972,22 @@ static struct mdinfo *container_content_ddf(struct supertype *st) return rest; } -static int store_zero_ddf(struct supertype *st, int fd) +static int store_super_ddf(struct supertype *st, int fd) { + struct ddf_super *ddf = st->sb; unsigned long long dsize; void *buf; int rc; + if (!ddf) + return 1; + + /* ->dlist and ->conflist will be set for updates, currently not + * supported + */ + if (ddf->dlist || ddf->conflist) + return 1; + if (!get_dev_size(fd, NULL, &dsize)) return 1; @@ -3627,7 +3638,7 @@ struct superswitch super_ddf = { .load_super = load_super_ddf, .init_super = init_super_ddf, - .store_super = store_zero_ddf, + .store_super = store_super_ddf, .free_super = free_super_ddf, .match_metadata_desc = match_metadata_desc_ddf, .container_content = container_content_ddf, diff --git a/super-intel.c b/super-intel.c index 07b0b90..9a99d60 100644 --- a/super-intel.c +++ b/super-intel.c @@ -265,6 +265,14 @@ struct intel_super { struct bbm_log *bbm_log; const char *hba; /* device path of the raid controller for this metadata */ const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; }; struct extent { @@ -611,6 +619,21 @@ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) return rv; } +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + #ifndef MDASSEMBLE static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { @@ -676,7 +699,6 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) { struct imsm_disk *disk = __get_imsm_disk(mpb, index); char str[MAX_RAID_SERIAL_LEN + 1]; - __u32 s; __u64 sz; if (index < 0) @@ -685,10 +707,9 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) printf("\n"); snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); printf(" Disk%02d Serial : %s\n", index, str); - s = disk->status; - printf(" State :%s%s%s\n", s&SPARE_DISK ? " spare" : "", - s&CONFIGURED_DISK ? " active" : "", - s&FAILED_DISK ? " failed" : ""); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); sz = __le32_to_cpu(disk->total_blocks) - reserved; printf(" Usable Size : %llu%s\n", (unsigned long long)sz, @@ -1298,7 +1319,6 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; struct imsm_disk *disk; - __u32 s; if (super->current_vol >= 0) { getinfo_super_imsm_volume(st, info); @@ -1334,14 +1354,13 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) disk = &super->disks->disk; info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved; info->component_size = reserved; - s = disk->status; - info->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; /* we don't change info->disk.raid_disk here because * this state will be finalized in mdmon after we have * found the 'most fresh' version of the metadata */ - info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; - info->disk.state |= s & SPARE_DISK ? 0 : (1 << MD_DISK_SYNC); + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); } /* only call uuid_from_super_imsm when this disk is part of a populated container, @@ -1359,8 +1378,6 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost) { - /* FIXME */ - /* For 'assemble' and 'force' we need to return non-zero if any * change was made. For others, the return value is ignored. * Update options are: @@ -1376,26 +1393,55 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, * linear only * resync: mark as dirty so a resync will happen. * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given * * Following are not relevant for this imsm: * sparc2.2 : update from old dodgey metadata * super-minor: change the preferred_minor number * summaries: update redundant counters. - * uuid: Change the uuid of the array to match watch is given * homehost: update the recorded homehost * _reshape_progress: record new reshape_progress position. */ - int rv = 0; - //struct intel_super *super = st->sb; - //struct imsm_super *mpb = super->mpb; + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; - if (strcmp(update, "grow") == 0) { - } - if (strcmp(update, "resync") == 0) { - /* dev->vol.dirty = 1; */ - } + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; - /* IMSM has no concept of UUID or homehost */ + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private) + fprintf(stderr, + Name ": '--uuid' not supported for imsm metadata\n"); + else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) { + mpb->orig_family_num = *((__u32 *) info->update_private); + rv = 0; + } else if (strcmp(update, "uuid") == 0) { + __u32 *new_family = malloc(sizeof(*new_family)); + + /* update orig_family_number with the incoming random + * data, report the new effective uuid, and store the + * new orig_family_num for future updates. + */ + if (new_family) { + memcpy(&mpb->orig_family_num, info->uuid, sizeof(__u32)); + uuid_from_super_imsm(st, info->uuid); + *new_family = mpb->orig_family_num; + info->update_private = new_family; + rv = 0; + } + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + fprintf(stderr, + Name ": '--update=%s' not supported for imsm metadata\n", + update); + + /* successful update? recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); return rv; } @@ -1458,19 +1504,33 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) return 0; } - if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0) - return 3; - /* if an anchor does not have num_raid_devs set then it is a free * floating spare */ if (first->anchor->num_raid_devs > 0 && sec->anchor->num_raid_devs > 0) { - if (first->anchor->orig_family_num != sec->anchor->orig_family_num || - first->anchor->family_num != sec->anchor->family_num) + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) return 3; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) + return 3; + } + /* if 'first' is a spare promote it to a populated mpb with sec's * family number */ @@ -1537,7 +1597,6 @@ static void fd2devname(int fd, char *name) snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); } - extern int scsi_get_serial(int fd, void *buf, size_t buf_len); static int imsm_read_serial(int fd, char *devname, @@ -1620,6 +1679,7 @@ static void serialcpy(__u8 *dest, __u8 *src) strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); } +#ifndef MDASSEMBLE static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) { struct dl *dl; @@ -1630,15 +1690,34 @@ static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) return dl; } +#endif + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} static int load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) { + struct imsm_disk *disk; struct dl *dl; struct stat stb; int rv; - int i; - int alloc = 1; + char name[40]; __u8 serial[MAX_RAID_SERIAL_LEN]; rv = imsm_read_serial(fd, devname, serial); @@ -1646,16 +1725,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) if (rv != 0) return 2; - /* check if this is a disk we have seen before. it may be a spare in - * super->disks while the current anchor believes it is a raid member, - * check if we need to update dl->index - */ - dl = serial_to_dl(serial, super); - if (!dl) - dl = malloc(sizeof(*dl)); - else - alloc = 0; - + dl = calloc(1, sizeof(*dl)); if (!dl) { if (devname) fprintf(stderr, @@ -1664,53 +1734,35 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) return 2; } - if (alloc) { - fstat(fd, &stb); - dl->major = major(stb.st_rdev); - dl->minor = minor(stb.st_rdev); - dl->next = super->disks; - dl->fd = keep_fd ? fd : -1; - dl->devname = devname ? strdup(devname) : NULL; - serialcpy(dl->serial, serial); - dl->index = -2; - dl->e = NULL; - } else if (keep_fd) { - close(dl->fd); - dl->fd = fd; - } + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = strdup(devname); + else + dl->devname = strdup(name); /* look up this disk's index in the current anchor */ - for (i = 0; i < super->anchor->num_disks; i++) { - struct imsm_disk *disk_iter; - - disk_iter = __get_imsm_disk(super->anchor, i); - - if (serialcmp(disk_iter->serial, dl->serial) == 0) { - dl->disk = *disk_iter; - /* only set index on disks that are a member of a - * populated contianer, i.e. one with raid_devs - */ - if (dl->disk.status & FAILED_DISK) - dl->index = -2; - else if (dl->disk.status & SPARE_DISK) - dl->index = -1; - else - dl->index = i; - - break; - } - } - - /* no match, maybe a stale failed drive */ - if (i == super->anchor->num_disks && dl->index >= 0) { - dl->disk = *__get_imsm_disk(super->anchor, dl->index); - if (dl->disk.status & FAILED_DISK) + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; } - if (alloc) - super->disks = dl; - return 0; } @@ -1852,7 +1904,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) struct stat; struct imsm_super *anchor; __u32 check_sum; - int rc; get_dev_size(fd, NULL, &dsize); @@ -1914,10 +1965,7 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 2; } - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); - return rc; + return 0; } /* read the extended mpb */ @@ -1953,11 +2001,23 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) */ super->bbm_log = __get_imsm_bbm_log(super->anchor); - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); + return 0; +} - return rc; +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + + return err; } static void __free_imsm_disk(struct dl *d) @@ -2087,19 +2147,333 @@ static int find_missing(struct intel_super *super) return 0; } +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("%s: mpb from %d:%d matches %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("%s: mpb from %d:%d replaces %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("%s: mpb from %d:%d prefer %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = calloc(1, sizeof(*idisk)); + if (!idisk) + return -1; + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("%s: '%.16s' owner %d != %d\n", + __func__, disk->serial, idisk->owner, + owner); + } else { + dprintf("%s: unknown disk %x [%d]: %.16s\n", + __func__, __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("%s: marking family: %#x from %d:%d offline\n", + __func__, mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + fprintf(stderr, "Chose family %#x on '%s', " + "assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, char *devname, int keep_fd) { struct mdinfo *sra; - struct intel_super *super; - struct mdinfo *sd, *best = NULL; - __u32 bestgen = 0; - __u32 gen; - char nm[20]; - int dfd; - int rv; + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; int devnum = fd2devnum(fd); + struct mdinfo *sd; int retry; + int err = 0; + int i; enum sysfs_read_flags flags; flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE; @@ -2116,81 +2490,51 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, strcmp(sra->text_version, "imsm") != 0) return 1; - super = alloc_super(0); - if (!super) - return 1; + /* load all mpbs */ + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + struct intel_super *s = alloc_super(0); + char nm[20]; + int dfd; - /* find the most up to date disk in this array, skipping spares */ - for (sd = sra->devs; sd; sd = sd->next) { + err = 1; + if (!s) + goto error; + s->next = super_list; + super_list = s; + + err = 2; sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - rv = load_imsm_mpb(dfd, super, NULL); + if (dfd < 0) + goto error; + + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); /* retry the load if we might have raced against mdmon */ - if (rv == 3 && mdmon_running(devnum)) + if (err == 3 && mdmon_running(devnum)) for (retry = 0; retry < 3; retry++) { usleep(3000); - rv = load_imsm_mpb(dfd, super, NULL); - if (rv != 3) + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) break; } if (!keep_fd) close(dfd); - if (rv == 0) { - if (super->anchor->num_raid_devs == 0) - gen = 0; - else - gen = __le32_to_cpu(super->anchor->generation_num); - if (!best || gen > bestgen) { - bestgen = gen; - best = sd; - } - } else { - free_imsm(super); - return rv; - } + if (err) + goto error; } - if (!best) { - free_imsm(super); - return 1; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; } - /* load the most up to date anchor */ - sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); - dfd = dev_open(nm, O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 1; - } - rv = load_imsm_mpb(dfd, super, NULL); - close(dfd); - if (rv != 0) { - free_imsm(super); - return 2; - } - - /* re-parse the disk list with the current anchor */ - for (sd = sra->devs ; sd ; sd = sd->next) { - sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - load_imsm_disk(dfd, super, NULL, keep_fd); - if (!keep_fd) - close(dfd); - } - - if (find_missing(super) != 0) { free_imsm(super); - return 2; + err = 2; + goto error; } if (st->subarray[0]) { @@ -2198,13 +2542,26 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, super->current_vol = atoi(st->subarray); else { free_imsm(super); - return 1; + err = 1; + goto error; } } + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; *sbp = super; st->container_dev = devnum; - if (st->ss == NULL) { + if (err == 0 && st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; st->max_devs = IMSM_MAX_DEVICES; @@ -2235,7 +2592,7 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) return 1; } - rv = load_imsm_mpb(fd, super, devname); + rv = load_and_parse_mpb(fd, super, devname, 0); if (rv) { if (devname) @@ -2491,24 +2848,33 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, size_t mpb_size; char *version; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, - uuid); + return init_super_imsm_volume(st, info, size, name, homehost, uuid); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; super = alloc_super(1); - if (!super) - return 0; - mpb_size = disks_to_mpb_size(info->nr_disks); - if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { free(super); + super = NULL; + } + if (!super) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); return 0; } + memset(super->buf, 0, mpb_size); mpb = super->buf; - memset(mpb, 0, mpb_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; @@ -2516,9 +2882,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, strcpy(version, MPB_SIGNATURE); version += strlen(MPB_SIGNATURE); strcpy(version, MPB_VERSION_RAID0); - mpb->mpb_size = mpb_size; - st->sb = super; return 1; } @@ -2651,39 +3015,48 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, return 0; } -static int store_imsm_mpb(int fd, struct intel_super *super); +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); /* spare records have their own family number and do not have any defined raid * devices */ static int write_super_imsm_spares(struct intel_super *super, int doclose) { - struct imsm_super mpb_save; struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; __u32 sum; struct dl *d; - mpb_save = *mpb; - mpb->num_raid_devs = 0; - mpb->num_disks = 1; - mpb->mpb_size = sizeof(struct imsm_super); - mpb->generation_num = __cpu_to_le32(1UL); + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)), + spare->generation_num = __cpu_to_le32(1UL), + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1, + spare->num_raid_devs = 0, + spare->cache_size = mpb->cache_size, + spare->pwr_cycle_count = __cpu_to_le32(1), + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); for (d = super->disks; d; d = d->next) { if (d->index != -1) continue; - mpb->disk[0] = d->disk; - sum = __gen_imsm_checksum(mpb); - mpb->family_num = __cpu_to_le32(sum); - mpb->orig_family_num = 0; - sum = __gen_imsm_checksum(mpb); - mpb->check_sum = __cpu_to_le32(sum); + spare->disk[0] = d->disk; + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); - if (store_imsm_mpb(d->fd, super)) { + if (store_imsm_mpb(d->fd, spare)) { fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); - *mpb = mpb_save; return 1; } if (doclose) { @@ -2692,7 +3065,6 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose) } } - *mpb = mpb_save; return 0; } @@ -2744,7 +3116,7 @@ static int write_super_imsm(struct intel_super *super, int doclose) for (d = super->disks; d ; d = d->next) { if (d->index < 0) continue; - if (store_imsm_mpb(d->fd, super)) + if (store_imsm_mpb(d->fd, mpb)) fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); if (doclose) { @@ -2852,24 +3224,19 @@ static int write_init_super_imsm(struct supertype *st) } #endif -static int store_zero_imsm(struct supertype *st, int fd) +static int store_super_imsm(struct supertype *st, int fd) { - unsigned long long dsize; - void *buf; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; - get_dev_size(fd, NULL, &dsize); - - /* first block is stored on second to last sector of the disk */ - if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) + if (!mpb) return 1; - if (posix_memalign(&buf, 512, 512) != 0) - return 1; - - memset(buf, 0, 512); - if (write(fd, buf, 512) != 512) - return 1; - return 0; +#ifndef MDASSEMBLE + return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif } static int imsm_bbm_log_size(struct imsm_super *mpb) @@ -3444,7 +3811,6 @@ static struct mdinfo *container_content_imsm(struct supertype *st) struct dl *d; int idx; int skip; - __u32 s; __u32 ord; skip = 0; @@ -3456,9 +3822,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d == NULL) skip = 1; - - s = d ? d->disk.status : 0; - if (s & FAILED_DISK) + if (d && is_failed(&d->disk)) skip = 1; if (ord & IMSM_ORD_REBUILD) skip = 1; @@ -3565,8 +3929,7 @@ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, insync = 2; disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) insync--; /* no in-sync disks left in this mirror the @@ -3616,8 +3979,7 @@ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev) idx = ord_to_idx(ord); disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) failed++; } @@ -3676,7 +4038,7 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) return 0; ord = __le32_to_cpu(map->disk_ord_tbl[slot]); - if ((disk->status & FAILED_DISK) && (ord & IMSM_ORD_REBUILD)) + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) return 0; disk->status |= FAILED_DISK; @@ -3824,9 +4186,9 @@ static void imsm_set_disk(struct active_array *a, int n, int state) } } -static int store_imsm_mpb(int fd, struct intel_super *super) +static int store_imsm_mpb(int fd, struct imsm_super *mpb) { - struct imsm_super *mpb = super->anchor; + void *buf = mpb; __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); unsigned long long dsize; unsigned long long sectors; @@ -3841,7 +4203,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) return 1; - if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors) + if (write(fd, buf + 512, 512 * sectors) != 512 * sectors) return 1; } @@ -3849,7 +4211,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) return 1; - if (write(fd, super->buf, 512) != 512) + if (write(fd, buf, 512) != 512) return 1; return 0; @@ -3877,7 +4239,7 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a if (dl->index == i) break; - if (dl && dl->disk.status & FAILED_DISK) + if (dl && is_failed(&dl->disk)) dl = NULL; if (dl) @@ -3915,11 +4277,10 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot, continue; /* skip in use or failed drives */ - if (dl->disk.status & FAILED_DISK || idx == dl->index || + if (is_failed(&dl->disk) || idx == dl->index || dl->index == -2) { dprintf("%x:%x status (failed: %d index: %d)\n", - dl->major, dl->minor, - (dl->disk.status & FAILED_DISK) == FAILED_DISK, idx); + dl->major, dl->minor, is_failed(&dl->disk), idx); continue; } @@ -4221,7 +4582,7 @@ static void imsm_process_update(struct supertype *st, if (i == u->slot) continue; disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); - if (!disk || disk->status & FAILED_DISK) + if (!disk || is_failed(disk)) failed++; } @@ -4584,7 +4945,7 @@ struct superswitch super_imsm = { .load_super = load_super_imsm, .init_super = init_super_imsm, - .store_super = store_zero_imsm, + .store_super = store_super_imsm, .free_super = free_super_imsm, .match_metadata_desc = match_metadata_desc_imsm, .container_content = container_content_imsm, diff --git a/sysfs.c b/sysfs.c index 5806fa7..35dfbd4 100644 --- a/sysfs.c +++ b/sysfs.c @@ -792,7 +792,7 @@ int sysfs_unique_holder(int devnum, long rdev) static char *clean_states[] = { "clear", "inactive", "readonly", "read-auto", "clean", NULL }; -int WaitClean(char *dev, int verbose) +int WaitClean(char *dev, int sock, int verbose) { int fd; struct mdinfo *mdi; @@ -868,7 +868,8 @@ int WaitClean(char *dev, int verbose) } if (rv < 0) rv = 1; - else if (ping_monitor(mdi->text_version) == 0) { + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { /* we need to ping to close the window between array * state transitioning to clean and the metadata being * marked clean diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 0000000..7389b0e --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,46 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -I $container +testdev ${member}_0 1 $size 1 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -I ${container}2 + +testdev ${member}_0 1 $size 1 +testdev ${member}_1 1 $size 1 diff --git a/tests/10ddf-create b/tests/10ddf-create index db22b64..a32dc0f 100644 --- a/tests/10ddf-create +++ b/tests/10ddf-create @@ -55,8 +55,8 @@ mdadm -Ss mdadm -Asc /var/tmp/mdadm.conf check nosync # This failed once. The raid5 was resyncing. -mdadm -Dbs > /tmp/mdadm.conf -diff /tmp/mdadm.conf /var/tmp/mdadm.conf +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - mdadm -Ss # and now assemble fully incrementally. @@ -70,7 +70,7 @@ do done check nosync -mdadm -Dbs > /tmp/mdadm.conf -diff /tmp/mdadm.conf /var/tmp/mdadm.conf +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - mdadm -Ss rm /tmp/mdadm.conf /var/tmp/mdadm.conf diff --git a/tests/env-09imsm-assemble b/tests/env-09imsm-assemble new file mode 100644 index 0000000..b12954b --- /dev/null +++ b/tests/env-09imsm-assemble @@ -0,0 +1,32 @@ +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_TEST_OROM=1 + container=/dev/md/container + member=/dev/md/vol0 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_TEST_OROM + unset imsm_check + unset container + unset member +} diff --git a/util.c b/util.c index 662061b..048c39f 100644 --- a/util.c +++ b/util.c @@ -336,17 +336,15 @@ void copy_uuid(void *a, int b[4], int swapuuid) memcpy(a, b, 16); } -char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) { int i, j; - int id; char uuid[16]; char *c = buf; strcpy(c, "UUID-"); c += strlen(c); - copy_uuid(uuid, info->uuid, st->ss->swapuuid); + copy_uuid(uuid, id, swap); for (i = 0; i < 4; i++) { - id = uuid[i]; if (i) *c++ = sep; for (j = 3; j >= 0; j--) { @@ -355,6 +353,12 @@ char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char } } return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +{ + return __fname_from_uuid(info->uuid, st->ss->swapuuid, buf, sep); } #ifndef MDASSEMBLE