From 7443ee81873b26516bc672cbe6a736030c5ec746 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Jan 2011 15:58:32 +1100 Subject: [PATCH] Refactor reshape monitoring. Combine all the non-backing-up code into a single function: progress_reshape. It is called repeatedly to monitor a reshape and allow it to happen safely. Have a single separate function 'child_monitor' which performs backups of data and calls progress_reshape to wait for the next backup to be needed. Signed-off-by: NeilBrown --- Grow.c | 721 +++++++++++++++++++----------------- external-reshape-design.txt | 70 +++- 2 files changed, 442 insertions(+), 349 deletions(-) diff --git a/Grow.c b/Grow.c index 43ef421..f3beed5 100644 --- a/Grow.c +++ b/Grow.c @@ -453,20 +453,6 @@ static __u32 bsb_csum(char *buf, int len) return __cpu_to_le32(csum); } -static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks, - int *fds, unsigned long long *offsets, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets); -static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks, - int *fds, unsigned long long *offsets, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets); -static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks, - int *fds, unsigned long long *offsets, - unsigned long long start, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets); - static int check_idle(struct supertype *st) { /* Check that all member arrays for this container, or the @@ -1291,6 +1277,11 @@ static int reshape_container(char *container, int cfd, char *devname, int force, char *backup_file, int quiet); +static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + unsigned long stripes, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); + int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, long long size, @@ -1507,6 +1498,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, } info.array = array; + sysfs_init(&info, fd, NoMdDev); + info.component_size = size*2; info.new_level = level; info.new_chunk = chunksize * 1024; if (raid_disks) @@ -1792,7 +1785,7 @@ static int reshape_array(char *container, int fd, char *devname, } sra = sysfs_read(fd, 0, - GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE||GET_CHUNK| GET_CACHE); if (!sra) { @@ -2001,32 +1994,10 @@ static int reshape_array(char *container, int fd, char *devname, odisks = reshape.before.data_disks + reshape.parity; - if (reshape.before.data_disks < - reshape.after.data_disks) - done = child_grow(fd, sra, stripes, - fdlist, offsets, - odisks, - info->array.chunk_size, - reshape.level, - reshape.before.layout, - reshape.before.data_disks, - d - odisks, fdlist+odisks, offsets+odisks); - else if (reshape.before.data_disks > - reshape.after.data_disks) - done = child_shrink(fd, sra, stripes, - fdlist, offsets, - odisks, info->array.chunk_size, reshape.level, - reshape.before.layout, - reshape.before.data_disks, - d - odisks, fdlist+odisks, offsets+odisks); - else - done = child_same_size(fd, sra, stripes, - fdlist, offsets, - 0, - odisks, info->array.chunk_size, reshape.level, - reshape.before.layout, - reshape.before.data_disks, - d - odisks, fdlist+odisks, offsets+odisks); + done = child_monitor(fd, sra, &reshape, stripes, + fdlist, offsets, + d - odisks, fdlist+odisks, offsets+odisks); + if (backup_file && done) unlink(backup_file); if (!done) @@ -2134,6 +2105,7 @@ int reshape_container(char *container, int cfd, char *devname, if (!adev) adev = cc->text_version; + sysfs_init(cc, fd, mdstat->devnum); rv = reshape_array(container, fd, adev, st, cc, force, backup_file, quiet, 1); close(fd); @@ -2167,10 +2139,244 @@ int reshape_container(char *container, int cfd, char *devname, * */ +int progress_reshape(struct mdinfo *info, struct reshape *reshape, + unsigned long long backup_point, + unsigned long long wait_point, + unsigned long long *suspend_point, + unsigned long long *reshape_completed) +{ + /* This function is called repeatedly by the reshape manager. + * It determines how much progress can safely be made and allows + * that progress. + * - 'info' identifies the array and particularly records in + * ->reshape_progress the metadata's knowledge of progress + * This is a sector offset from the start of the array + * of the next array block to be relocated. This number + * may increase from 0 or decrease from array_size, depending + * on the type of reshape that is happening. + * Note that in contrast, 'sync_completed' is a block count of the + * reshape so far. It gives the distance between the start point + * (head or tail of device) and the next place that data will be + * written. It always increases. + * - 'reshape' is the structure created by analyse_change + * - 'backup_point' shows how much the metadata manager has backed-up + * data. For reshapes with increasing progress, it is the next address + * to be backed up, previous addresses have been backed-up. For + * decreasing progress, it is the earliest address that has been + * backed up - later address are also backed up. + * So addresses between reshape_progress and backup_point are + * backed up providing those are in the 'correct' order. + * - 'wait_point' is an array address. When reshape_completed + * passes this point, progress_reshape should return. It might + * return earlier if it determines that ->reshape_progress needs + * to be updated or further backup is needed. + * - suspend_point is maintained by progress_reshape and the caller + * should not touch it except to initialise to zero. + * It is an array address and it only increases in 2.6.37 and earlier. + * This makes it difficulty to handle reducing reshapes with + * external metadata. + * However: it is similar to backup_point in that it records the + * other end of a suspended region from reshape_progress. + * it is moved to extend the region that is safe to backup and/or + * reshape + * - reshape_completed is read from sysfs and returned. The caller + * should copy this into ->reshape_progress when it has reason to + * believe that the metadata knows this, and any backup outside this + * has been erased. + * + * Return value is: + * 1 if more data from backup_point - but only as far as suspend_point, + * should be backed up + * 0 if things are progressing smoothly + * -1 if the reshape is finished, either because it is all done, + * or due to an error. + */ + + int advancing = (reshape->after.data_disks + >= reshape->before.data_disks); + int need_backup = (reshape->after.data_disks + == reshape->before.data_disks); + unsigned long long read_offset, write_offset; + unsigned long long read_range, write_range; + unsigned long long max_progress, target, completed; + int fd; + + /* First, we unsuspend any region that is now known to be safe. + * If suspend_point is on the 'wrong' side of reshape_progress, then + * we don't have or need suspension at the moment. This is true for + * native metadata when we don't need to back-up. + */ + if (advancing) { + if (info->reshape_progress < *suspend_point) + sysfs_set_num(info, NULL, "suspend_lo", + info->reshape_progress); + } else { + /* Note: this won't work in 2.6.37 and before. + * Something somewhere should make sure we don't need it! + */ + if (info->reshape_progress > *suspend_point) + sysfs_set_num(info, NULL, "suspend_hi", + info->reshape_progress); + } + + /* Now work out how far it is safe to progress. + * If the read_offset for ->reshape_progress is less than + * 'blocks' beyond the write_offset, we can only progress as far + * as a backup. + * Otherwise we can progress until the write_offset for the new location + * reaches (within 'blocks' of) the read_offset at the current location. + * However that region must be suspended unless we are using native + * metadata. + * If we need to suspend more, we limit it to 128M per device, which is + * rather arbitrary and should be some time-based calculation. + */ + write_offset = info->reshape_progress / reshape->before.data_disks; + read_offset = info->reshape_progress / reshape->after.data_disks; + write_range = reshape->blocks / reshape->before.data_disks; + read_range = reshape->blocks / reshape->after.data_disks; + if (advancing) { + if (read_offset < write_offset + write_range) { + max_progress = backup_point; + if (max_progress <= info->reshape_progress) + need_backup = 1; + } else { + max_progress = + (read_offset - write_range) * + reshape->before.data_disks; + } + } else { + if (read_offset > write_offset - write_range) { + max_progress = backup_point; + if (max_progress >= info->reshape_progress) + need_backup = 1; + } else { + max_progress = + (read_offset + write_range) * + reshape->before.data_disks; + /* If we are using internal metadata, then we can + * progress all the way to the suspend_point without + * worrying about backing-up/suspending along the + * way. + */ + if (max_progress < *suspend_point && + info->array.major_version >= 0) + max_progress = *suspend_point; + } + } + + /* We know it is safe to progress to 'max_progress' providing + * it is suspended or we are using native metadata. + * Consider extending suspend_point 128M per device if it + * is less than 64M per device beyond reshape_progress. + * But always do a multiple of 'blocks' + */ + target = 64*1024*2 * min(reshape->before.data_disks, + reshape->after.data_disks); + target /= reshape->blocks; + if (target < 2) + target = 2; + target *= reshape->blocks; + + /* For externally managed metadata we always need to suspend IO to + * the area being reshaped so we regularly push suspend_point forward. + * For native metadata we only need the suspend if we are going to do + * a backup. + */ + if (advancing) { + if ((need_backup || info->array.major_version < 0) && + *suspend_point < info->reshape_progress + target) { + if (max_progress < *suspend_point + 2 * target) + *suspend_point = max_progress; + else + *suspend_point += 2 * target; + sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); + max_progress = *suspend_point; + } + } else { + if ((need_backup || info->array.major_version < 0) && + *suspend_point > info->reshape_progress - target) { + if (max_progress > *suspend_point - 2 * target) + *suspend_point = max_progress; + else + *suspend_point -= 2 * target; + sysfs_set_num(info, NULL, "suspend_lo", *suspend_point); + max_progress = *suspend_point; + } + } + + /* now set sync_max to allow that progress. sync_max, like + * sync_completed is a count of sectors written per device, so + * we find the difference between max_progress and the start point, + * and divide that by after.data_disks to get a sync_max + * number. + * At the same time we convert wait_point to a similar number + * for comparing against sync_completed. + */ + if (!advancing) { + max_progress = info->component_size * reshape->after.data_disks + - max_progress; + wait_point = info->component_size * reshape->after.data_disks + - wait_point; + } + max_progress /= reshape->after.data_disks; + wait_point /= reshape->after.data_disks; + + sysfs_set_num(info, NULL, "sync_max", max_progress); + + /* Now wait. If we have already reached the point that we were + * asked to wait to, don't wait at all, else wait for any change. + * We need to select on 'sync_completed' as that is the place that + * notifications happen, but we are really interested in + * 'reshape_position' + */ + fd = sysfs_get_fd(info, NULL, "sync_completed"); + if (fd < 0) + return -1; + + if (sysfs_fd_get_ll(fd, &completed) < 0) { + close(fd); + return -1; + } + while (completed < max_progress && completed < wait_point) { + /* Check that sync_action is still 'reshape' to avoid + * waiting forever on a dead array + */ + char action[20]; + fd_set rfds; + if (sysfs_get_str(info, NULL, "sync_action", + action, 20) <= 0 || + strncmp(action, "reshape", 7) != 0) + break; + FD_ZERO(&rfds); + FD_SET(fd, &rfds); + select(fd+1, NULL, NULL, &rfds, NULL); + if (sysfs_fd_get_ll(fd, &completed) < 0) { + close(fd); + return -1; + } + } + /* Convert 'completed' back in to a 'progress' number */ + completed *= reshape->after.data_disks; + if (!advancing) { + completed = info->component_size * reshape->after.data_disks + - completed; + } + *reshape_completed = completed; + + close(fd); + + /* We return the need_backup flag. Caller will decide + * how much (a multiple of ->blocks) and will adjust + * suspend_{lo,hi} and suspend_point. + */ + return need_backup; +} + + /* FIXME return status is never checked */ static int grow_backup(struct mdinfo *sra, unsigned long long offset, /* per device */ - unsigned long stripes, /* per device */ + unsigned long stripes, /* per device, in old chunks */ int *sources, unsigned long long *offsets, int disks, int chunk, int level, int layout, int dests, int *destfd, unsigned long long *destoffsets, @@ -2193,7 +2399,7 @@ static int grow_backup(struct mdinfo *sra, odata--; if (level == 6) odata--; - sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata); + /* Check that array hasn't become degraded, else we might backup the wrong data */ sysfs_get_ll(sra, NULL, "degraded", &ll); new_degraded = (int)ll; @@ -2283,46 +2489,16 @@ static int grow_backup(struct mdinfo *sra, * every works. */ /* FIXME return value is often ignored */ -static int wait_backup(struct mdinfo *sra, - unsigned long long offset, /* per device */ - unsigned long long blocks, /* per device */ - unsigned long long blocks2, /* per device - hack */ +static int forget_backup( int dests, int *destfd, unsigned long long *destoffsets, int part) { - /* Wait for resync to pass the section that was backed up - * then erase the backup and allow IO + /* + * Erase backup 'part' (which is 0 or 1) */ - int fd = sysfs_get_fd(sra, NULL, "sync_completed"); - unsigned long long completed; int i; int rv; - if (fd < 0) - return -1; - sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2); - - if (sysfs_fd_get_ll(fd, &completed) < 0) { - close(fd); - return -1; - } - while (completed < offset + blocks) { - char action[20]; - fd_set rfds; - FD_ZERO(&rfds); - FD_SET(fd, &rfds); - select(fd+1, NULL, NULL, &rfds, NULL); - if (sysfs_fd_get_ll(fd, &completed) < 0) { - close(fd); - return -1; - } - if (sysfs_get_str(sra, NULL, "sync_action", - action, 20) > 0 && - strncmp(action, "reshape", 7) != 0) - break; - } - close(fd); - if (part) { bsb.arraystart2 = __cpu_to_le64(0); bsb.length2 = __cpu_to_le64(0); @@ -2442,130 +2618,133 @@ static void validate(int afd, int bfd, unsigned long long offset) } } -static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes, - int *fds, unsigned long long *offsets, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets) +static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + unsigned long stripes, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) { + /* Monitor a reshape where backup is being performed using + * 'native' mechanism - either to a backup file, or + * to some space in a spare. + */ char *buf; - int degraded = 0; + int degraded = -1; + unsigned long long speed; + unsigned long long suspend_point, array_size; + unsigned long long backup_point, wait_point; + unsigned long long reshape_completed; + int done = 0; + int increasing = reshape->after.data_disks >= reshape->before.data_disks; + int part = 0; /* The next part of the backup area to fill. It may already + * be full, so we need to check */ + int level = reshape->level; + int layout = reshape->before.layout; + int data = reshape->before.data_disks; + int disks = reshape->before.data_disks + reshape->parity; + int chunk = sra->array.chunk_size; if (posix_memalign((void**)&buf, 4096, disks * chunk)) /* Don't start the 'reshape' */ return 0; - grow_backup(sra, 0, stripes, - fds, offsets, disks, chunk, level, layout, - dests, destfd, destoffsets, - 0, °raded, buf); - validate(afd, destfd[0], destoffsets[0]); - wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512), - dests, destfd, destoffsets, - 0); - sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data); - free(buf); - /* FIXME this should probably be numeric */ - sysfs_set_str(sra, NULL, "sync_max", "max"); - return 1; -} - -static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, - int *fds, unsigned long long *offsets, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets) -{ - char *buf; - unsigned long long start; - int rv; - int degraded = 0; - - if (posix_memalign((void**)&buf, 4096, disks * chunk)) - return 0; - start = sra->component_size - stripes * (chunk/512); - sysfs_set_num(sra, NULL, "sync_max", start); - rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512), - dests, destfd, destoffsets, 0); - if (rv < 0) - return 0; - grow_backup(sra, 0, stripes, - fds, offsets, - disks, chunk, level, layout, - dests, destfd, destoffsets, - 0, °raded, buf); - validate(afd, destfd[0], destoffsets[0]); - wait_backup(sra, start, stripes*(chunk/512), 0, - dests, destfd, destoffsets, 0); - sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data); - free(buf); - /* FIXME this should probably be numeric */ - sysfs_set_str(sra, NULL, "sync_max", "max"); - return 1; -} - -static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, - int *fds, unsigned long long *offsets, - unsigned long long start, - int disks, int chunk, int level, int layout, int data, - int dests, int *destfd, unsigned long long *destoffsets) -{ - unsigned long long size; - unsigned long tailstripes = stripes; - int part; - char *buf; - unsigned long long speed; - int degraded = 0; - - - if (posix_memalign((void**)&buf, 4096, disks * chunk)) - return 0; - - sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); - sysfs_set_num(sra, NULL, "sync_speed_min", 200000); - - grow_backup(sra, start, stripes, - fds, offsets, - disks, chunk, level, layout, - dests, destfd, destoffsets, - 0, °raded, buf); - grow_backup(sra, (start + stripes) * (chunk/512), stripes, - fds, offsets, - disks, chunk, level, layout, - dests, destfd, destoffsets, - 1, °raded, buf); - validate(afd, destfd[0], destoffsets[0]); - part = 0; - start += stripes * 2; /* where to read next */ - size = sra->component_size / (chunk/512); - while (start < size) { - if (wait_backup(sra, (start-stripes*2)*(chunk/512), - stripes*(chunk/512), 0, - dests, destfd, destoffsets, - part) < 0) - return 0; - sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data); - if (start + stripes > size) - tailstripes = (size - start); - - grow_backup(sra, start*(chunk/512), tailstripes, - fds, offsets, - disks, chunk, level, layout, - dests, destfd, destoffsets, - part, °raded, buf); - start += stripes; - part = 1 - part; - validate(afd, destfd[0], destoffsets[0]); + if (reshape->before.data_disks == reshape->after.data_disks) { + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); } - if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0, - dests, destfd, destoffsets, - part) < 0) - return 0; - sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data); - wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0, - dests, destfd, destoffsets, - 1-part); - sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data); - sysfs_set_num(sra, NULL, "sync_speed_min", speed); + + array_size = sra->component_size * data; + if (increasing) { + backup_point = sra->reshape_progress; + suspend_point = 0; + } else { + backup_point = array_size; + suspend_point = array_size; + } + + while (!done) { + int rv; + + /* Want to return as soon the oldest backup slot can + * be released as that allows us to start backing up + * some more, providing suspend_point has been + * advanced, which it should have + */ + if (increasing) { + wait_point = array_size; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length)); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2)); + } else { + wait_point = 0; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = __le64_to_cpu(bsb.arraystart); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = __le64_to_cpu(bsb.arraystart2); + } + + rv = progress_reshape(sra, reshape, + backup_point, wait_point, + &suspend_point, &reshape_completed); + if (rv < 0) { + done = 1; + break; + } + + /* external metadata would need to ping_monitor here */ + sra->reshape_progress = reshape_completed; + + /* Clear any backup region that is before 'here' */ + if (increasing) { + if (reshape_completed >= (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length))) + forget_backup(dests, destfd, + destoffsets, 0); + if (reshape_completed >= (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2))) + forget_backup(dests, destfd, + destoffsets, 1); + } else { + if (reshape_completed <= (__le64_to_cpu(bsb.arraystart))) + forget_backup(dests, destfd, + destoffsets, 0); + if (reshape_completed <= (__le64_to_cpu(bsb.arraystart2))) + forget_backup(dests, destfd, + destoffsets, 1); + } + + if (rv) { + unsigned long long offset; + /* need to backup some space... */ + /* Check that 'part' is unused */ + if (part == 0 && __le64_to_cpu(bsb.length) != 0) + abort(); /* BUG here */ + if (part == 1 && __le64_to_cpu(bsb.length2) != 0) + abort(); + + offset = backup_point / data; + if (!increasing) + offset -= stripes * (chunk/512); + grow_backup(sra, offset, stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + part, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + /* record where 'part' is up to */ + part = !part; + if (increasing) + backup_point += stripes * (chunk/512) * data; + else + backup_point -= stripes * (chunk/512) * data; + } + } + + if (reshape->before.data_disks == reshape->after.data_disks) + sysfs_set_num(sra, NULL, "sync_speed_min", speed); free(buf); - return 1; + return 1; /* FIXME what does this mean? */ } /* @@ -2859,164 +3038,10 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, char *backup_file) { - /* Array is assembled and ready to be started, but - * monitoring is probably required. - * So: - * - start read-only - * - set upper bound for resync - * - initialise the 'suspend' boundaries - * - switch to read-write - * - fork and continue monitoring - */ - int err; - int backup_list[1]; - unsigned long long backup_offsets[1]; - int odisks, ndisks, ochunk, nchunk,odata,ndata; - unsigned long a,b,blocks,stripes; - int backup_fd; - int *fds; - unsigned long long *offsets; - int d; - struct mdinfo *sra, *sd; - int rv; - unsigned long cache; - int done = 0; - - err = sysfs_set_str(info, NULL, "array_state", "readonly"); + int err = sysfs_set_str(info, NULL, "array_state", "readonly"); if (err) return err; - - /* make sure reshape doesn't progress until we are ready */ - sysfs_set_str(info, NULL, "sync_max", "0"); - sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */ - - sra = sysfs_read(-1, devname2devnum(info->sys_name), - GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| - GET_CACHE); - if (!sra) - return 1; - - /* ndisks is not growing, so raid_disks is old and +delta is new */ - odisks = info->array.raid_disks; - ndisks = odisks + info->delta_disks; - odata = odisks - 1; - ndata = ndisks - 1; - if (info->array.level == 6) { - odata--; - ndata--; - } - ochunk = info->array.chunk_size; - nchunk = info->new_chunk; - - a = (ochunk/512) * odata; - b = (nchunk/512) * ndata; - /* Find GCD */ - while (a != b) { - if (a < b) - b -= a; - if (b < a) - a -= b; - } - /* LCM == product / GCD */ - blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; - - if (ndata == odata) - while (blocks * 32 < sra->component_size && - blocks < 16*1024*2) - blocks *= 2; - stripes = blocks / (info->array.chunk_size/512) / odata; - - /* check that the internal stripe cache is - * large enough, or it won't work. - */ - cache = (nchunk < ochunk) ? ochunk : nchunk; - cache = cache * 4 / 4096; - if (cache < blocks / 8 / odisks + 16) - /* Make it big enough to hold 'blocks' */ - cache = blocks / 8 / odisks + 16; - if (sra->cache_size < cache) - sysfs_set_num(sra, NULL, "stripe_cache_size", - cache+1); - - memset(&bsb, 0, 512); - memcpy(bsb.magic, "md_backup_data-1", 16); - memcpy(&bsb.set_uuid, info->uuid, 16); - bsb.mtime = __cpu_to_le64(time(0)); - bsb.devstart2 = blocks; - - backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR); - if (backup_fd < 0) { - fprintf(stderr, Name ": Cannot open backup file %s\n", - backup_file ?: "- no backup-file given"); - return 1; - } - backup_list[0] = backup_fd; - backup_offsets[0] = 8 * 512; - fds = malloc(odisks * sizeof(fds[0])); - offsets = malloc(odisks * sizeof(offsets[0])); - for (d=0; ddevs; sd; sd = sd->next) { - if (sd->disk.state & (1<disk.state & (1<disk.major, - sd->disk.minor, 1); - fds[sd->disk.raid_disk] - = dev_open(dn, O_RDONLY); - offsets[sd->disk.raid_disk] = sd->data_offset*512; - if (fds[sd->disk.raid_disk] < 0) { - fprintf(stderr, Name ": %s: cannot open component %s\n", - info->sys_name, dn?dn:"-unknown-"); - rv = 1; - goto release; - } - free(dn); - } - } - - switch(fork()) { - case 0: - close(mdfd); - mlockall(MCL_FUTURE); - if (info->delta_disks < 0) - done = child_shrink(-1, info, stripes, - fds, offsets, - info->array.raid_disks, - info->array.chunk_size, - info->array.level, info->array.layout, - odata, - 1, backup_list, backup_offsets); - else if (info->delta_disks == 0) { - /* The 'start' is a per-device stripe number. - * reshape_progress is a per-array sector number. - * So divide by ndata * chunk_size - */ - unsigned long long start = info->reshape_progress / ndata; - start /= (info->array.chunk_size/512); - done = child_same_size(-1, info, stripes, - fds, offsets, - start, - info->array.raid_disks, - info->array.chunk_size, - info->array.level, info->array.layout, - odata, - 1, backup_list, backup_offsets); - } - if (backup_file && done) - unlink(backup_file); - /* FIXME should I intuit a level change */ - exit(0); - case -1: - fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n", - strerror(errno)); - return 1; - default: - break; - } -release: - return 0; + return reshape_array(NULL, mdfd, "array", st, info, 1, backup_file, 0, 0); } diff --git a/external-reshape-design.txt b/external-reshape-design.txt index 23ba890..4eb04a2 100644 --- a/external-reshape-design.txt +++ b/external-reshape-design.txt @@ -205,8 +205,76 @@ sync_action 2.6 Reshape raid disks (shrink) -3 TODO +3 Interaction with metadata handle. + The following calls are made into the metadata handler to assist + with initiating and monitoring a 'reshape'. + + 1/ ->reshape_super is called quite early (after only minimial + checks) to make sure that the metadata can record the new shape + and any necessary transitions. It may be passed a 'container' + or an individual array within a container, and it should notice + the difference and act accordingly. + When a reshape is requested against a container it is expected + that it should be applied to every array in the container, + however it is up to the metadata handler to determine final + policy. + + If the reshape is supportable, the internal copy of the metadata + should be updated, and a metadata update suitable for sending + to mdmon should be queued. + + If the reshape will involve converting spares into array members, + this must be recorded in the metadata too. + + 2/ ->container_content will be called to find out the new state + of all the array, or all arrays in the container. Any newly + added devices (with state==0 and raid_disk >= 0) will be added + to the array as spares with the relevant slot number. + + It is likely that the info returned by ->container_content will + have ->reshape_active set, ->reshape_progress set to e.g. 0, and + new_* set appropriately. mdadm will use this information to + cause the correct reshape to start at an appropriate time. + + 3/ ->set_array_state will be called by mdmon when reshape has + started and again periodically as it progresses. This should + record the ->last_checkpoint as the point where reshape has + progressed to. When the reshape finished this will be called + again and it should notice that ->curr_action is no longer + 'reshape' and so should record that the reshape has finished + providing 'last_checkpoint' has progressed suitably. + + 4/ ->manage_reshape will be called once the reshape has been set + up in the kernel but before sync_max has been moved from 0, so + no actual reshape will have happened. + + ->manage_reshape should call progress_reshape() to allow the + reshape to progress, and should back-up any data as indicated + by the return value. See the documentation of that function + for more details. + ->manage_reshape will be called multiple times when a + container is being reshaped, once for each member array in + the container. + + + The progress of the metadata is as follows: + 1/ mdadm sends a metadata update to mdmon which marks the array + as undergoing a reshape. This is set up by + ->reshape_super and applied by ->process_update + For container-wide reshape, this happens once for the whole + container. + 2/ mdmon notices progress via the sysfs files and calls + ->set_array_state to update the state periodically + For container-wide reshape, this happens repeatedly for + one array, then repeatedly for the next, etc. + 3/ mdmon notices when reshape has finished and call + ->set_array_state to record the the reshape is complete. + For container-wide reshape, this happens once for each + member array. + + + ... [1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/