From 7443ee81873b26516bc672cbe6a736030c5ec746 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 6 Jan 2011 15:58:32 +1100
Subject: [PATCH] Refactor reshape monitoring.

Combine all the non-backing-up code into a single function:
progress_reshape.  It is called repeatedly to monitor a
reshape and allow it to happen safely.

Have a single separate function 'child_monitor' which
performs backups of data and calls progress_reshape to
wait for the next backup to be needed.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 Grow.c                      | 721 +++++++++++++++++++-----------------
 external-reshape-design.txt |  70 +++-
 2 files changed, 442 insertions(+), 349 deletions(-)

diff --git a/Grow.c b/Grow.c
index 43ef421..f3beed5 100644
--- a/Grow.c
+++ b/Grow.c
@@ -453,20 +453,6 @@ static __u32 bsb_csum(char *buf, int len)
 	return __cpu_to_le32(csum);
 }
 
-static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks,
-		      int *fds, unsigned long long *offsets,
-		      int disks, int chunk, int level, int layout, int data,
-		      int dests, int *destfd, unsigned long long *destoffsets);
-static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
-			int *fds, unsigned long long *offsets,
-			int disks, int chunk, int level, int layout, int data,
-			int dests, int *destfd, unsigned long long *destoffsets);
-static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
-			   int *fds, unsigned long long *offsets,
-			   unsigned long long start,
-			   int disks, int chunk, int level, int layout, int data,
-			   int dests, int *destfd, unsigned long long *destoffsets);
-
 static int check_idle(struct supertype *st)
 {
 	/* Check that all member arrays for this container, or the
@@ -1291,6 +1277,11 @@ static int reshape_container(char *container, int cfd, char *devname,
 			     int force,
 			     char *backup_file,
 			     int quiet);
+static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+			 unsigned long stripes,
+			 int *fds, unsigned long long *offsets,
+			 int dests, int *destfd, unsigned long long *destoffsets);
+
 
 int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
 		 long long size,
@@ -1507,6 +1498,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
 	}
 
 	info.array = array;
+	sysfs_init(&info, fd, NoMdDev);
+	info.component_size = size*2;
 	info.new_level = level;
 	info.new_chunk = chunksize * 1024;
 	if (raid_disks)
@@ -1792,7 +1785,7 @@ static int reshape_array(char *container, int fd, char *devname,
 	}
 
 	sra = sysfs_read(fd, 0,
-			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
 			 GET_CACHE);
 
 	if (!sra) {
@@ -2001,32 +1994,10 @@ static int reshape_array(char *container, int fd, char *devname,
 
 		odisks = reshape.before.data_disks + reshape.parity;
 
-		if (reshape.before.data_disks < 
-		    reshape.after.data_disks)
-			done = child_grow(fd, sra, stripes,
-					  fdlist, offsets,
-					  odisks,
-					  info->array.chunk_size,
-					  reshape.level,
-					  reshape.before.layout, 
-					  reshape.before.data_disks,
-					  d - odisks, fdlist+odisks, offsets+odisks);
-		else if (reshape.before.data_disks >
-			 reshape.after.data_disks)
-			done = child_shrink(fd, sra, stripes,
-					    fdlist, offsets,
-					    odisks, info->array.chunk_size, reshape.level,	
-					    reshape.before.layout,
-					    reshape.before.data_disks,
-					    d - odisks, fdlist+odisks, offsets+odisks);
-		else
-			done = child_same_size(fd, sra, stripes,
-					       fdlist, offsets,
-					       0,
-					       odisks, info->array.chunk_size, reshape.level,
-					       reshape.before.layout,
-					       reshape.before.data_disks,
-					       d - odisks, fdlist+odisks, offsets+odisks);
+		done = child_monitor(fd, sra, &reshape, stripes,
+				     fdlist, offsets,
+				     d - odisks, fdlist+odisks, offsets+odisks);
+
 		if (backup_file && done)
 			unlink(backup_file);
 		if (!done)
@@ -2134,6 +2105,7 @@ int reshape_container(char *container, int cfd, char *devname,
 		if (!adev)
 			adev = cc->text_version;
 
+		sysfs_init(cc, fd, mdstat->devnum);
 		rv = reshape_array(container, fd, adev, st, cc, force,
 				   backup_file, quiet, 1);
 		close(fd);
@@ -2167,10 +2139,244 @@ int reshape_container(char *container, int cfd, char *devname,
  * 
  */
 
+int progress_reshape(struct mdinfo *info, struct reshape *reshape,
+		     unsigned long long backup_point,
+		     unsigned long long wait_point,
+		     unsigned long long *suspend_point,
+		     unsigned long long *reshape_completed)
+{
+	/* This function is called repeatedly by the reshape manager.
+	 * It determines how much progress can safely be made and allows
+	 * that progress.
+	 * - 'info' identifies the array and particularly records in
+	 *    ->reshape_progress the metadata's knowledge of progress
+	 *      This is a sector offset from the start of the array
+	 *      of the next array block to be relocated.  This number
+	 *      may increase from 0 or decrease from array_size, depending
+	 *      on the type of reshape that is happening.
+	 *    Note that in contrast, 'sync_completed' is a block count of the
+	 *    reshape so far.  It gives the distance between the start point
+	 *    (head or tail of device) and the next place that data will be
+	 *    written.  It always increases.
+	 * - 'reshape' is the structure created by analyse_change
+	 * - 'backup_point' shows how much the metadata manager has backed-up
+	 *   data.  For reshapes with increasing progress, it is the next address
+	 *   to be backed up, previous addresses have been backed-up.  For
+	 *   decreasing progress, it is the earliest address that has been
+	 *   backed up - later addresses are also backed up.
+	 *   So addresses between reshape_progress and backup_point are
+	 *   backed up providing those are in the 'correct' order.
+	 * - 'wait_point' is an array address.  When reshape_completed
+	 *   passes this point, progress_reshape should return.  It might
+	 *   return earlier if it determines that ->reshape_progress needs
+	 *   to be updated or further backup is needed.
+	 * - suspend_point is maintained by progress_reshape and the caller
+	 *   should not touch it except to initialise to zero.
+	 *   It is an array address and it only increases in 2.6.37 and earlier.
+	 *   This makes it difficult to handle reducing reshapes with
+	 *   external metadata.
+	 *   However:  it is similar to backup_point in that it records the
+	 *     other end of a suspended region from reshape_progress.
+	 *     It is moved to extend the region that is safe to backup and/or
+	 *     reshape.
+	 * - reshape_completed is read from sysfs and returned.  The caller
+	 *   should copy this into ->reshape_progress when it has reason to
+	 *   believe that the metadata knows this, and any backup outside this
+	 *   has been erased.
+	 *
+	 * Return value is:
+	 *   1 if more data from backup_point - but only as far as suspend_point,
+	 *     should be backed up
+	 *   0 if things are progressing smoothly
+	 *  -1 if the reshape is finished, either because it is all done,
+	 *     or due to an error.  NOTE(review): as written, -1 is only returned
+	 *     when "sync_completed" cannot be opened or read - confirm. */
+
+	int advancing = (reshape->after.data_disks
+			 >= reshape->before.data_disks);
+	int need_backup = (reshape->after.data_disks
+			   == reshape->before.data_disks);
+	unsigned long long read_offset, write_offset;
+	unsigned long long read_range, write_range;
+	unsigned long long max_progress, target, completed;
+	int fd;
+
+	/* First, we unsuspend any region that is now known to be safe.
+	 * If suspend_point is on the 'wrong' side of reshape_progress, then
+	 * we don't have or need suspension at the moment.  This is true for
+	 * native metadata when we don't need to back-up.
+	 */
+	if (advancing) {
+		if (info->reshape_progress < *suspend_point)
+			sysfs_set_num(info, NULL, "suspend_lo",
+				      info->reshape_progress);
+	} else {
+		/* Note: this won't work in 2.6.37 and before.
+		 * Something somewhere should make sure we don't need it!
+		 */
+		if (info->reshape_progress > *suspend_point)
+			sysfs_set_num(info, NULL, "suspend_hi",
+				      info->reshape_progress);
+	}
+
+	/* Now work out how far it is safe to progress.
+	 * If the read_offset for ->reshape_progress is less than
+	 * 'blocks' beyond the write_offset, we can only progress as far
+	 * as a backup.
+	 * Otherwise we can progress until the write_offset for the new location
+	 * reaches (within 'blocks' of) the read_offset at the current location.
+	 * However that region must be suspended unless we are using native
+	 * metadata.
+	 * If we need to suspend more, we limit it to 128M per device, which is
+	 * rather arbitrary and should be some time-based calculation.
+	 */
+	write_offset = info->reshape_progress / reshape->before.data_disks;
+	read_offset = info->reshape_progress / reshape->after.data_disks;
+	write_range = reshape->blocks / reshape->before.data_disks;
+	read_range = reshape->blocks / reshape->after.data_disks;
+	if (advancing) {
+		if (read_offset < write_offset + write_range) {
+			max_progress = backup_point;
+			if (max_progress <= info->reshape_progress)
+				need_backup = 1;
+		} else {
+			max_progress =
+				(read_offset - write_range) *
+				reshape->before.data_disks;
+		}
+	} else {
+		if (read_offset > write_offset - write_range) {
+			max_progress = backup_point;
+			if (max_progress >= info->reshape_progress)
+				need_backup = 1;
+		} else {
+			max_progress =
+				(read_offset + write_range) *
+				reshape->before.data_disks;
+			/* If we are using internal metadata, then we can
+			 * progress all the way to the suspend_point without
+			 * worrying about backing-up/suspending along the
+			 * way.
+			 */
+			if (max_progress < *suspend_point &&
+				info->array.major_version >= 0)
+				max_progress = *suspend_point;
+		}
+	}
+
+	/* We know it is safe to progress to 'max_progress' providing
+	 * it is suspended or we are using native metadata.
+	 * Consider extending suspend_point 128M per device if it
+	 * is less than 64M per device beyond reshape_progress.
+	 * But always do a multiple of 'blocks'
+	 */
+	target = 64*1024*2 * min(reshape->before.data_disks,
+				  reshape->after.data_disks);
+	target /= reshape->blocks;
+	if (target < 2)
+		target = 2; /* never less than two multiples of 'blocks' */
+	target *= reshape->blocks;
+
+	/* For externally managed metadata we always need to suspend IO to
+	 * the area being reshaped so we regularly push suspend_point forward.
+	 * For native metadata we only need the suspend if we are going to do
+	 * a backup.
+	 */
+	if (advancing) {
+		if ((need_backup || info->array.major_version < 0) &&
+		    *suspend_point < info->reshape_progress + target) {
+			if (max_progress < *suspend_point + 2 * target)
+				*suspend_point = max_progress;
+			else
+				*suspend_point += 2 * target;
+			sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
+			max_progress = *suspend_point;
+		}
+	} else {
+		if ((need_backup || info->array.major_version < 0) &&
+		    *suspend_point > info->reshape_progress - target) {
+			if (max_progress > *suspend_point - 2 * target)
+				*suspend_point = max_progress;
+			else
+				*suspend_point -= 2 * target;
+			sysfs_set_num(info, NULL, "suspend_lo", *suspend_point);
+			max_progress = *suspend_point;
+		}
+	}
+
+	/* now set sync_max to allow that progress. sync_max, like
+	 * sync_completed is a count of sectors written per device, so
+	 * we find the difference between max_progress and the start point,
+	 * and divide that by after.data_disks to get a sync_max
+	 * number.
+	 * At the same time we convert wait_point to a similar number
+	 * for comparing against sync_completed.
+	 */
+	if (!advancing) {
+		max_progress = info->component_size * reshape->after.data_disks
+			- max_progress;
+		wait_point = info->component_size * reshape->after.data_disks
+			- wait_point;
+	}
+	max_progress /= reshape->after.data_disks;
+	wait_point /= reshape->after.data_disks;
+
+	sysfs_set_num(info, NULL, "sync_max", max_progress);
+
+	/* Now wait.  If we have already reached the point that we were
+	 * asked to wait to, don't wait at all, else wait for any change.
+	 * We need to select on 'sync_completed' as that is the place that
+	 * notifications happen, but we are really interested in
+	 * 'reshape_position'
+	 */
+	fd = sysfs_get_fd(info, NULL, "sync_completed");
+	if (fd < 0)
+		return -1;
+
+	if (sysfs_fd_get_ll(fd, &completed) < 0) {
+		close(fd);
+		return -1;
+	}
+	while (completed < max_progress && completed < wait_point) {
+		/* Check that sync_action is still 'reshape' to avoid
+		 * waiting forever on a dead array
+		 */
+		char action[20];
+		fd_set rfds;
+		if (sysfs_get_str(info, NULL, "sync_action",
+				  action, 20) <= 0 ||
+		    strncmp(action, "reshape", 7) != 0)
+			break;
+		FD_ZERO(&rfds);
+		FD_SET(fd, &rfds);
+		select(fd+1, NULL, NULL, &rfds, NULL); /* rfds is passed as the exception set */
+		if (sysfs_fd_get_ll(fd, &completed) < 0) {
+			close(fd);
+			return -1;
+		}
+	}
+	/* Convert 'completed' back into a 'progress' number */
+	completed *= reshape->after.data_disks;
+	if (!advancing) {
+		completed = info->component_size * reshape->after.data_disks
+			- completed;
+	}
+	*reshape_completed = completed;
+	
+	close(fd);
+
+	/* We return the need_backup flag.  Caller will decide
+	 * how much (a multiple of ->blocks) and will adjust
+	 * suspend_{lo,hi} and suspend_point.
+	 */
+	return need_backup;
+}
+
+
 /* FIXME return status is never checked */
 static int grow_backup(struct mdinfo *sra,
 		unsigned long long offset, /* per device */
-		unsigned long stripes, /* per device */
+		unsigned long stripes, /* per device, in old chunks */
 		int *sources, unsigned long long *offsets,
 		int disks, int chunk, int level, int layout,
 		int dests, int *destfd, unsigned long long *destoffsets,
@@ -2193,7 +2399,7 @@ static int grow_backup(struct mdinfo *sra,
 		odata--;
 	if (level == 6)
 		odata--;
-	sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata);
+
 	/* Check that array hasn't become degraded, else we might backup the wrong data */
 	sysfs_get_ll(sra, NULL, "degraded", &ll);
 	new_degraded = (int)ll;
@@ -2283,46 +2489,16 @@ static int grow_backup(struct mdinfo *sra,
  * every works.
  */
 /* FIXME return value is often ignored */
-static int wait_backup(struct mdinfo *sra,
-		unsigned long long offset, /* per device */
-		unsigned long long blocks, /* per device */
-		unsigned long long blocks2, /* per device - hack */
+static int forget_backup(
 		int dests, int *destfd, unsigned long long *destoffsets,
 		int part)
 {
-	/* Wait for resync to pass the section that was backed up
-	 * then erase the backup and allow IO
+	/* 
+	 * Erase backup 'part' (which is 0 or 1)
 	 */
-	int fd = sysfs_get_fd(sra, NULL, "sync_completed");
-	unsigned long long completed;
 	int i;
 	int rv;
 
-	if (fd < 0)
-		return -1;
-	sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
-
-	if (sysfs_fd_get_ll(fd, &completed) < 0) {
-		close(fd);
-		return -1;
-	}
-	while (completed < offset + blocks) {
-		char action[20];
-		fd_set rfds;
-		FD_ZERO(&rfds);
-		FD_SET(fd, &rfds);
-		select(fd+1, NULL, NULL, &rfds, NULL);
-		if (sysfs_fd_get_ll(fd, &completed) < 0) {
-			close(fd);
-			return -1;
-		}
-		if (sysfs_get_str(sra, NULL, "sync_action",
-				  action, 20) > 0 &&
-		    strncmp(action, "reshape", 7) != 0)
-			break;
-	}
-	close(fd);
-
 	if (part) {
 		bsb.arraystart2 = __cpu_to_le64(0);
 		bsb.length2 = __cpu_to_le64(0);
@@ -2442,130 +2618,133 @@ static void validate(int afd, int bfd, unsigned long long offset)
 	}
 }
 
-static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
-		      int *fds, unsigned long long *offsets,
-		      int disks, int chunk, int level, int layout, int data,
-		      int dests, int *destfd, unsigned long long *destoffsets)
+static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+			 unsigned long stripes,
+			 int *fds, unsigned long long *offsets,
+			 int dests, int *destfd, unsigned long long *destoffsets)
 {
+	/* Monitor a reshape where backup is being performed using the 'native'
+	 * mechanism - either to a backup file, or to some space in a spare.
+	 * Loops on progress_reshape(), calling grow_backup() whenever it reports
+	 * that a backup is needed.  Returns 0 if 'buf' cannot be allocated, else 1. */
 	char *buf;
-	int degraded = 0;
+	int degraded = -1;
+	unsigned long long speed;
+	unsigned long long suspend_point, array_size;
+	unsigned long long backup_point, wait_point;
+	unsigned long long reshape_completed;
+	int done = 0;
+	int increasing = reshape->after.data_disks >= reshape->before.data_disks;
+	int part = 0; /* The next part of the backup area to fill.  It may already
+		       * be full, so we need to check */
+	int level = reshape->level;
+	int layout = reshape->before.layout;
+	int data = reshape->before.data_disks;
+	int disks = reshape->before.data_disks + reshape->parity;
+	int chunk = sra->array.chunk_size;
 
 	if (posix_memalign((void**)&buf, 4096, disks * chunk))
 		/* Don't start the 'reshape' */
 		return 0;
-	grow_backup(sra, 0, stripes,
-		    fds, offsets, disks, chunk, level, layout,
-		    dests, destfd, destoffsets,
-		    0, &degraded, buf);
-	validate(afd, destfd[0], destoffsets[0]);
-	wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512),
-		    dests, destfd, destoffsets,
-		    0);
-	sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
-	free(buf);
-	/* FIXME this should probably be numeric */
-	sysfs_set_str(sra, NULL, "sync_max", "max");
-	return 1;
-}
-
-static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
-			int *fds, unsigned long long *offsets,
-			int disks, int chunk, int level, int layout, int data,
-			int dests, int *destfd, unsigned long long *destoffsets)
-{
-	char *buf;
-	unsigned long long start;
-	int rv;
-	int degraded = 0;
-
-	if (posix_memalign((void**)&buf, 4096, disks * chunk))
-		return 0;
-	start = sra->component_size - stripes * (chunk/512);
-	sysfs_set_num(sra, NULL, "sync_max", start);
-	rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512),
-			 dests, destfd, destoffsets, 0);
-	if (rv < 0)
-		return 0;
-	grow_backup(sra, 0, stripes,
-		    fds, offsets,
-		    disks, chunk, level, layout,
-		    dests, destfd, destoffsets,
-		    0, &degraded, buf);
-	validate(afd, destfd[0], destoffsets[0]);
-	wait_backup(sra, start, stripes*(chunk/512), 0,
-		    dests, destfd, destoffsets, 0);
-	sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
-	free(buf);
-	/* FIXME this should probably be numeric */
-	sysfs_set_str(sra, NULL, "sync_max", "max");
-	return 1;
-}
-
-static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
-			   int *fds, unsigned long long *offsets,
-			   unsigned long long start,
-			   int disks, int chunk, int level, int layout, int data,
-			   int dests, int *destfd, unsigned long long *destoffsets)
-{
-	unsigned long long size;
-	unsigned long tailstripes = stripes;
-	int part;
-	char *buf;
-	unsigned long long speed;
-	int degraded = 0;
-
-
-	if (posix_memalign((void**)&buf, 4096, disks * chunk))
-		return 0;
-
-	sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
-	sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
-
-	grow_backup(sra, start, stripes,
-		    fds, offsets,
-		    disks, chunk, level, layout,
-		    dests, destfd, destoffsets,
-		    0, &degraded, buf);
-	grow_backup(sra, (start + stripes) * (chunk/512), stripes,
-		    fds, offsets,
-		    disks, chunk, level, layout,
-		    dests, destfd, destoffsets,
-		    1, &degraded, buf);
-	validate(afd, destfd[0], destoffsets[0]);
-	part = 0;
-	start += stripes * 2; /* where to read next */
-	size = sra->component_size / (chunk/512);
-	while (start < size) {
-		if (wait_backup(sra, (start-stripes*2)*(chunk/512),
-				stripes*(chunk/512), 0,
-				dests, destfd, destoffsets,
-				part) < 0)
-			return 0;
-		sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data);
-		if (start + stripes > size)
-			tailstripes = (size - start);
-
-		grow_backup(sra, start*(chunk/512), tailstripes,
-			    fds, offsets,
-			    disks, chunk, level, layout,
-			    dests, destfd, destoffsets,
-			    part, &degraded, buf);
-		start += stripes;
-		part = 1 - part;
-		validate(afd, destfd[0], destoffsets[0]);
+	if (reshape->before.data_disks == reshape->after.data_disks) {
+		sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); /* saved; restored before returning */
+		sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
 	}
-	if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0,
-			dests, destfd, destoffsets,
-			part) < 0)
-		return 0;
-	sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data);
-	wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0,
-		    dests, destfd, destoffsets,
-		    1-part);
-	sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data);
-	sysfs_set_num(sra, NULL, "sync_speed_min", speed);
+
+	array_size = sra->component_size * data;
+	if (increasing) {
+		backup_point = sra->reshape_progress;
+		suspend_point = 0;
+	} else {
+		backup_point = array_size;
+		suspend_point = array_size;
+	}
+
+	while (!done) {
+		int rv;
+
+		/* Want to return as soon as the oldest backup slot can
+		 * be released as that allows us to start backing up
+		 * some more, providing suspend_point has been
+		 * advanced, which it should have
+		 */
+		if (increasing) {
+			wait_point = array_size;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart) +
+					      __le64_to_cpu(bsb.length));
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart2) +
+					      __le64_to_cpu(bsb.length2));
+		} else {
+			wait_point = 0;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart);
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart2);
+		}
+
+		rv = progress_reshape(sra, reshape,
+				      backup_point, wait_point,
+				      &suspend_point, &reshape_completed);
+		if (rv < 0) {
+			done = 1;
+			break;
+		}
+
+		/* external metadata would need to ping_monitor here */
+		sra->reshape_progress = reshape_completed;
+
+		/* Clear any backup region that is before 'here' */
+		if (increasing) {
+			if (reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
+						  __le64_to_cpu(bsb.length)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
+						  __le64_to_cpu(bsb.length2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		} else {
+			if (reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		}
+
+		if (rv) {
+			unsigned long long offset;
+			/* need to backup some space... */
+			/* Check that 'part' is unused */
+			if (part == 0 && __le64_to_cpu(bsb.length) != 0)
+				abort(); /* BUG here */
+			if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
+				abort();
+
+			offset = backup_point / data;
+			if (!increasing)
+				offset -= stripes * (chunk/512);
+			grow_backup(sra, offset, stripes,
+				    fds, offsets,
+				    disks, chunk, level, layout,
+				    dests, destfd, destoffsets,
+				    part, &degraded, buf);
+			validate(afd, destfd[0], destoffsets[0]);
+			/* switch to the other backup slot and move backup_point on */
+			part = !part;
+			if (increasing)
+				backup_point += stripes * (chunk/512) * data;
+			else
+				backup_point -= stripes * (chunk/512) * data;
+		}
+	}
+
+	if (reshape->before.data_disks == reshape->after.data_disks)
+		sysfs_set_num(sra, NULL, "sync_speed_min", speed);
 	free(buf);
-	return 1;
+	return 1; /* FIXME what does this mean? */
 }
 
 /*
@@ -2859,164 +3038,10 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
 		  char *backup_file)
 {
-	/* Array is assembled and ready to be started, but
-	 * monitoring is probably required.
-	 * So:
-	 *   - start read-only
-	 *   - set upper bound for resync
-	 *   - initialise the 'suspend' boundaries
-	 *   - switch to read-write
-	 *   - fork and continue monitoring
-	 */
-	int err;
-	int backup_list[1];
-	unsigned long long backup_offsets[1];
-	int odisks, ndisks, ochunk, nchunk,odata,ndata;
-	unsigned long a,b,blocks,stripes;
-	int backup_fd;
-	int *fds;
-	unsigned long long *offsets;
-	int d;
-	struct mdinfo *sra, *sd;
-	int rv;
-	unsigned long cache;
-	int done = 0;
-
-	err = sysfs_set_str(info, NULL, "array_state", "readonly");
+	int err = sysfs_set_str(info, NULL, "array_state", "readonly"); /* start read-only */
 	if (err)
 		return err;
-
-	/* make sure reshape doesn't progress until we are ready */
-	sysfs_set_str(info, NULL, "sync_max", "0");
-	sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
-
-	sra = sysfs_read(-1, devname2devnum(info->sys_name),
-			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
-			 GET_CACHE);
-	if (!sra)
-		return 1;
-
-	/* ndisks is not growing, so raid_disks is old and +delta is new */
-	odisks = info->array.raid_disks;
-	ndisks = odisks + info->delta_disks;
-	odata = odisks - 1;
-	ndata = ndisks - 1;
-	if (info->array.level == 6) {
-		odata--;
-		ndata--;
-	}
-	ochunk = info->array.chunk_size;
-	nchunk = info->new_chunk;
-
-	a = (ochunk/512) * odata;
-	b = (nchunk/512) * ndata;
-	/* Find GCD */
-	while (a != b) {
-		if (a < b)
-			b -= a;
-		if (b < a)
-			a -= b;
-	}
-	/* LCM == product / GCD */
-	blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
-
-	if (ndata == odata)
-		while (blocks * 32 < sra->component_size &&
-		       blocks < 16*1024*2)
-			blocks *= 2;
-	stripes = blocks / (info->array.chunk_size/512) / odata;
-
-	/* check that the internal stripe cache is
-	 * large enough, or it won't work.
-	 */
-	cache = (nchunk < ochunk) ? ochunk : nchunk;
-	cache = cache * 4 / 4096;
-	if (cache < blocks / 8 / odisks + 16)
-		/* Make it big enough to hold 'blocks' */
-		cache = blocks / 8 / odisks + 16;
-	if (sra->cache_size < cache)
-		sysfs_set_num(sra, NULL, "stripe_cache_size",
-			      cache+1);
-
-	memset(&bsb, 0, 512);
-	memcpy(bsb.magic, "md_backup_data-1", 16);
-	memcpy(&bsb.set_uuid, info->uuid, 16);
-	bsb.mtime = __cpu_to_le64(time(0));
-	bsb.devstart2 = blocks;
-
-	backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
-	if (backup_fd < 0) {
-		fprintf(stderr, Name ": Cannot open backup file %s\n",
-			backup_file ?: "- no backup-file given");
-		return 1;
-	}
-	backup_list[0] = backup_fd;
-	backup_offsets[0] = 8 * 512;
-	fds = malloc(odisks * sizeof(fds[0]));
-	offsets = malloc(odisks * sizeof(offsets[0]));
-	for (d=0; d<odisks; d++)
-		fds[d] = -1;
-
-	for (sd = sra->devs; sd; sd = sd->next) {
-		if (sd->disk.state & (1<<MD_DISK_FAULTY))
-			continue;
-		if (sd->disk.state & (1<<MD_DISK_SYNC)) {
-			char *dn = map_dev(sd->disk.major,
-					   sd->disk.minor, 1);
-			fds[sd->disk.raid_disk]
-				= dev_open(dn, O_RDONLY);
-			offsets[sd->disk.raid_disk] = sd->data_offset*512;
-			if (fds[sd->disk.raid_disk] < 0) {
-				fprintf(stderr, Name ": %s: cannot open component %s\n",
-					info->sys_name, dn?dn:"-unknown-");
-				rv = 1;
-				goto release;
-			}
-			free(dn);
-		}
-	}
-
-	switch(fork()) {
-	case 0:
-		close(mdfd);
-		mlockall(MCL_FUTURE);
-		if (info->delta_disks < 0)
-			done = child_shrink(-1, info, stripes,
-					    fds, offsets,
-					    info->array.raid_disks,
-					    info->array.chunk_size,
-					    info->array.level, info->array.layout,
-					    odata,
-					    1, backup_list, backup_offsets);
-		else if (info->delta_disks == 0) {
-			/* The 'start' is a per-device stripe number.
-			 * reshape_progress is a per-array sector number.
-			 * So divide by ndata * chunk_size
-			 */
-			unsigned long long start = info->reshape_progress / ndata;
-			start /= (info->array.chunk_size/512);
-			done = child_same_size(-1, info, stripes,
-					       fds, offsets,
-					       start,
-					       info->array.raid_disks,
-					       info->array.chunk_size,
-					       info->array.level, info->array.layout,
-					       odata,
-					       1, backup_list, backup_offsets);
-		}
-		if (backup_file && done)
-			unlink(backup_file);
-		/* FIXME should I intuit a level change */
-		exit(0);
-	case -1:
-		fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
-			strerror(errno));
-		return 1;
-	default:
-		break;
-	}
-release:
-	return 0;
+	return reshape_array(NULL, mdfd, "array", st, info, 1, backup_file, 0, 0); /* monitoring now lives in reshape_array() */
 }
 
 
diff --git a/external-reshape-design.txt b/external-reshape-design.txt
index 23ba890..4eb04a2 100644
--- a/external-reshape-design.txt
+++ b/external-reshape-design.txt
@@ -205,8 +205,76 @@ sync_action
 
 2.6 Reshape raid disks (shrink)
 
-3 TODO
+3 Interaction with metadata handler.
 
+  The following calls are made into the metadata handler to assist
+  with initiating and monitoring a 'reshape'.
+
+  1/ ->reshape_super is called quite early (after only minimal
+     checks) to make sure that the metadata can record the new shape
+     and any necessary transitions.  It may be passed a 'container'
+     or an individual array within a container, and it should notice
+     the difference and act accordingly.
+     When a reshape is requested against a container it is expected
+     that it should be applied to every array in the container,
+     however it is up to the metadata handler to determine final
+     policy.
+
+     If the reshape is supportable, the internal copy of the metadata
+     should be updated, and a metadata update suitable for sending
+     to mdmon should be queued.
+
+     If the reshape will involve converting spares into array members,
+     this must be recorded in the metadata too.
+
+  2/ ->container_content will be called to find out the new state
+     of the array, or of all arrays in the container.  Any newly
+     added devices (with state==0 and raid_disk >= 0) will be added
+     to the array as spares with the relevant slot number.
+
+     It is likely that the info returned by  ->container_content will
+     have ->reshape_active set, ->reshape_progress set to e.g. 0, and
+     new_* set appropriately.  mdadm will use this information to
+     cause the correct reshape to start at an appropriate time.
+
+  3/ ->set_array_state will be called by mdmon when reshape has
+     started and again periodically as it progresses.  This should
+     record the ->last_checkpoint as the point where reshape has
+     progressed to.  When the reshape finishes this will be called
+     again and it should notice that ->curr_action is no longer
+     'reshape' and so should record that the reshape has finished
+     providing 'last_checkpoint' has progressed suitably.
+
+  4/ ->manage_reshape will be called once the reshape has been set
+     up in the kernel but before sync_max has been moved from 0, so
+     no actual reshape will have happened.
+
+     ->manage_reshape should call progress_reshape() to allow the
+     reshape to progress, and should back-up any data as indicated
+     by the return value.  See the documentation of that function
+     for more details.
+     ->manage_reshape will be called multiple times when a
+     container is being reshaped, once for each member array in
+     the container.
+
+
+   The progress of the metadata is as follows:
+    1/ mdadm sends a metadata update to mdmon which marks the array
+       as undergoing a reshape. This is set up by
+       ->reshape_super and applied by ->process_update
+       For container-wide reshape, this happens once for the whole
+       container.
+    2/ mdmon notices progress via the sysfs files and calls
+       ->set_array_state to update the state periodically
+       For container-wide reshape, this happens repeatedly for
+       one array, then repeatedly for the next, etc.
+    3/ mdmon notices when reshape has finished and calls
+       ->set_array_state to record that the reshape is complete.
+       For container-wide reshape, this happens once for each
+       member array.
+     
+     
+   
 ...
 
 [1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/