Refactor reshape monitoring.
Combine all the non-backing-up code into a single function: progress_reshape. It is called repeatedly to monitor a reshape and allow it to happen safely. Have a single separate function 'child_monitor' which performs backups of data and calls progress_reshape to wait for the next backup to be needed. Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
parent
5da9ab9874
commit
7443ee8187
721
Grow.c
721
Grow.c
|
@ -453,20 +453,6 @@ static __u32 bsb_csum(char *buf, int len)
|
|||
return __cpu_to_le32(csum);
|
||||
}
|
||||
|
||||
static int child_grow(int afd, struct mdinfo *sra, unsigned long blocks,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets);
|
||||
static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets);
|
||||
static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks,
|
||||
int *fds, unsigned long long *offsets,
|
||||
unsigned long long start,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets);
|
||||
|
||||
static int check_idle(struct supertype *st)
|
||||
{
|
||||
/* Check that all member arrays for this container, or the
|
||||
|
@ -1291,6 +1277,11 @@ static int reshape_container(char *container, int cfd, char *devname,
|
|||
int force,
|
||||
char *backup_file,
|
||||
int quiet);
|
||||
static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
|
||||
unsigned long stripes,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int dests, int *destfd, unsigned long long *destoffsets);
|
||||
|
||||
|
||||
int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
|
||||
long long size,
|
||||
|
@ -1507,6 +1498,8 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file,
|
|||
}
|
||||
|
||||
info.array = array;
|
||||
sysfs_init(&info, fd, NoMdDev);
|
||||
info.component_size = size*2;
|
||||
info.new_level = level;
|
||||
info.new_chunk = chunksize * 1024;
|
||||
if (raid_disks)
|
||||
|
@ -1792,7 +1785,7 @@ static int reshape_array(char *container, int fd, char *devname,
|
|||
}
|
||||
|
||||
sra = sysfs_read(fd, 0,
|
||||
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
|
||||
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE||GET_CHUNK|
|
||||
GET_CACHE);
|
||||
|
||||
if (!sra) {
|
||||
|
@ -2001,32 +1994,10 @@ static int reshape_array(char *container, int fd, char *devname,
|
|||
|
||||
odisks = reshape.before.data_disks + reshape.parity;
|
||||
|
||||
if (reshape.before.data_disks <
|
||||
reshape.after.data_disks)
|
||||
done = child_grow(fd, sra, stripes,
|
||||
fdlist, offsets,
|
||||
odisks,
|
||||
info->array.chunk_size,
|
||||
reshape.level,
|
||||
reshape.before.layout,
|
||||
reshape.before.data_disks,
|
||||
d - odisks, fdlist+odisks, offsets+odisks);
|
||||
else if (reshape.before.data_disks >
|
||||
reshape.after.data_disks)
|
||||
done = child_shrink(fd, sra, stripes,
|
||||
fdlist, offsets,
|
||||
odisks, info->array.chunk_size, reshape.level,
|
||||
reshape.before.layout,
|
||||
reshape.before.data_disks,
|
||||
d - odisks, fdlist+odisks, offsets+odisks);
|
||||
else
|
||||
done = child_same_size(fd, sra, stripes,
|
||||
fdlist, offsets,
|
||||
0,
|
||||
odisks, info->array.chunk_size, reshape.level,
|
||||
reshape.before.layout,
|
||||
reshape.before.data_disks,
|
||||
d - odisks, fdlist+odisks, offsets+odisks);
|
||||
done = child_monitor(fd, sra, &reshape, stripes,
|
||||
fdlist, offsets,
|
||||
d - odisks, fdlist+odisks, offsets+odisks);
|
||||
|
||||
if (backup_file && done)
|
||||
unlink(backup_file);
|
||||
if (!done)
|
||||
|
@ -2134,6 +2105,7 @@ int reshape_container(char *container, int cfd, char *devname,
|
|||
if (!adev)
|
||||
adev = cc->text_version;
|
||||
|
||||
sysfs_init(cc, fd, mdstat->devnum);
|
||||
rv = reshape_array(container, fd, adev, st, cc, force,
|
||||
backup_file, quiet, 1);
|
||||
close(fd);
|
||||
|
@ -2167,10 +2139,244 @@ int reshape_container(char *container, int cfd, char *devname,
|
|||
*
|
||||
*/
|
||||
|
||||
int progress_reshape(struct mdinfo *info, struct reshape *reshape,
|
||||
unsigned long long backup_point,
|
||||
unsigned long long wait_point,
|
||||
unsigned long long *suspend_point,
|
||||
unsigned long long *reshape_completed)
|
||||
{
|
||||
/* This function is called repeatedly by the reshape manager.
|
||||
* It determines how much progress can safely be made and allows
|
||||
* that progress.
|
||||
* - 'info' identifies the array and particularly records in
|
||||
* ->reshape_progress the metadata's knowledge of progress
|
||||
* This is a sector offset from the start of the array
|
||||
* of the next array block to be relocated. This number
|
||||
* may increase from 0 or decrease from array_size, depending
|
||||
* on the type of reshape that is happening.
|
||||
* Note that in contrast, 'sync_completed' is a block count of the
|
||||
* reshape so far. It gives the distance between the start point
|
||||
* (head or tail of device) and the next place that data will be
|
||||
* written. It always increases.
|
||||
* - 'reshape' is the structure created by analyse_change
|
||||
* - 'backup_point' shows how much the metadata manager has backed-up
|
||||
* data. For reshapes with increasing progress, it is the next address
|
||||
* to be backed up, previous addresses have been backed-up. For
|
||||
* decreasing progress, it is the earliest address that has been
|
||||
* backed up - later address are also backed up.
|
||||
* So addresses between reshape_progress and backup_point are
|
||||
* backed up providing those are in the 'correct' order.
|
||||
* - 'wait_point' is an array address. When reshape_completed
|
||||
* passes this point, progress_reshape should return. It might
|
||||
* return earlier if it determines that ->reshape_progress needs
|
||||
* to be updated or further backup is needed.
|
||||
* - suspend_point is maintained by progress_reshape and the caller
|
||||
* should not touch it except to initialise to zero.
|
||||
* It is an array address and it only increases in 2.6.37 and earlier.
|
||||
* This makes it difficulty to handle reducing reshapes with
|
||||
* external metadata.
|
||||
* However: it is similar to backup_point in that it records the
|
||||
* other end of a suspended region from reshape_progress.
|
||||
* it is moved to extend the region that is safe to backup and/or
|
||||
* reshape
|
||||
* - reshape_completed is read from sysfs and returned. The caller
|
||||
* should copy this into ->reshape_progress when it has reason to
|
||||
* believe that the metadata knows this, and any backup outside this
|
||||
* has been erased.
|
||||
*
|
||||
* Return value is:
|
||||
* 1 if more data from backup_point - but only as far as suspend_point,
|
||||
* should be backed up
|
||||
* 0 if things are progressing smoothly
|
||||
* -1 if the reshape is finished, either because it is all done,
|
||||
* or due to an error.
|
||||
*/
|
||||
|
||||
int advancing = (reshape->after.data_disks
|
||||
>= reshape->before.data_disks);
|
||||
int need_backup = (reshape->after.data_disks
|
||||
== reshape->before.data_disks);
|
||||
unsigned long long read_offset, write_offset;
|
||||
unsigned long long read_range, write_range;
|
||||
unsigned long long max_progress, target, completed;
|
||||
int fd;
|
||||
|
||||
/* First, we unsuspend any region that is now known to be safe.
|
||||
* If suspend_point is on the 'wrong' side of reshape_progress, then
|
||||
* we don't have or need suspension at the moment. This is true for
|
||||
* native metadata when we don't need to back-up.
|
||||
*/
|
||||
if (advancing) {
|
||||
if (info->reshape_progress < *suspend_point)
|
||||
sysfs_set_num(info, NULL, "suspend_lo",
|
||||
info->reshape_progress);
|
||||
} else {
|
||||
/* Note: this won't work in 2.6.37 and before.
|
||||
* Something somewhere should make sure we don't need it!
|
||||
*/
|
||||
if (info->reshape_progress > *suspend_point)
|
||||
sysfs_set_num(info, NULL, "suspend_hi",
|
||||
info->reshape_progress);
|
||||
}
|
||||
|
||||
/* Now work out how far it is safe to progress.
|
||||
* If the read_offset for ->reshape_progress is less than
|
||||
* 'blocks' beyond the write_offset, we can only progress as far
|
||||
* as a backup.
|
||||
* Otherwise we can progress until the write_offset for the new location
|
||||
* reaches (within 'blocks' of) the read_offset at the current location.
|
||||
* However that region must be suspended unless we are using native
|
||||
* metadata.
|
||||
* If we need to suspend more, we limit it to 128M per device, which is
|
||||
* rather arbitrary and should be some time-based calculation.
|
||||
*/
|
||||
write_offset = info->reshape_progress / reshape->before.data_disks;
|
||||
read_offset = info->reshape_progress / reshape->after.data_disks;
|
||||
write_range = reshape->blocks / reshape->before.data_disks;
|
||||
read_range = reshape->blocks / reshape->after.data_disks;
|
||||
if (advancing) {
|
||||
if (read_offset < write_offset + write_range) {
|
||||
max_progress = backup_point;
|
||||
if (max_progress <= info->reshape_progress)
|
||||
need_backup = 1;
|
||||
} else {
|
||||
max_progress =
|
||||
(read_offset - write_range) *
|
||||
reshape->before.data_disks;
|
||||
}
|
||||
} else {
|
||||
if (read_offset > write_offset - write_range) {
|
||||
max_progress = backup_point;
|
||||
if (max_progress >= info->reshape_progress)
|
||||
need_backup = 1;
|
||||
} else {
|
||||
max_progress =
|
||||
(read_offset + write_range) *
|
||||
reshape->before.data_disks;
|
||||
/* If we are using internal metadata, then we can
|
||||
* progress all the way to the suspend_point without
|
||||
* worrying about backing-up/suspending along the
|
||||
* way.
|
||||
*/
|
||||
if (max_progress < *suspend_point &&
|
||||
info->array.major_version >= 0)
|
||||
max_progress = *suspend_point;
|
||||
}
|
||||
}
|
||||
|
||||
/* We know it is safe to progress to 'max_progress' providing
|
||||
* it is suspended or we are using native metadata.
|
||||
* Consider extending suspend_point 128M per device if it
|
||||
* is less than 64M per device beyond reshape_progress.
|
||||
* But always do a multiple of 'blocks'
|
||||
*/
|
||||
target = 64*1024*2 * min(reshape->before.data_disks,
|
||||
reshape->after.data_disks);
|
||||
target /= reshape->blocks;
|
||||
if (target < 2)
|
||||
target = 2;
|
||||
target *= reshape->blocks;
|
||||
|
||||
/* For externally managed metadata we always need to suspend IO to
|
||||
* the area being reshaped so we regularly push suspend_point forward.
|
||||
* For native metadata we only need the suspend if we are going to do
|
||||
* a backup.
|
||||
*/
|
||||
if (advancing) {
|
||||
if ((need_backup || info->array.major_version < 0) &&
|
||||
*suspend_point < info->reshape_progress + target) {
|
||||
if (max_progress < *suspend_point + 2 * target)
|
||||
*suspend_point = max_progress;
|
||||
else
|
||||
*suspend_point += 2 * target;
|
||||
sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
|
||||
max_progress = *suspend_point;
|
||||
}
|
||||
} else {
|
||||
if ((need_backup || info->array.major_version < 0) &&
|
||||
*suspend_point > info->reshape_progress - target) {
|
||||
if (max_progress > *suspend_point - 2 * target)
|
||||
*suspend_point = max_progress;
|
||||
else
|
||||
*suspend_point -= 2 * target;
|
||||
sysfs_set_num(info, NULL, "suspend_lo", *suspend_point);
|
||||
max_progress = *suspend_point;
|
||||
}
|
||||
}
|
||||
|
||||
/* now set sync_max to allow that progress. sync_max, like
|
||||
* sync_completed is a count of sectors written per device, so
|
||||
* we find the difference between max_progress and the start point,
|
||||
* and divide that by after.data_disks to get a sync_max
|
||||
* number.
|
||||
* At the same time we convert wait_point to a similar number
|
||||
* for comparing against sync_completed.
|
||||
*/
|
||||
if (!advancing) {
|
||||
max_progress = info->component_size * reshape->after.data_disks
|
||||
- max_progress;
|
||||
wait_point = info->component_size * reshape->after.data_disks
|
||||
- wait_point;
|
||||
}
|
||||
max_progress /= reshape->after.data_disks;
|
||||
wait_point /= reshape->after.data_disks;
|
||||
|
||||
sysfs_set_num(info, NULL, "sync_max", max_progress);
|
||||
|
||||
/* Now wait. If we have already reached the point that we were
|
||||
* asked to wait to, don't wait at all, else wait for any change.
|
||||
* We need to select on 'sync_completed' as that is the place that
|
||||
* notifications happen, but we are really interested in
|
||||
* 'reshape_position'
|
||||
*/
|
||||
fd = sysfs_get_fd(info, NULL, "sync_completed");
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
|
||||
if (sysfs_fd_get_ll(fd, &completed) < 0) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
while (completed < max_progress && completed < wait_point) {
|
||||
/* Check that sync_action is still 'reshape' to avoid
|
||||
* waiting forever on a dead array
|
||||
*/
|
||||
char action[20];
|
||||
fd_set rfds;
|
||||
if (sysfs_get_str(info, NULL, "sync_action",
|
||||
action, 20) <= 0 ||
|
||||
strncmp(action, "reshape", 7) != 0)
|
||||
break;
|
||||
FD_ZERO(&rfds);
|
||||
FD_SET(fd, &rfds);
|
||||
select(fd+1, NULL, NULL, &rfds, NULL);
|
||||
if (sysfs_fd_get_ll(fd, &completed) < 0) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
/* Convert 'completed' back in to a 'progress' number */
|
||||
completed *= reshape->after.data_disks;
|
||||
if (!advancing) {
|
||||
completed = info->component_size * reshape->after.data_disks
|
||||
- completed;
|
||||
}
|
||||
*reshape_completed = completed;
|
||||
|
||||
close(fd);
|
||||
|
||||
/* We return the need_backup flag. Caller will decide
|
||||
* how much (a multiple of ->blocks) and will adjust
|
||||
* suspend_{lo,hi} and suspend_point.
|
||||
*/
|
||||
return need_backup;
|
||||
}
|
||||
|
||||
|
||||
/* FIXME return status is never checked */
|
||||
static int grow_backup(struct mdinfo *sra,
|
||||
unsigned long long offset, /* per device */
|
||||
unsigned long stripes, /* per device */
|
||||
unsigned long stripes, /* per device, in old chunks */
|
||||
int *sources, unsigned long long *offsets,
|
||||
int disks, int chunk, int level, int layout,
|
||||
int dests, int *destfd, unsigned long long *destoffsets,
|
||||
|
@ -2193,7 +2399,7 @@ static int grow_backup(struct mdinfo *sra,
|
|||
odata--;
|
||||
if (level == 6)
|
||||
odata--;
|
||||
sysfs_set_num(sra, NULL, "suspend_hi", (offset + stripes * (chunk/512)) * odata);
|
||||
|
||||
/* Check that array hasn't become degraded, else we might backup the wrong data */
|
||||
sysfs_get_ll(sra, NULL, "degraded", &ll);
|
||||
new_degraded = (int)ll;
|
||||
|
@ -2283,46 +2489,16 @@ static int grow_backup(struct mdinfo *sra,
|
|||
* every works.
|
||||
*/
|
||||
/* FIXME return value is often ignored */
|
||||
static int wait_backup(struct mdinfo *sra,
|
||||
unsigned long long offset, /* per device */
|
||||
unsigned long long blocks, /* per device */
|
||||
unsigned long long blocks2, /* per device - hack */
|
||||
static int forget_backup(
|
||||
int dests, int *destfd, unsigned long long *destoffsets,
|
||||
int part)
|
||||
{
|
||||
/* Wait for resync to pass the section that was backed up
|
||||
* then erase the backup and allow IO
|
||||
/*
|
||||
* Erase backup 'part' (which is 0 or 1)
|
||||
*/
|
||||
int fd = sysfs_get_fd(sra, NULL, "sync_completed");
|
||||
unsigned long long completed;
|
||||
int i;
|
||||
int rv;
|
||||
|
||||
if (fd < 0)
|
||||
return -1;
|
||||
sysfs_set_num(sra, NULL, "sync_max", offset + blocks + blocks2);
|
||||
|
||||
if (sysfs_fd_get_ll(fd, &completed) < 0) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
while (completed < offset + blocks) {
|
||||
char action[20];
|
||||
fd_set rfds;
|
||||
FD_ZERO(&rfds);
|
||||
FD_SET(fd, &rfds);
|
||||
select(fd+1, NULL, NULL, &rfds, NULL);
|
||||
if (sysfs_fd_get_ll(fd, &completed) < 0) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
if (sysfs_get_str(sra, NULL, "sync_action",
|
||||
action, 20) > 0 &&
|
||||
strncmp(action, "reshape", 7) != 0)
|
||||
break;
|
||||
}
|
||||
close(fd);
|
||||
|
||||
if (part) {
|
||||
bsb.arraystart2 = __cpu_to_le64(0);
|
||||
bsb.length2 = __cpu_to_le64(0);
|
||||
|
@ -2442,130 +2618,133 @@ static void validate(int afd, int bfd, unsigned long long offset)
|
|||
}
|
||||
}
|
||||
|
||||
static int child_grow(int afd, struct mdinfo *sra, unsigned long stripes,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets)
|
||||
static int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
|
||||
unsigned long stripes,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int dests, int *destfd, unsigned long long *destoffsets)
|
||||
{
|
||||
/* Monitor a reshape where backup is being performed using
|
||||
* 'native' mechanism - either to a backup file, or
|
||||
* to some space in a spare.
|
||||
*/
|
||||
char *buf;
|
||||
int degraded = 0;
|
||||
int degraded = -1;
|
||||
unsigned long long speed;
|
||||
unsigned long long suspend_point, array_size;
|
||||
unsigned long long backup_point, wait_point;
|
||||
unsigned long long reshape_completed;
|
||||
int done = 0;
|
||||
int increasing = reshape->after.data_disks >= reshape->before.data_disks;
|
||||
int part = 0; /* The next part of the backup area to fill. It may already
|
||||
* be full, so we need to check */
|
||||
int level = reshape->level;
|
||||
int layout = reshape->before.layout;
|
||||
int data = reshape->before.data_disks;
|
||||
int disks = reshape->before.data_disks + reshape->parity;
|
||||
int chunk = sra->array.chunk_size;
|
||||
|
||||
if (posix_memalign((void**)&buf, 4096, disks * chunk))
|
||||
/* Don't start the 'reshape' */
|
||||
return 0;
|
||||
grow_backup(sra, 0, stripes,
|
||||
fds, offsets, disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
0, &degraded, buf);
|
||||
validate(afd, destfd[0], destoffsets[0]);
|
||||
wait_backup(sra, 0, stripes * (chunk / 512), stripes * (chunk / 512),
|
||||
dests, destfd, destoffsets,
|
||||
0);
|
||||
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
|
||||
free(buf);
|
||||
/* FIXME this should probably be numeric */
|
||||
sysfs_set_str(sra, NULL, "sync_max", "max");
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes,
|
||||
int *fds, unsigned long long *offsets,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets)
|
||||
{
|
||||
char *buf;
|
||||
unsigned long long start;
|
||||
int rv;
|
||||
int degraded = 0;
|
||||
|
||||
if (posix_memalign((void**)&buf, 4096, disks * chunk))
|
||||
return 0;
|
||||
start = sra->component_size - stripes * (chunk/512);
|
||||
sysfs_set_num(sra, NULL, "sync_max", start);
|
||||
rv = wait_backup(sra, 0, start - stripes * (chunk/512), stripes * (chunk/512),
|
||||
dests, destfd, destoffsets, 0);
|
||||
if (rv < 0)
|
||||
return 0;
|
||||
grow_backup(sra, 0, stripes,
|
||||
fds, offsets,
|
||||
disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
0, &degraded, buf);
|
||||
validate(afd, destfd[0], destoffsets[0]);
|
||||
wait_backup(sra, start, stripes*(chunk/512), 0,
|
||||
dests, destfd, destoffsets, 0);
|
||||
sysfs_set_num(sra, NULL, "suspend_lo", (stripes * (chunk/512)) * data);
|
||||
free(buf);
|
||||
/* FIXME this should probably be numeric */
|
||||
sysfs_set_str(sra, NULL, "sync_max", "max");
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes,
|
||||
int *fds, unsigned long long *offsets,
|
||||
unsigned long long start,
|
||||
int disks, int chunk, int level, int layout, int data,
|
||||
int dests, int *destfd, unsigned long long *destoffsets)
|
||||
{
|
||||
unsigned long long size;
|
||||
unsigned long tailstripes = stripes;
|
||||
int part;
|
||||
char *buf;
|
||||
unsigned long long speed;
|
||||
int degraded = 0;
|
||||
|
||||
|
||||
if (posix_memalign((void**)&buf, 4096, disks * chunk))
|
||||
return 0;
|
||||
|
||||
sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
|
||||
sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
|
||||
|
||||
grow_backup(sra, start, stripes,
|
||||
fds, offsets,
|
||||
disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
0, &degraded, buf);
|
||||
grow_backup(sra, (start + stripes) * (chunk/512), stripes,
|
||||
fds, offsets,
|
||||
disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
1, &degraded, buf);
|
||||
validate(afd, destfd[0], destoffsets[0]);
|
||||
part = 0;
|
||||
start += stripes * 2; /* where to read next */
|
||||
size = sra->component_size / (chunk/512);
|
||||
while (start < size) {
|
||||
if (wait_backup(sra, (start-stripes*2)*(chunk/512),
|
||||
stripes*(chunk/512), 0,
|
||||
dests, destfd, destoffsets,
|
||||
part) < 0)
|
||||
return 0;
|
||||
sysfs_set_num(sra, NULL, "suspend_lo", start*(chunk/512) * data);
|
||||
if (start + stripes > size)
|
||||
tailstripes = (size - start);
|
||||
|
||||
grow_backup(sra, start*(chunk/512), tailstripes,
|
||||
fds, offsets,
|
||||
disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
part, &degraded, buf);
|
||||
start += stripes;
|
||||
part = 1 - part;
|
||||
validate(afd, destfd[0], destoffsets[0]);
|
||||
if (reshape->before.data_disks == reshape->after.data_disks) {
|
||||
sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
|
||||
sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
|
||||
}
|
||||
if (wait_backup(sra, (start-stripes*2) * (chunk/512), stripes * (chunk/512), 0,
|
||||
dests, destfd, destoffsets,
|
||||
part) < 0)
|
||||
return 0;
|
||||
sysfs_set_num(sra, NULL, "suspend_lo", ((start-stripes)*(chunk/512)) * data);
|
||||
wait_backup(sra, (start-stripes) * (chunk/512), tailstripes * (chunk/512), 0,
|
||||
dests, destfd, destoffsets,
|
||||
1-part);
|
||||
sysfs_set_num(sra, NULL, "suspend_lo", (size*(chunk/512)) * data);
|
||||
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
|
||||
|
||||
array_size = sra->component_size * data;
|
||||
if (increasing) {
|
||||
backup_point = sra->reshape_progress;
|
||||
suspend_point = 0;
|
||||
} else {
|
||||
backup_point = array_size;
|
||||
suspend_point = array_size;
|
||||
}
|
||||
|
||||
while (!done) {
|
||||
int rv;
|
||||
|
||||
/* Want to return as soon the oldest backup slot can
|
||||
* be released as that allows us to start backing up
|
||||
* some more, providing suspend_point has been
|
||||
* advanced, which it should have
|
||||
*/
|
||||
if (increasing) {
|
||||
wait_point = array_size;
|
||||
if (part == 0 && __le64_to_cpu(bsb.length) > 0)
|
||||
wait_point = (__le64_to_cpu(bsb.arraystart) +
|
||||
__le64_to_cpu(bsb.length));
|
||||
if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
|
||||
wait_point = (__le64_to_cpu(bsb.arraystart2) +
|
||||
__le64_to_cpu(bsb.length2));
|
||||
} else {
|
||||
wait_point = 0;
|
||||
if (part == 0 && __le64_to_cpu(bsb.length) > 0)
|
||||
wait_point = __le64_to_cpu(bsb.arraystart);
|
||||
if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
|
||||
wait_point = __le64_to_cpu(bsb.arraystart2);
|
||||
}
|
||||
|
||||
rv = progress_reshape(sra, reshape,
|
||||
backup_point, wait_point,
|
||||
&suspend_point, &reshape_completed);
|
||||
if (rv < 0) {
|
||||
done = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
/* external metadata would need to ping_monitor here */
|
||||
sra->reshape_progress = reshape_completed;
|
||||
|
||||
/* Clear any backup region that is before 'here' */
|
||||
if (increasing) {
|
||||
if (reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
|
||||
__le64_to_cpu(bsb.length)))
|
||||
forget_backup(dests, destfd,
|
||||
destoffsets, 0);
|
||||
if (reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
|
||||
__le64_to_cpu(bsb.length2)))
|
||||
forget_backup(dests, destfd,
|
||||
destoffsets, 1);
|
||||
} else {
|
||||
if (reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
|
||||
forget_backup(dests, destfd,
|
||||
destoffsets, 0);
|
||||
if (reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
|
||||
forget_backup(dests, destfd,
|
||||
destoffsets, 1);
|
||||
}
|
||||
|
||||
if (rv) {
|
||||
unsigned long long offset;
|
||||
/* need to backup some space... */
|
||||
/* Check that 'part' is unused */
|
||||
if (part == 0 && __le64_to_cpu(bsb.length) != 0)
|
||||
abort(); /* BUG here */
|
||||
if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
|
||||
abort();
|
||||
|
||||
offset = backup_point / data;
|
||||
if (!increasing)
|
||||
offset -= stripes * (chunk/512);
|
||||
grow_backup(sra, offset, stripes,
|
||||
fds, offsets,
|
||||
disks, chunk, level, layout,
|
||||
dests, destfd, destoffsets,
|
||||
part, &degraded, buf);
|
||||
validate(afd, destfd[0], destoffsets[0]);
|
||||
/* record where 'part' is up to */
|
||||
part = !part;
|
||||
if (increasing)
|
||||
backup_point += stripes * (chunk/512) * data;
|
||||
else
|
||||
backup_point -= stripes * (chunk/512) * data;
|
||||
}
|
||||
}
|
||||
|
||||
if (reshape->before.data_disks == reshape->after.data_disks)
|
||||
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
|
||||
free(buf);
|
||||
return 1;
|
||||
return 1; /* FIXME what does this mean? */
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2859,164 +3038,10 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
|
|||
int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
|
||||
char *backup_file)
|
||||
{
|
||||
/* Array is assembled and ready to be started, but
|
||||
* monitoring is probably required.
|
||||
* So:
|
||||
* - start read-only
|
||||
* - set upper bound for resync
|
||||
* - initialise the 'suspend' boundaries
|
||||
* - switch to read-write
|
||||
* - fork and continue monitoring
|
||||
*/
|
||||
int err;
|
||||
int backup_list[1];
|
||||
unsigned long long backup_offsets[1];
|
||||
int odisks, ndisks, ochunk, nchunk,odata,ndata;
|
||||
unsigned long a,b,blocks,stripes;
|
||||
int backup_fd;
|
||||
int *fds;
|
||||
unsigned long long *offsets;
|
||||
int d;
|
||||
struct mdinfo *sra, *sd;
|
||||
int rv;
|
||||
unsigned long cache;
|
||||
int done = 0;
|
||||
|
||||
err = sysfs_set_str(info, NULL, "array_state", "readonly");
|
||||
int err = sysfs_set_str(info, NULL, "array_state", "readonly");
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/* make sure reshape doesn't progress until we are ready */
|
||||
sysfs_set_str(info, NULL, "sync_max", "0");
|
||||
sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */
|
||||
|
||||
sra = sysfs_read(-1, devname2devnum(info->sys_name),
|
||||
GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
|
||||
GET_CACHE);
|
||||
if (!sra)
|
||||
return 1;
|
||||
|
||||
/* ndisks is not growing, so raid_disks is old and +delta is new */
|
||||
odisks = info->array.raid_disks;
|
||||
ndisks = odisks + info->delta_disks;
|
||||
odata = odisks - 1;
|
||||
ndata = ndisks - 1;
|
||||
if (info->array.level == 6) {
|
||||
odata--;
|
||||
ndata--;
|
||||
}
|
||||
ochunk = info->array.chunk_size;
|
||||
nchunk = info->new_chunk;
|
||||
|
||||
a = (ochunk/512) * odata;
|
||||
b = (nchunk/512) * ndata;
|
||||
/* Find GCD */
|
||||
while (a != b) {
|
||||
if (a < b)
|
||||
b -= a;
|
||||
if (b < a)
|
||||
a -= b;
|
||||
}
|
||||
/* LCM == product / GCD */
|
||||
blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a;
|
||||
|
||||
if (ndata == odata)
|
||||
while (blocks * 32 < sra->component_size &&
|
||||
blocks < 16*1024*2)
|
||||
blocks *= 2;
|
||||
stripes = blocks / (info->array.chunk_size/512) / odata;
|
||||
|
||||
/* check that the internal stripe cache is
|
||||
* large enough, or it won't work.
|
||||
*/
|
||||
cache = (nchunk < ochunk) ? ochunk : nchunk;
|
||||
cache = cache * 4 / 4096;
|
||||
if (cache < blocks / 8 / odisks + 16)
|
||||
/* Make it big enough to hold 'blocks' */
|
||||
cache = blocks / 8 / odisks + 16;
|
||||
if (sra->cache_size < cache)
|
||||
sysfs_set_num(sra, NULL, "stripe_cache_size",
|
||||
cache+1);
|
||||
|
||||
memset(&bsb, 0, 512);
|
||||
memcpy(bsb.magic, "md_backup_data-1", 16);
|
||||
memcpy(&bsb.set_uuid, info->uuid, 16);
|
||||
bsb.mtime = __cpu_to_le64(time(0));
|
||||
bsb.devstart2 = blocks;
|
||||
|
||||
backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR);
|
||||
if (backup_fd < 0) {
|
||||
fprintf(stderr, Name ": Cannot open backup file %s\n",
|
||||
backup_file ?: "- no backup-file given");
|
||||
return 1;
|
||||
}
|
||||
backup_list[0] = backup_fd;
|
||||
backup_offsets[0] = 8 * 512;
|
||||
fds = malloc(odisks * sizeof(fds[0]));
|
||||
offsets = malloc(odisks * sizeof(offsets[0]));
|
||||
for (d=0; d<odisks; d++)
|
||||
fds[d] = -1;
|
||||
|
||||
for (sd = sra->devs; sd; sd = sd->next) {
|
||||
if (sd->disk.state & (1<<MD_DISK_FAULTY))
|
||||
continue;
|
||||
if (sd->disk.state & (1<<MD_DISK_SYNC)) {
|
||||
char *dn = map_dev(sd->disk.major,
|
||||
sd->disk.minor, 1);
|
||||
fds[sd->disk.raid_disk]
|
||||
= dev_open(dn, O_RDONLY);
|
||||
offsets[sd->disk.raid_disk] = sd->data_offset*512;
|
||||
if (fds[sd->disk.raid_disk] < 0) {
|
||||
fprintf(stderr, Name ": %s: cannot open component %s\n",
|
||||
info->sys_name, dn?dn:"-unknown-");
|
||||
rv = 1;
|
||||
goto release;
|
||||
}
|
||||
free(dn);
|
||||
}
|
||||
}
|
||||
|
||||
switch(fork()) {
|
||||
case 0:
|
||||
close(mdfd);
|
||||
mlockall(MCL_FUTURE);
|
||||
if (info->delta_disks < 0)
|
||||
done = child_shrink(-1, info, stripes,
|
||||
fds, offsets,
|
||||
info->array.raid_disks,
|
||||
info->array.chunk_size,
|
||||
info->array.level, info->array.layout,
|
||||
odata,
|
||||
1, backup_list, backup_offsets);
|
||||
else if (info->delta_disks == 0) {
|
||||
/* The 'start' is a per-device stripe number.
|
||||
* reshape_progress is a per-array sector number.
|
||||
* So divide by ndata * chunk_size
|
||||
*/
|
||||
unsigned long long start = info->reshape_progress / ndata;
|
||||
start /= (info->array.chunk_size/512);
|
||||
done = child_same_size(-1, info, stripes,
|
||||
fds, offsets,
|
||||
start,
|
||||
info->array.raid_disks,
|
||||
info->array.chunk_size,
|
||||
info->array.level, info->array.layout,
|
||||
odata,
|
||||
1, backup_list, backup_offsets);
|
||||
}
|
||||
if (backup_file && done)
|
||||
unlink(backup_file);
|
||||
/* FIXME should I intuit a level change */
|
||||
exit(0);
|
||||
case -1:
|
||||
fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n",
|
||||
strerror(errno));
|
||||
return 1;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
release:
|
||||
return 0;
|
||||
return reshape_array(NULL, mdfd, "array", st, info, 1, backup_file, 0, 0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -205,8 +205,76 @@ sync_action
|
|||
|
||||
2.6 Reshape raid disks (shrink)
|
||||
|
||||
3 TODO
|
||||
3 Interaction with metadata handler.
|
||||
|
||||
The following calls are made into the metadata handler to assist
|
||||
with initiating and monitoring a 'reshape'.
|
||||
|
||||
1/ ->reshape_super is called quite early (after only minimal
|
||||
checks) to make sure that the metadata can record the new shape
|
||||
and any necessary transitions. It may be passed a 'container'
|
||||
or an individual array within a container, and it should notice
|
||||
the difference and act accordingly.
|
||||
When a reshape is requested against a container it is expected
|
||||
that it should be applied to every array in the container,
|
||||
however it is up to the metadata handler to determine final
|
||||
policy.
|
||||
|
||||
If the reshape is supportable, the internal copy of the metadata
|
||||
should be updated, and a metadata update suitable for sending
|
||||
to mdmon should be queued.
|
||||
|
||||
If the reshape will involve converting spares into array members,
|
||||
this must be recorded in the metadata too.
|
||||
|
||||
2/ ->container_content will be called to find out the new state
|
||||
of the array, or of all arrays in the container. Any newly
|
||||
added devices (with state==0 and raid_disk >= 0) will be added
|
||||
to the array as spares with the relevant slot number.
|
||||
|
||||
It is likely that the info returned by ->container_content will
|
||||
have ->reshape_active set, ->reshape_progress set to e.g. 0, and
|
||||
new_* set appropriately. mdadm will use this information to
|
||||
cause the correct reshape to start at an appropriate time.
|
||||
|
||||
3/ ->set_array_state will be called by mdmon when reshape has
|
||||
started and again periodically as it progresses. This should
|
||||
record the ->last_checkpoint as the point where reshape has
|
||||
progressed to. When the reshape finishes this will be called
|
||||
again and it should notice that ->curr_action is no longer
|
||||
'reshape' and so should record that the reshape has finished
|
||||
providing 'last_checkpoint' has progressed suitably.
|
||||
|
||||
4/ ->manage_reshape will be called once the reshape has been set
|
||||
up in the kernel but before sync_max has been moved from 0, so
|
||||
no actual reshape will have happened.
|
||||
|
||||
->manage_reshape should call progress_reshape() to allow the
|
||||
reshape to progress, and should back-up any data as indicated
|
||||
by the return value. See the documentation of that function
|
||||
for more details.
|
||||
->manage_reshape will be called multiple times when a
|
||||
container is being reshaped, once for each member array in
|
||||
the container.
|
||||
|
||||
|
||||
The progress of the metadata is as follows:
|
||||
1/ mdadm sends a metadata update to mdmon which marks the array
|
||||
as undergoing a reshape. This is set up by
|
||||
->reshape_super and applied by ->process_update
|
||||
For container-wide reshape, this happens once for the whole
|
||||
container.
|
||||
2/ mdmon notices progress via the sysfs files and calls
|
||||
->set_array_state to update the state periodically
|
||||
For container-wide reshape, this happens repeatedly for
|
||||
one array, then repeatedly for the next, etc.
|
||||
3/ mdmon notices when reshape has finished and calls
|
||||
->set_array_state to record that the reshape is complete.
|
||||
For container-wide reshape, this happens once for each
|
||||
member array.
|
||||
|
||||
|
||||
|
||||
...
|
||||
|
||||
[1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/
|
||||
|
|
Loading…
Reference in New Issue