From 95a05b37e8eb2bc0803b1a0298fce6adc60eff16 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:04 +0800 Subject: [PATCH 01/10] Create n bitmaps for clustered mode For a clustered MD, create bitmaps equal to number of nodes so each node has an independent bitmap. Only the first bitmap has the bits set so that the first node that assembles the device also performs the sync. The bitmaps are aligned to 4k boundaries. On-disk format: 0 4k 8k 12k ------------------------------------------------------------------- | idle | md super | bm super [0] + bits | | bm bits[0, contd] | bm super[1] + bits | bm bits[1, contd] | | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits | | bm bits [3, contd] | | | Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Create.c | 3 ++- bitmap.c | 2 ++ bitmap.h | 7 +++++-- mdadm.8.in | 7 ++++++- super1.c | 53 ++++++++++++++++++++++++++++++++++------------------- 5 files changed, 49 insertions(+), 23 deletions(-) diff --git a/Create.c b/Create.c index ef28da0..19f3054 100644 --- a/Create.c +++ b/Create.c @@ -750,7 +750,8 @@ int Create(struct supertype *st, char *mddev, #endif } - if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) { + if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 || + strcmp(s->bitmap_file, "clustered")==0)) { if ((vers%100) < 2) { pr_err("internal bitmaps not supported by this kernel.\n"); goto abort_locked; diff --git a/bitmap.c b/bitmap.c index b1d54a6..920033a 100644 --- a/bitmap.c +++ b/bitmap.c @@ -32,6 +32,8 @@ inline void sb_le_to_cpu(bitmap_super_t *sb) sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep); sb->sync_size = __le64_to_cpu(sb->sync_size); sb->write_behind = __le32_to_cpu(sb->write_behind); + sb->nodes = __le32_to_cpu(sb->nodes); + sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved); } inline void sb_cpu_to_le(bitmap_super_t *sb) diff --git a/bitmap.h b/bitmap.h index c8725a3..adbf0b4 100644 ---
a/bitmap.h +++ b/bitmap.h @@ -154,8 +154,11 @@ typedef struct bitmap_super_s { __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ __u32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ + __u32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + __u32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ } bitmap_super_t; /* notes: diff --git a/mdadm.8.in b/mdadm.8.in index b4a21d9..2a89458 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -694,7 +694,12 @@ and so is replicated on all devices. If the word .B "none" is given with .B \-\-grow -mode, then any bitmap that is present is removed. +mode, then any bitmap that is present is removed. If the word +.B "clustered" +is given, the array is created for a clustered environment. One bitmap +is created for each node as defined by the +.B \-\-nodes +parameter and are stored internally. To help catch typing errors, the filename must contain at least one slash ('/') if it is a real file (not 'internal' or 'none'). 
diff --git a/super1.c b/super1.c index f0508fe..7928a3d 100644 --- a/super1.c +++ b/super1.c @@ -2177,6 +2177,7 @@ static int write_bitmap1(struct supertype *st, int fd) void *buf; int towrite, n; struct align_fd afd; + unsigned int i = 0; init_afd(&afd, fd); @@ -2185,27 +2186,41 @@ static int write_bitmap1(struct supertype *st, int fd) if (posix_memalign(&buf, 4096, 4096)) return -ENOMEM; - memset(buf, 0xff, 4096); - memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); - - towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); - towrite = (towrite+7) >> 3; /* bits to bytes */ - towrite += sizeof(bitmap_super_t); - towrite = ROUND_UP(towrite, 512); - while (towrite > 0) { - n = towrite; - if (n > 4096) - n = 4096; - n = awrite(&afd, buf, n); - if (n > 0) - towrite -= n; + do { + /* Only the bitmap[0] should resync + * whole device on initial assembly + */ + if (i) + memset(buf, 0x00, 4096); else + memset(buf, 0xff, 4096); + memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); + + towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + towrite = (towrite+7) >> 3; /* bits to bytes */ + towrite += sizeof(bitmap_super_t); + /* we need the bitmaps to be at 4k boundary */ + towrite = ROUND_UP(towrite, 4096); + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = awrite(&afd, buf, n); + if (n > 0) + towrite -= n; + else + break; + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) { + rv = -2; break; - memset(buf, 0xff, 4096); - } - fsync(fd); - if (towrite) - rv = -2; + } + } while (++i < __le32_to_cpu(bms->nodes)); free(buf); return rv; From 529e2aa573333981b211dc50ac687da7baefd224 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:05 +0800 Subject: [PATCH 02/10] Add nodes option while creating md Specifies the maximum number of nodes in the cluster that may use this device simultaneously. 
This is equivalent to the number of bitmaps created in the internal superblock (patches to follow). Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Create.c | 1 + ReadMe.c | 1 + mdadm.8.in | 6 ++++++ mdadm.c | 33 ++++++++++++++++++++++++++++++++- mdadm.h | 3 +++ super1.c | 1 + 6 files changed, 44 insertions(+), 1 deletion(-) diff --git a/Create.c b/Create.c index 19f3054..565bf50 100644 --- a/Create.c +++ b/Create.c @@ -531,6 +531,7 @@ int Create(struct supertype *st, char *mddev, st->ss->name); warn = 1; } + st->nodes = c->nodes; if (warn) { if (c->runstop!= 1) { diff --git a/ReadMe.c b/ReadMe.c index 87a4916..30c569d 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -140,6 +140,7 @@ struct option long_options[] = { {"homehost", 1, 0, HomeHost}, {"symlinks", 1, 0, Symlinks}, {"data-offset",1, 0, DataOffset}, + {"nodes",1, 0, Nodes}, /* For assemble */ {"uuid", 1, 0, 'u'}, diff --git a/mdadm.8.in b/mdadm.8.in index 2a89458..fed0007 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -971,6 +971,12 @@ However for RAID0, it is not possible to add spares. So to increase the number of devices in a RAID0, it is necessary to set the new number of devices, and to add the new devices, in the same command. +.TP +.BR \-\-nodes +Only works when the array is for clustered environment. It specifies +the maximum number of nodes in the cluster that will use this device +simultaneously. If not specified, this defaults to 4. 
+ .SH For assemble: .TP diff --git a/mdadm.c b/mdadm.c index 3e8c49b..2ab006a 100644 --- a/mdadm.c +++ b/mdadm.c @@ -588,7 +588,14 @@ int main(int argc, char *argv[]) } ident.raid_disks = s.raiddisks; continue; - + case O(CREATE, Nodes): + c.nodes = parse_num(optarg); + if (c.nodes <= 0) { + pr_err("invalid number for the number of cluster nodes: %s\n", + optarg); + exit(2); + } + continue; case O(CREATE,'x'): /* number of spare (eXtra) disks */ if (s.sparedisks) { pr_err("spare-devices set twice: %d and %s\n", @@ -1097,6 +1104,15 @@ int main(int argc, char *argv[]) s.bitmap_file = optarg; continue; } + if (strcmp(optarg, "clustered")== 0) { + s.bitmap_file = optarg; + /* Set the default number of cluster nodes + * to 4 if not already set by user + */ + if (c.nodes < 1) + c.nodes = 4; + continue; + } /* probable typo */ pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n" " not '%s'\n", optarg); @@ -1377,6 +1393,21 @@ int main(int argc, char *argv[]) case CREATE: if (c.delay == 0) c.delay = DEFAULT_BITMAP_DELAY; + + if (c.nodes) { + if (!s.bitmap_file || strcmp(s.bitmap_file, "clustered") != 0) { + pr_err("--nodes argument only compatible with --bitmap=clustered\n"); + rv = 1; + break; + } + + if (s.level != 1) { + pr_err("--bitmap=clustered is currently supported with RAID mirror only\n"); + rv = 1; + break; + } + } + if (s.write_behind && !s.bitmap_file) { pr_err("write-behind mode requires a bitmap.\n"); rv = 1; diff --git a/mdadm.h b/mdadm.h index 141f963..9d55801 100644 --- a/mdadm.h +++ b/mdadm.h @@ -344,6 +344,7 @@ enum special_options { Dump, Restore, Action, + Nodes, }; enum prefix_standard { @@ -418,6 +419,7 @@ struct context { char *backup_file; int invalid_backup; char *action; + int nodes; }; struct shape { @@ -1029,6 +1031,7 @@ struct supertype { */ int devcnt; int retry_soon; + int nodes; struct mdinfo *devs; diff --git a/super1.c b/super1.c index 7928a3d..78d98a7 100644 --- a/super1.c +++ b/super1.c @@ -2144,6 +2144,7 @@ 
add_internal_bitmap1(struct supertype *st, bms->daemon_sleep = __cpu_to_le32(delay); bms->sync_size = __cpu_to_le64(size); bms->write_behind = __cpu_to_le32(write_behind); + bms->nodes = __cpu_to_le32(st->nodes); *chunkp = chunk; return 1; From 7716570e6d906e7326f83d466f6ba73009649d03 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:06 +0800 Subject: [PATCH 03/10] Set home-cluster while creating an array The home-cluster is stored in the bitmap super block of the array. The device can be assembled on a cluster with the cluster name same as the one recorded in the bitmap. If home-cluster is not specified, this is auto-detected using dlopen corosync cmap library. neilb: allow code to compile when corosync-devel is not installed. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Create.c | 1 + Makefile | 6 +++++- ReadMe.c | 1 + config.c | 27 +++++++++++++++++++++++++- mdadm.8.in | 6 ++++++ mdadm.c | 17 ++++++++++++++++ mdadm.h | 5 +++++ super1.c | 5 ++++- util.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 122 insertions(+), 3 deletions(-) diff --git a/Create.c b/Create.c index 565bf50..b62d8d4 100644 --- a/Create.c +++ b/Create.c @@ -532,6 +532,7 @@ int Create(struct supertype *st, char *mddev, warn = 1; } st->nodes = c->nodes; + st->cluster_name = c->homecluster; if (warn) { if (c->runstop!= 1) { diff --git a/Makefile b/Makefile index a7d8c5c..c189279 100644 --- a/Makefile +++ b/Makefile @@ -79,10 +79,13 @@ MDMON_DIR = $(RUN_DIR) # place for autoreplace cookies FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots SYSTEMD_DIR=/lib/systemd/system + +COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC) + DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" -CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) +CFLAGS = 
$(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/') @@ -101,6 +104,7 @@ endif # If you want a static binary, you might uncomment these # LDFLAGS = -static # STRIP = -s +LDLIBS=-ldl INSTALL = /usr/bin/install DESTDIR = diff --git a/ReadMe.c b/ReadMe.c index 30c569d..c6286ae 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -141,6 +141,7 @@ struct option long_options[] = { {"symlinks", 1, 0, Symlinks}, {"data-offset",1, 0, DataOffset}, {"nodes",1, 0, Nodes}, + {"home-cluster",1, 0, ClusterName}, /* For assemble */ {"uuid", 1, 0, 'u'}, diff --git a/config.c b/config.c index 7342c42..21b6afd 100644 --- a/config.c +++ b/config.c @@ -77,7 +77,7 @@ char DefaultAltConfFile[] = CONFFILE2; char DefaultAltConfDir[] = CONFFILE2 ".d"; enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, - Homehost, AutoMode, Policy, PartPolicy, LTEnd }; + Homehost, HomeCluster, AutoMode, Policy, PartPolicy, LTEnd }; char *keywords[] = { [Devices] = "devices", [Array] = "array", @@ -86,6 +86,7 @@ char *keywords[] = { [Program] = "program", [CreateDev]= "create", [Homehost] = "homehost", + [HomeCluster] = "homecluster", [AutoMode] = "auto", [Policy] = "policy", [PartPolicy]="part-policy", @@ -562,6 +563,21 @@ void homehostline(char *line) } } +static char *home_cluster = NULL; +void homeclusterline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) { + if (home_cluster == NULL) { + if (strcasecmp(w, "")==0) + home_cluster = xstrdup(""); + else + home_cluster = xstrdup(w); + } + } +} + char auto_yes[] = "yes"; char auto_no[] = "no"; char auto_homehost[] = "homehost"; @@ -724,6 +740,9 @@ void conf_file(FILE *f) case Homehost: homehostline(line); break; + case HomeCluster: 
+ homeclusterline(line); + break; case AutoMode: autoline(line); break; @@ -884,6 +903,12 @@ char *conf_get_homehost(int *require_homehostp) return home_host; } +char *conf_get_homecluster(void) +{ + load_conffile(); + return home_cluster; +} + struct createinfo *conf_get_create_info(void) { load_conffile(); diff --git a/mdadm.8.in b/mdadm.8.in index fed0007..a07ddb7 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -415,6 +415,12 @@ This functionality is currently only provided by and .BR \-\-monitor . +.TP +.B \-\-home\-cluster= +specifies the cluster name for the md device. The md device can be assembled +only on the cluster which matches the name specified. If this option is not +provided, mdadm tries to detect the cluster name automatically. + .SH For create, build, or grow: .TP diff --git a/mdadm.c b/mdadm.c index 2ab006a..1a32328 100644 --- a/mdadm.c +++ b/mdadm.c @@ -596,6 +596,13 @@ int main(int argc, char *argv[]) exit(2); } continue; + case O(CREATE, ClusterName): + c.homecluster = optarg; + if (strlen(c.homecluster) > 64) { + pr_err("Cluster name too big.\n"); + exit(ERANGE); + } + continue; case O(CREATE,'x'): /* number of spare (eXtra) disks */ if (s.sparedisks) { pr_err("spare-devices set twice: %d and %s\n", @@ -1276,6 +1283,16 @@ int main(int argc, char *argv[]) c.require_homehost = 0; } + if (c.homecluster == NULL && (c.nodes > 0)) { + c.homecluster = conf_get_homecluster(); + if (c.homecluster == NULL) + rv = get_cluster_name(&c.homecluster); + if (rv != 0) { + pr_err("The md can't get cluster name\n"); + exit(1); + } + } + if (c.backup_file && data_offset != INVALID_SECTORS) { pr_err("--backup-file and --data-offset are incompatible\n"); exit(2); diff --git a/mdadm.h b/mdadm.h index 9d55801..f56d9d6 100644 --- a/mdadm.h +++ b/mdadm.h @@ -345,6 +345,7 @@ enum special_options { Restore, Action, Nodes, + ClusterName, }; enum prefix_standard { @@ -420,6 +421,7 @@ struct context { int invalid_backup; char *action; int nodes; + char *homecluster; }; struct 
shape { @@ -1032,6 +1034,7 @@ struct supertype { int devcnt; int retry_soon; int nodes; + char *cluster_name; struct mdinfo *devs; @@ -1308,6 +1311,7 @@ extern char *conf_get_mailaddr(void); extern char *conf_get_mailfrom(void); extern char *conf_get_program(void); extern char *conf_get_homehost(int *require_homehostp); +extern char *conf_get_homecluster(void); extern char *conf_line(FILE *file); extern char *conf_word(FILE *file, int allow_key); extern void print_quoted(char *str); @@ -1416,6 +1420,7 @@ extern char *stat2devnm(struct stat *st); extern char *fd2devnm(int fd); extern int in_initrd(void); +extern int get_cluster_name(char **name); #define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1)) #define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) diff --git a/super1.c b/super1.c index 78d98a7..bbb9f88 100644 --- a/super1.c +++ b/super1.c @@ -2054,7 +2054,7 @@ add_internal_bitmap1(struct supertype *st, bbl_size = -bbl_offset; if (!may_change || (room < 3*2 && - __le32_to_cpu(sb->max_dev) <= 384)) { + __le32_to_cpu(sb->max_dev) <= 384)) { room = 3*2; offset = 1*2; bbl_size = 0; @@ -2145,6 +2145,9 @@ add_internal_bitmap1(struct supertype *st, bms->sync_size = __cpu_to_le64(size); bms->write_behind = __cpu_to_le32(write_behind); bms->nodes = __cpu_to_le32(st->nodes); + if (st->cluster_name) + strncpy((char *)bms->cluster_name, + st->cluster_name, strlen(st->cluster_name)); *chunkp = chunk; return 1; diff --git a/util.c b/util.c index cc98d3b..9ec4aef 100644 --- a/util.c +++ b/util.c @@ -34,6 +34,15 @@ #include #include #include +#include +#include +#ifdef NO_COROSYNC + typedef uint64_t cmap_handle_t; + #define CS_OK 1 +#else + #include +#endif + /* * following taken from linux/blkpg.h because they aren't @@ -1976,3 +1985,51 @@ void reopen_mddev(int mdfd) if (fd >= 0 && fd != mdfd) dup2(fd, mdfd); } + +int get_cluster_name(char **cluster_name) +{ + void *lib_handle = NULL; + int rv = -1; + + cmap_handle_t handle; + static int 
(*initialize)(cmap_handle_t *handle); + static int (*get_string)(cmap_handle_t handle, + const char *string, + char **name); + static int (*finalize)(cmap_handle_t handle); + + + lib_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL); + if (!lib_handle) + return rv; + + initialize = dlsym(lib_handle, "cmap_initialize"); + if (!initialize) + goto out; + + get_string = dlsym(lib_handle, "cmap_get_string"); + if (!get_string) + goto out; + + finalize = dlsym(lib_handle, "cmap_finalize"); + if (!finalize) + goto out; + + rv = initialize(&handle); + if (rv != CS_OK) + goto out; + + rv = get_string(handle, "totem.cluster_name", cluster_name); + if (rv != CS_OK) { + free(*cluster_name); + rv = -1; + goto name_err; + } + + rv = 0; +name_err: + finalize(handle); +out: + dlclose(lib_handle); + return rv; +} From b98043a2f8e7bb5b1918e2e02778f822f9dd4d3a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:07 +0800 Subject: [PATCH 04/10] Show all bitmaps while examining bitmap This adds the capability of examining bitmaps corresponding to all nodes/slots on the device.
Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- bitmap.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/bitmap.c b/bitmap.c index 920033a..0c3f6de 100644 --- a/bitmap.c +++ b/bitmap.c @@ -260,7 +260,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) int rv = 1; char buf[64]; int swap; - int fd; + int fd, i; __u32 uuid32[4]; fd = bitmap_file_open(filename, &st); @@ -317,9 +317,13 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) uuid32[2], uuid32[3]); - printf(" Events : %llu\n", (unsigned long long)sb->events); - printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); - printf(" State : %s\n", bitmap_state(sb->state)); + if (sb->nodes == 0) { + printf(" Events : %llu\n", (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + + } + printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); printf(" Daemon : %ds flush period\n", sb->daemon_sleep); if (sb->write_behind) @@ -329,11 +333,40 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) printf(" Write Mode : %s\n", buf); printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2, human_size(sb->sync_size * 512)); - if (brief) - goto free_info; - printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", - info->total_bits, info->dirty_bits, - 100.0 * info->dirty_bits / (info->total_bits?:1)); + + if (sb->nodes == 0) { + if (brief) + goto free_info; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + } else { + printf(" Cluster nodes : %d\n", sb->nodes); + printf(" Cluster name : %64s\n", sb->cluster_name); + for (i = 0; i < (int)sb->nodes; i++) { + if (i) { + free(info); + info = bitmap_fd_read(fd, 
brief); + sb = &info->sb; + } + if (sb->magic != BITMAP_MAGIC) + pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); + + printf(" Node Slot : %d\n", i); + printf(" Events : %llu\n", + (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", + (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + if (brief) + continue; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + + } + } + free_info: free(info); return rv; From 4de90913020923b69515630b8f19094d2e0d1d5a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:08 +0800 Subject: [PATCH 05/10] Add a new clustered disk A clustered disk is added by the traditional --add sequence. However, other nodes need to acknowledge that they can "see" the device. This is done by --cluster-confirm: --cluster-confirm SLOTNUM:/dev/whatever (if disk is found) or --cluster-confirm SLOTNUM:missing (if disk is not found) The node initiating the --add has the disk state tagged with MD_DISK_CLUSTER_ADD, and the one confirming tags the disk with MD_DISK_CANDIDATE.
Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Manage.c | 37 ++++++++++++++++++++++++++++++++++--- ReadMe.c | 1 + md_p.h | 7 +++++++ md_u.h | 1 + mdadm.8.in | 9 +++++++++ mdadm.c | 4 ++++ mdadm.h | 2 ++ util.c | 10 ++++++++++ 8 files changed, 68 insertions(+), 3 deletions(-) diff --git a/Manage.c b/Manage.c index 2e602d7..e3bdfb3 100644 --- a/Manage.c +++ b/Manage.c @@ -690,7 +690,8 @@ skip_re_add: int Manage_add(int fd, int tfd, struct mddev_dev *dv, struct supertype *tst, mdu_array_info_t *array, int force, int verbose, char *devname, - char *update, unsigned long rdev, unsigned long long array_size) + char *update, unsigned long rdev, unsigned long long array_size, + int raid_slot) { unsigned long long ldsize; struct supertype *dev_st = NULL; @@ -880,7 +881,10 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } disc.major = major(rdev); disc.minor = minor(rdev); - disc.number =j; + if (raid_slot < 0) + disc.number = j; + else + disc.number = raid_slot; disc.state = 0; if (array->not_persistent==0) { int dfd; @@ -921,6 +925,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } free(used); } + + if (array->state & (1 << MD_SB_CLUSTERED)) { + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + if (dv->writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); if (tst->ss->external) { @@ -1240,6 +1252,7 @@ int Manage_subdevs(char *devname, int fd, * variant on 'A' * 'F' - Another variant of 'A', where the device was faulty * so must be removed from the array first. + * 'c' - confirm the device as found (for clustered environments) * * For 'f' and 'r', the device can also be a kernel-internal * name such as 'sdb'. 
@@ -1255,6 +1268,7 @@ int Manage_subdevs(char *devname, int fd, struct mdinfo info; int frozen = 0; int busy = 0; + int raid_slot = -1; if (ioctl(fd, GET_ARRAY_INFO, &array)) { pr_err("Cannot get array info for %s\n", @@ -1283,6 +1297,17 @@ int Manage_subdevs(char *devname, int fd, int rv; int mj,mn; + raid_slot = -1; + if (dv->disposition == 'c') { + rv = parse_cluster_confirm_arg(dv->devname, + &dv->devname, + &raid_slot); + if (!rv) { + pr_err("Could not get the devname of cluster\n"); + goto abort; + } + } + if (strcmp(dv->devname, "failed") == 0 || strcmp(dv->devname, "faulty") == 0) { if (dv->disposition != 'A' @@ -1308,6 +1333,11 @@ int Manage_subdevs(char *devname, int fd, if (strcmp(dv->devname, "missing") == 0) { struct mddev_dev *add_devlist = NULL; struct mddev_dev **dp; + if (dv->disposition == 'c') { + rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); + break; + } + if (dv->disposition != 'A') { pr_err("'missing' only meaningful with --re-add\n"); goto abort; @@ -1438,6 +1468,7 @@ int Manage_subdevs(char *devname, int fd, case 'A': case 'M': /* --re-add missing */ case 'F': /* --re-add faulty */ + case 'c': /* --cluster-confirm */ /* add the device */ if (subarray) { pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); @@ -1471,7 +1502,7 @@ int Manage_subdevs(char *devname, int fd, } rv = Manage_add(fd, tfd, dv, tst, &array, force, verbose, devname, update, - rdev, array_size); + rdev, array_size, raid_slot); close(tfd); tfd = -1; if (rv < 0) diff --git a/ReadMe.c b/ReadMe.c index c6286ae..c854cd5 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -169,6 +169,7 @@ struct option long_options[] = { {"wait", 0, 0, WaitOpt}, {"wait-clean", 0, 0, Waitclean }, {"action", 1, 0, Action }, + {"cluster-confirm", 0, 0, ClusterConfirm}, /* For Detail/Examine */ {"brief", 0, 0, Brief}, diff --git a/md_p.h b/md_p.h index c4846ba..9b6b5f8 100644 --- a/md_p.h +++ b/md_p.h @@ -78,6 +78,12 @@ #define MD_DISK_ACTIVE 1 /* disk is running but 
may not be in sync */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered enviroments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered enviroments only. + */ #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. * read requests will only be sent here in @@ -106,6 +112,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */ #define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays * in container can be activated */ +#define MD_SB_CLUSTERED 5 /* MD is clustered */ #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ typedef struct mdp_superblock_s { diff --git a/md_u.h b/md_u.h index be9868a..76068d6 100644 --- a/md_u.h +++ b/md_u.h @@ -44,6 +44,7 @@ #define STOP_ARRAY _IO (MD_MAJOR, 0x32) #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) typedef struct mdu_version_s { int major; diff --git a/mdadm.8.in b/mdadm.8.in index a07ddb7..3dd000c 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -1406,6 +1406,15 @@ will avoid reading from these devices if possible. .BR \-\-readwrite Subsequent devices that are added or re\-added will have the 'write-mostly' flag cleared. +.TP +.BR \-\-cluster\-confirm +Confirm the existence of the device. This is issued in response to an \-\-add +request by a node in a cluster. When a node adds a device it sends a message +to all nodes in the cluster to look for a device with a UUID. This translates +to a udev notification with the UUID of the device to be added and the slot +number. The receiving node must acknowledge this message +with \-\-cluster\-confirm. 
Valid arguments are <slot>:<devicename> in case +the device is found or <slot>:missing in case the device is not found. .P Each of these options requires that the first device listed is the array diff --git a/mdadm.c b/mdadm.c index 1a32328..859c48d 100644 --- a/mdadm.c +++ b/mdadm.c @@ -196,6 +196,7 @@ int main(int argc, char *argv[]) case 'f': case Fail: case ReAdd: /* re-add */ + case ClusterConfirm: if (!mode) { newmode = MANAGE; shortopt = short_bitmap_options; @@ -933,6 +934,9 @@ int main(int argc, char *argv[]) * remove the device */ devmode = 'f'; continue; + case O(MANAGE, ClusterConfirm): + devmode = 'c'; + continue; case O(MANAGE,Replace): /* Mark these devices for replacement */ devmode = 'R'; diff --git a/mdadm.h b/mdadm.h index f56d9d6..00c726e 100644 --- a/mdadm.h +++ b/mdadm.h @@ -346,6 +346,7 @@ enum special_options { Action, Nodes, ClusterName, + ClusterConfirm, }; enum prefix_standard { @@ -1281,6 +1282,7 @@ extern int parse_uuid(char *str, int uuid[4]); extern int parse_layout_10(char *layout); extern int parse_layout_faulty(char *layout); extern long parse_num(char *num); +extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot); extern int check_ext2(int fd, char *name); extern int check_reiser(int fd, char *name); extern int check_raid(int fd, char *name); diff --git a/util.c b/util.c index 9ec4aef..ea6e688 100644 --- a/util.c +++ b/util.c @@ -280,6 +280,16 @@ long parse_num(char *num) } #endif +int parse_cluster_confirm_arg(char *input, char **devname, int *slot) +{ + char *dev; + *slot = strtoul(input, &dev, 10); + if (dev == input || dev[0] != ':') + return -1; + *devname = dev+1; + return 0; +} + void remove_partitions(int fd) { /* remove partitions from this block devices.
From 7c25f4d706ac14165fd8d85b169f491f1f14b7cc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:09 +0800 Subject: [PATCH 06/10] Convert a bitmap=none device to clustered This adds the ability to convert a regular md without bitmap (--bitmap=none) to a clustered device (--bitmap=clustered). To convert a device with --bitmap=internal or --bitmap=external, you have to convert to --bitmap=none and then re-execute the command with --bitmap=clustered. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Grow.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Grow.c b/Grow.c index 3180be9..90a7fe9 100644 --- a/Grow.c +++ b/Grow.c @@ -330,8 +330,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) } return 0; } - pr_err("Internal bitmap already present on %s\n", - devname); + pr_err("%s bitmap already present on %s\n", s->bitmap_file, devname); return 1; } @@ -375,7 +374,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) free(st); return 1; } - if (strcmp(s->bitmap_file, "internal") == 0) { + if (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0) { int rv; int d; int offset_setable = 0; @@ -384,6 +384,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); return 1; } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); if (mdi) offset_setable = 1; @@ -426,6 +428,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", mdi->bitmap_offset); } else { + if (strcmp(s->bitmap_file, "clustered") == 0) + array.state |= (1< Date: Wed, 10 Jun 2015 13:42:10 +0800 Subject: [PATCH 07/10] Skip clustered devices in incremental We want the clustered devices to be 
started exclusively by a cluster resource-agent. So, avoid starting using the incremental option. This also skips a clustered md from starting during boot in inactive mode. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Incremental.c | 5 +++++ super1.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/Incremental.c b/Incremental.c index 0c9a9a4..5450a5c 100644 --- a/Incremental.c +++ b/Incremental.c @@ -232,6 +232,11 @@ int Incremental(struct mddev_dev *devlist, struct context *c, devname); goto out; } + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; /* 3a/ if not, check for homehost match. If no match, continue * but don't trust the 'name' in the array. Thus a 'random' minor diff --git a/super1.c b/super1.c index bbb9f88..a95c8d0 100644 --- a/super1.c +++ b/super1.c @@ -891,6 +891,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) info->array.state = (__le64_to_cpu(sb->resync_offset) == MaxSector) ? 1 : 0; + if (__le32_to_cpu(bsb->nodes) > 1) + info->array.state |= (1 << MD_SB_CLUSTERED); info->data_offset = __le64_to_cpu(sb->data_offset); info->component_size = __le64_to_cpu(sb->size); From 0aa2f15b207c46ccaf4aa7a082ef7fdd186c7609 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:11 +0800 Subject: [PATCH 08/10] mdadm: add the ability to change cluster name To support changing the cluster name, this commit does the following: 1. extend the original write_bitmap function for the new scenario. 2. add the scenario to handle the modification of the cluster's name in write_bitmap1. 3.
let the cluster name also show in examine_super1 and detail_super1 Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Assemble.c | 5 ++++- Grow.c | 2 +- mdadm.8.in | 6 ++++++ mdadm.c | 5 ++++- mdadm.h | 7 ++++++- super0.c | 4 ++-- super1.c | 23 +++++++++++++++++++++-- 7 files changed, 44 insertions(+), 8 deletions(-) diff --git a/Assemble.c b/Assemble.c index 42710a8..12ac299 100644 --- a/Assemble.c +++ b/Assemble.c @@ -626,7 +626,10 @@ static int load_devices(struct devs *devices, char *devmap, if (strcmp(c->update, "byteorder") == 0) err = 0; - else + else if (strcmp(c->update, "home-cluster") == 0) { + tst->cluster_name = c->homecluster; + tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else err = tst->ss->update_super(tst, content, c->update, devname, c->verbose, ident->uuid_set, diff --git a/Grow.c b/Grow.c index 90a7fe9..857c7e1 100644 --- a/Grow.c +++ b/Grow.c @@ -412,7 +412,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) bitmapsize, offset_setable, major) ) - st->ss->write_bitmap(st, fd2); + st->ss->write_bitmap(st, fd2, NoUpdate); else { pr_err("failed to create internal bitmap - chunksize problem.\n"); close(fd2); diff --git a/mdadm.8.in b/mdadm.8.in index 3dd000c..99b02a3 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -1098,6 +1098,7 @@ argument given to this flag can be one of .BR uuid , .BR name , .BR homehost , +.BR home-cluster , .BR resync , .BR byteorder , .BR devicesize , @@ -1159,6 +1160,11 @@ as recorded in the superblock. For version-0 superblocks, this is the same as updating the UUID. For version-1 superblocks, this involves updating the name. +The +.B home\-cluster +option will change the cluster name as recorded in the superblock and +bitmap. This option only works for clustered environment. 
+ The .B resync option will cause the array to be marked diff --git a/mdadm.c b/mdadm.c index 859c48d..426e673 100644 --- a/mdadm.c +++ b/mdadm.c @@ -598,6 +598,7 @@ int main(int argc, char *argv[]) } continue; case O(CREATE, ClusterName): + case O(ASSEMBLE, ClusterName): c.homecluster = optarg; if (strlen(c.homecluster) > 64) { pr_err("Cluster name too big.\n"); @@ -741,6 +742,8 @@ int main(int argc, char *argv[]) continue; if (strcmp(c.update, "homehost")==0) continue; + if (strcmp(c.update, "home-cluster")==0) + continue; if (strcmp(c.update, "devicesize")==0) continue; if (strcmp(c.update, "no-bitmap")==0) @@ -780,7 +783,7 @@ int main(int argc, char *argv[]) } fprintf(outf, "Valid --update options are:\n" " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n" - " 'summaries', 'homehost', 'byteorder', 'devicesize',\n" + " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n" " 'no-bitmap', 'metadata', 'revert-reshape'\n" " 'bbl', 'no-bbl'\n" ); diff --git a/mdadm.h b/mdadm.h index 00c726e..d8b0749 100644 --- a/mdadm.h +++ b/mdadm.h @@ -354,6 +354,11 @@ enum prefix_standard { IEC }; +enum bitmap_update { + NoUpdate, + NameUpdate, +}; + /* structures read from config file */ /* List of mddevice names and identifiers * Identifiers can be: @@ -850,7 +855,7 @@ extern struct superswitch { /* if add_internal_bitmap succeeded for existing array, this * writes it out. 
*/ - int (*write_bitmap)(struct supertype *st, int fd); + int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update); /* Free the superblock and any other allocated data */ void (*free_super)(struct supertype *st); diff --git a/super0.c b/super0.c index deb5999..6ad9d39 100644 --- a/super0.c +++ b/super0.c @@ -900,7 +900,7 @@ static int write_init_super0(struct supertype *st) rv = store_super0(st, di->fd); if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) - rv = st->ss->write_bitmap(st, di->fd); + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); if (rv) pr_err("failed to write superblock to %s\n", @@ -1175,7 +1175,7 @@ static void locate_bitmap0(struct supertype *st, int fd) lseek64(fd, offset, 0); } -static int write_bitmap0(struct supertype *st, int fd) +static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update) { unsigned long long dsize; unsigned long long offset; diff --git a/super1.c b/super1.c index a95c8d0..167f2ca 100644 --- a/super1.c +++ b/super1.c @@ -256,6 +256,7 @@ static int awrite(struct align_fd *afd, void *buf, int len) static void examine_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); time_t atime; unsigned int d; int role; @@ -289,6 +290,8 @@ static void examine_super1(struct supertype *st, char *homehost) strncmp(sb->set_name, homehost, l) == 0) printf(" (local to host %s)", homehost); printf("\n"); + if (bms->nodes > 0) + printf("Cluster Name : %s", bms->cluster_name); atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL; printf(" Creation Time : %.24s\n", ctime(&atime)); c=map_num(pers, __le32_to_cpu(sb->level)); @@ -740,6 +743,7 @@ err: static void detail_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); int i; int l = homehost ? 
strlen(homehost) : 0; @@ -748,6 +752,8 @@ static void detail_super1(struct supertype *st, char *homehost) sb->set_name[l] == ':' && strncmp(sb->set_name, homehost, l) == 0) printf(" (local to host %s)", homehost); + if (bms->nodes > 0) + printf("Cluster Name : %64s", bms->cluster_name); printf("\n UUID : "); for (i=0; i<16; i++) { if ((i&3)==0 && i != 0) printf(":"); @@ -1691,7 +1697,7 @@ static int write_init_super1(struct supertype *st) sb->sb_csum = calc_sb_1_csum(sb); rv = store_super1(st, di->fd); if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) - rv = st->ss->write_bitmap(st, di->fd); + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); close(di->fd); di->fd = -1; if (rv) @@ -2175,7 +2181,7 @@ static void locate_bitmap1(struct supertype *st, int fd) lseek64(fd, offset<<9, 0); } -static int write_bitmap1(struct supertype *st, int fd) +static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update) { struct mdp_superblock_1 *sb = st->sb; bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); @@ -2185,6 +2191,19 @@ static int write_bitmap1(struct supertype *st, int fd) struct align_fd afd; unsigned int i = 0; + switch (update) { + case NameUpdate: + /* update cluster name */ + if (st->cluster_name) { + memset((char *)bms->cluster_name, 0, sizeof(bms->cluster_name)); + strncpy((char *)bms->cluster_name, st->cluster_name, 64); + } + break; + case NoUpdate: + default: + break; + } + init_afd(&afd, fd); locate_bitmap1(st, fd); From 7e6e839a265190e15742c4ecdd050aa1d9f208c6 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:12 +0800 Subject: [PATCH 09/10] mdadm: change the num of cluster node This extends the nodes option to assemble mode, so that the number of cluster nodes can be changed by the user. Before doing so, it is necessary to ensure there is enough space for those nodes; calc_bitmap_size is introduced to calculate the bitmap size of each node. 
Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- Assemble.c | 3 +++ ReadMe.c | 2 +- mdadm.8.in | 8 ++++++++ mdadm.c | 5 ++++- mdadm.h | 1 + super1.c | 37 +++++++++++++++++++++++++++++++++++++ 6 files changed, 54 insertions(+), 2 deletions(-) diff --git a/Assemble.c b/Assemble.c index 12ac299..07d363c 100644 --- a/Assemble.c +++ b/Assemble.c @@ -629,6 +629,9 @@ static int load_devices(struct devs *devices, char *devmap, else if (strcmp(c->update, "home-cluster") == 0) { tst->cluster_name = c->homecluster; tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else if (strcmp(c->update, "nodes") == 0) { + tst->nodes = c->nodes; + err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate); } else err = tst->ss->update_super(tst, content, c->update, devname, c->verbose, diff --git a/ReadMe.c b/ReadMe.c index c854cd5..d1830e1 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -140,7 +140,7 @@ struct option long_options[] = { {"homehost", 1, 0, HomeHost}, {"symlinks", 1, 0, Symlinks}, {"data-offset",1, 0, DataOffset}, - {"nodes",1, 0, Nodes}, + {"nodes",1, 0, Nodes}, /* also for --assemble */ {"home-cluster",1, 0, ClusterName}, /* For assemble */ diff --git a/mdadm.8.in b/mdadm.8.in index 99b02a3..8b7768d 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -1097,6 +1097,7 @@ argument given to this flag can be one of .BR summaries , .BR uuid , .BR name , +.BR nodes , .BR homehost , .BR home-cluster , .BR resync , @@ -1149,6 +1150,13 @@ The .B name option will change the .I name +of the array as stored in the superblock and bitmap. This option only +works for clustered environment. + +The +.B nodes +option will change the +.I nodes of the array as stored in the superblock. This is only supported for version-1 superblocks. 
diff --git a/mdadm.c b/mdadm.c index 426e673..c4daf25 100644 --- a/mdadm.c +++ b/mdadm.c @@ -589,6 +589,7 @@ int main(int argc, char *argv[]) } ident.raid_disks = s.raiddisks; continue; + case O(ASSEMBLE, Nodes): case O(CREATE, Nodes): c.nodes = parse_num(optarg); if (c.nodes <= 0) { @@ -744,6 +745,8 @@ int main(int argc, char *argv[]) continue; if (strcmp(c.update, "home-cluster")==0) continue; + if (strcmp(c.update, "nodes")==0) + continue; if (strcmp(c.update, "devicesize")==0) continue; if (strcmp(c.update, "no-bitmap")==0) @@ -782,7 +785,7 @@ int main(int argc, char *argv[]) Name, c.update); } fprintf(outf, "Valid --update options are:\n" - " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n" + " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n" " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n" " 'no-bitmap', 'metadata', 'revert-reshape'\n" " 'bbl', 'no-bbl'\n" diff --git a/mdadm.h b/mdadm.h index d8b0749..97892e6 100644 --- a/mdadm.h +++ b/mdadm.h @@ -357,6 +357,7 @@ enum prefix_standard { enum bitmap_update { NoUpdate, NameUpdate, + NodeNumUpdate, }; /* structures read from config file */ diff --git a/super1.c b/super1.c index 167f2ca..ba74a33 100644 --- a/super1.c +++ b/super1.c @@ -134,6 +134,20 @@ struct misc_dev_info { |MD_FEATURE_NEW_OFFSET \ ) +/* return how many bytes are needed for bitmap, for cluster-md each node + * should have it's own bitmap */ +static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary) +{ + unsigned long long bits, bytes; + + bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, boundary); + + return bytes; +} + static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) { unsigned int disk_csum, csum; @@ -2190,6 +2204,7 @@ static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update int towrite, n; struct align_fd afd; unsigned int i 
= 0; + unsigned long long total_bm_space, bm_space_per_node; switch (update) { case NameUpdate: @@ -2199,6 +2214,28 @@ static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update strncpy((char *)bms->cluster_name, st->cluster_name, 64); } break; + case NodeNumUpdate: + /* cluster md only supports superblock 1.2 now */ + if (st->minor_version != 2) { + pr_err("Warning: cluster md only works with superblock 1.2\n"); + return -EINVAL; + } + + /* Each node has an independent bitmap, it is necessary to calculate the + * space is enough or not, first get how many bytes for the total bitmap */ + bm_space_per_node = calc_bitmap_size(bms, 4096); + + total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset)); + total_bm_space = total_bm_space - 4096; /* leave another 4k for superblock */ + + if (bm_space_per_node * st->nodes > total_bm_space) { + pr_err("Warning: The max num of nodes can't exceed %llu\n", + total_bm_space / bm_space_per_node); + return -ENOMEM; + } + + bms->nodes = __cpu_to_le32(st->nodes); + break; case NoUpdate: default: break; From 4a3d29edce15b739803194b5fd5b41c2f3fea939 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 10 Jun 2015 13:42:13 +0800 Subject: [PATCH 10/10] Reuse calc_bitmap_size to reduce code size We can use the new added calc_bitmap_size func to remove some redundant lines. 
Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- super1.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/super1.c b/super1.c index ba74a33..fda71e3 100644 --- a/super1.c +++ b/super1.c @@ -698,12 +698,8 @@ static int copy_metadata1(struct supertype *st, int from, int to) /* have the header, can calculate * correct bitmap bytes */ bitmap_super_t *bms; - int bits; bms = (void*)buf; - bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); - bytes = (bits+7) >> 3; - bytes += sizeof(bitmap_super_t); - bytes = ROUND_UP(bytes, 512); + bytes = calc_bitmap_size(bms, 512); if (n > bytes) n = bytes; } @@ -2258,11 +2254,7 @@ static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update memset(buf, 0xff, 4096); memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); - towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); - towrite = (towrite+7) >> 3; /* bits to bytes */ - towrite += sizeof(bitmap_super_t); - /* we need the bitmaps to be at 4k boundary */ - towrite = ROUND_UP(towrite, 4096); + towrite = calc_bitmap_size(bms, 4096); while (towrite > 0) { n = towrite; if (n > 4096)