Anyway, what I have is the attached patch, which was made against
2.4.18-pre4. It applies cleanly and compiles against 2.4.18-pre9, though
it has not been tested there. In fact the patch has not been heavily
tested at all; I still need to come up with a way to cause a fault on a
single path on my test machine.
The interface I have implemented uses sysctl, which shows up under
/proc/sys if you have procfs enabled; this is what I have been using.
Under /proc/sys/dev I have created a hierarchy:
    multipath/
        version
        disk#/
            config
            drive#/
                fault  operational  recover  state
As an example, on my test system, which has a multipathed drive
(/dev/md0) with two paths, the tree looks like:
/proc/sys/dev# ls -R multipath
multipath:
0 version
multipath/0:
0 1 config
multipath/0/0:
fault operational recover state
multipath/0/1:
fault operational recover state
To recover a faulted path "0", write a non-zero ASCII string to the file
"/proc/sys/dev/multipath/0/0/recover". This sets the recover field in
the multipath_info struct. The field is checked on the next
"make_request": if the drive is marked as faulted, it will be marked as
spare; if not, the request is ignored. In either case, the recover
field is then cleared.
I am planning to design a user-level daemon to do auto-recovery, and I
will think about cleaning this whole thing up, as well as testing it
more. I'd really appreciate any comments or requests anybody might have.
Beattie.
Brian Beattie<alchemy@us.ibm.com>
IBM LTC Storage IO
----
diff -u -r --exclude-from=../dontdiff ../linux/drivers/md/multipath.c ./drivers/md/multipath.c
--- ../linux/drivers/md/multipath.c Fri Feb 8 10:45:02 2002
+++ ./drivers/md/multipath.c Thu Feb 7 16:23:46 2002
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/multipath.h>
+#include <linux/sysctl.h>
#include <asm/atomic.h>
#define MAJOR_NR MD_MAJOR
@@ -46,6 +47,67 @@
#define PRINTK(x...) do { } while (0)
#endif
+/* Version banner; multipath_ver_table exposes it as the "version" file. */
+static char multipath_version[] =
+ { "MD/LVM Multipath Storage Device Driver: ver 0.0.3" };
+
+static int multipath_proc_readstr (ctl_table *, int, struct file *, void *,
+ size_t *);
+static int multipath_proc_read_dev (ctl_table *, int, struct file *, void *,
+ size_t *);
+static int multipath_proc_disk_fault (ctl_table *, int, struct file *, void *,
+ size_t *);
+
+static void mark_disk_recovered (mddev_t *, int);
+
+static struct ctl_table_header *multipath_table_header;
+
+/*
+ * Template for the per-path sysctl table.  multipath_proc_register_disk()
+ * copies it into the path's ctl_tbl and then fills in the directory names,
+ * the data pointers/lengths of the leaf files and the child links, which is
+ * why those fields are left empty here.
+ */
+static struct multipath_disk_table multipath_disk_template = {
+	"",		/* mdname: set at registration time */
+	NULL,		/* sysctl_header: set at registration time */
+	{		/* disk[]: the files inside one path directory */
+		{MULTIPATH_DSTATE, "state", NULL, 0, 0444, NULL,
+		 &proc_dointvec},
+		{MULTIPATH_DFAULT, "fault", NULL, 0, 0444, NULL,
+		 &multipath_proc_disk_fault},
+		{MULTIPATH_OPER, "operational", NULL, 0, 0444, NULL,
+		 &proc_dointvec},
+		/*
+		 * "recover" must not share MULTIPATH_OPER with "operational":
+		 * two entries of one directory with the same ctl_name cannot
+		 * be told apart by number.  MULTIPATH_OPER is the last id of
+		 * the per-disk enum in sysctl.h, so the next value is free.
+		 */
+		{MULTIPATH_OPER + 1, "recover", NULL, 0, 0644, NULL,
+		 &proc_dointvec},
+		{0},
+	},
+	{{MULTIPATH_DISK, NULL, NULL, 0, 0555, NULL}, {0} },	/* md[]: path directory */
+	{{MULTIPATH_DEV, NULL, NULL, 0, 0555, NULL},{0} },	/* dir[]: array directory */
+	{{DEV_MULTIPATH, "multipath", NULL, 0, 0555, NULL},{0}},	/* mp[] */
+	{{CTL_DEV, "dev", NULL, 0, 0555, NULL},{0}}		/* dev[] */
+};
+
+/*
+ * Template for the per-array sysctl table.  multipath_proc_register_dev()
+ * copies it into conf->ctl_tbl and then fills in the directory name, the
+ * data pointer of "config" and the child links between the levels.
+ */
+static struct multipath_dev_table multipath_dev_template = {
+ "",
+ NULL,
+ {
+ {MULTIPATH_CONF, "config", NULL, 0, 0444, NULL,
+ &multipath_proc_read_dev},
+ {0},
+ },
+ {{MULTIPATH_DEV, NULL, NULL, 0, 0555, NULL},{0}},
+ {{DEV_MULTIPATH, "multipath", NULL, 0, 0555, NULL},{0}},
+ {{CTL_DEV, "dev", NULL, 0, 0555, NULL},{0}}
+};
+
+/* Leaf table: the read-only "version" file backed by multipath_version. */
+static ctl_table multipath_ver_table[] = {
+ {MULTIPATH_VER, "version", &multipath_version,
+ sizeof(multipath_version), 0444, NULL, &multipath_proc_readstr}, {0}
+};
+
+/* The "multipath" directory, whose child is the version table. */
+static ctl_table multipath_dir_table[] = {
+ {DEV_MULTIPATH, "multipath", NULL, 0, 0555, multipath_ver_table},
+ {0}
+};
+
+/* Root of the static tree ("dev"); this is what multipath_init() registers. */
+static ctl_table multipath_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, multipath_dir_table},
+ {0}
+};
static mdk_personality_t multipath_personality;
static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
@@ -53,7 +115,205 @@
static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
+/*
+ * Publish the per-array sysctl directory for @md, containing the "config"
+ * file.
+ *
+ * The table is stored inside @conf (conf->ctl_tbl).  It is initialised from
+ * multipath_dev_template, named after the md minor number, and its levels
+ * are chained dev -> mp -> dir -> md before being registered.
+ *
+ * Returns 0 on success, or -ENOMEM when sysctl support is configured and
+ * register_sysctl_table() handed back no header.
+ */
+static int multipath_proc_register_dev(mddev_t *md, multipath_conf_t *conf)
+{
+	struct multipath_dev_table *t = &conf->ctl_tbl;
+
+	memcpy(t, &multipath_dev_template, sizeof(*t));
+
+	/* the array directory is named after the md minor number */
+	snprintf(t->mdname, sizeof(t->mdname), "%d", md->__minor);
+	t->dir[0].procname = t->mdname;
+
+	/* multipath_proc_read_dev() reaches the array through this pointer */
+	t->md[0].data = md;
+
+	/* the template carries no child pointers; link the levels here */
+	t->dev[0].child = t->mp;
+	t->mp[0].child = t->dir;
+	t->dir[0].child = t->md;
+
+	t->sysctl_header = register_sysctl_table(t->dev, 1);
+#ifdef CONFIG_SYSCTL
+	/*
+	 * The caller tests our return value, so a failed registration has
+	 * to be reported.  Only done with sysctl configured, so that a
+	 * kernel without it does not see a missing header as an error.
+	 */
+	if (!t->sysctl_header)
+		return -ENOMEM;
+#endif
+
+	return 0;
+}
+
+/*
+ * Remove the per-array sysctl directory registered for @conf.
+ * Safe to call when nothing was registered.  Always returns 0.
+ */
+static int multipath_proc_unregister_dev( multipath_conf_t *conf )
+{
+	struct multipath_dev_table *t = &conf->ctl_tbl;
+
+	/*
+	 * The header is stored unchecked at registration time, so it can be
+	 * NULL here; never hand a NULL header to unregister_sysctl_table().
+	 */
+	if (t->sysctl_header) {
+		unregister_sysctl_table( t->sysctl_header );
+		t->sysctl_header = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Publish the sysctl directory of one path, with its "state", "fault",
+ * "operational" and "recover" files.
+ *
+ * @conf: array the path belongs to; its table supplies the array directory
+ *        name, so the array must already have been registered
+ * @disk: per-path driver state; the table is stored in disk->ctl_tbl
+ * @desc: superblock descriptor of the path, backing "state" and "fault"
+ *
+ * Returns 0 on success, or -ENOMEM when sysctl support is configured and
+ * register_sysctl_table() handed back no header.
+ */
+static int multipath_proc_register_disk( multipath_conf_t *conf,
+		struct multipath_info *disk, mdp_disk_t *desc )
+{
+	struct multipath_disk_table *t = &disk->ctl_tbl;
+
+	memcpy(t, &multipath_disk_template, sizeof(*t));
+
+	/* the path directory is named after the path number */
+	snprintf(t->mdname, sizeof(t->mdname), "%d", disk->number);
+	t->md[0].procname = t->mdname;
+
+	/* the array directory reuses the name stored in the array's table */
+	t->dir[0].procname = conf->ctl_tbl.dir[0].procname;
+
+	/* "state": raw state word of the superblock descriptor */
+	t->disk[0].data = &(desc->state);
+	t->disk[0].maxlen = sizeof(desc->state);
+	/* "fault": the handler inspects the descriptor itself, no length */
+	t->disk[1].data = desc;
+	t->disk[1].maxlen = 0;
+	/* "operational" */
+	t->disk[2].data = &(disk->operational);
+	t->disk[2].maxlen = sizeof(disk->operational);
+	/* "recover": writable flag marking the path for recovery */
+	t->disk[3].data = &(disk->recover);
+	t->disk[3].maxlen = sizeof(disk->recover);
+
+	/* chain the levels: dev -> mp -> dir -> md -> disk */
+	t->dev[0].child = t->mp;
+	t->mp[0].child = t->dir;
+	t->dir[0].child = t->md;
+	t->md[0].child = t->disk;
+
+	t->sysctl_header = register_sysctl_table(t->dev, 0);
+#ifdef CONFIG_SYSCTL
+	/* report a failed registration instead of always claiming success */
+	if (!t->sysctl_header)
+		return -ENOMEM;
+#endif
+
+	return 0;
+}
+
+/*
+ * Remove the sysctl directory registered for path @disk.
+ * Safe to call when nothing was registered.  Always returns 0.
+ */
+static int multipath_proc_unregister_disk( struct multipath_info *disk )
+{
+	struct multipath_disk_table *t = &disk->ctl_tbl;
+
+	/*
+	 * The header is stored unchecked at registration time, so it can be
+	 * NULL here; never hand a NULL header to unregister_sysctl_table().
+	 */
+	if (t->sysctl_header) {
+		unregister_sysctl_table( t->sysctl_header );
+		t->sysctl_header = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * proc handler for a read-only string entry.
+ *
+ * Copies the string at tbl->data (at most tbl->maxlen bytes) to the user
+ * buffer and appends a newline when there is room for it.  Only a read at
+ * file position 0 produces data; any later read reports 0 bytes.
+ *
+ * Returns 0 with *lenp set to the number of bytes produced, -EACCES on a
+ * write attempt, or -EFAULT if the user buffer cannot be written.
+ */
+static int multipath_proc_readstr (ctl_table *tbl, int write, struct file *f,
+		void *buffer, size_t *lenp)
+{
+	size_t n;
+
+	if ( write )
+		return -EACCES;	/* readonly string */
+
+	/* check for no or zero length data, or data already read */
+	if (!tbl->data || !tbl->maxlen || !*lenp || f->f_pos ) {
+		*lenp = 0;
+		return 0;
+	}
+
+	n = strlen(tbl->data);
+	if ( n > (size_t)tbl->maxlen )
+		n = tbl->maxlen;
+	if ( n > *lenp )
+		n = *lenp;
+
+	/* copy the entry's own string, not a hard-wired global */
+	if ( n && copy_to_user( buffer, tbl->data, n ) )
+		return -EFAULT;
+
+	if ( n < *lenp ) {
+		if ( put_user('\n', ((char *)buffer) + n) )
+			return -EFAULT;
+		n++;
+	}
+
+	*lenp = n;
+	f->f_pos += n;
+
+	return 0;
+}
+
+/*
+ * proc handler for the read-only "fault" file of a path.
+ *
+ * t->data points at the path's superblock descriptor; the file reads "y\n"
+ * when disk_faulty() is true for it and "n\n" otherwise.  Only a read at
+ * file position 0 produces data.
+ *
+ * Returns 0 with *s set to the number of bytes produced, -EACCES on a
+ * write attempt, or -EFAULT if the user buffer cannot be written.
+ */
+static int multipath_proc_disk_fault (ctl_table *t, int w, struct file *f,
+		void *b, size_t *s)
+{
+	char reply[2];
+	size_t len = sizeof(reply);
+
+	if ( w )
+		return -EACCES;	/* Readonly */
+
+	/* check for no data, or data already read */
+	if (!t->data || !*s || f->f_pos ) {
+		*s = 0;
+		return 0;
+	}
+
+	/*
+	 * b is a user-space pointer: build the answer in kernel memory and
+	 * move it across with copy_to_user() rather than formatting into b.
+	 */
+	reply[0] = disk_faulty( (mdp_disk_t *)t->data ) ? 'y' : 'n';
+	reply[1] = '\n';
+
+	if ( len > *s )
+		len = *s;
+	if ( copy_to_user( b, reply, len ) )
+		return -EFAULT;
+
+	*s = len;
+	f->f_pos += len;
+
+	return 0;
+}
+
+
+/*
+ * proc handler for the read-only "config" file of an array.
+ *
+ * t->data is the mddev.  The output is one summary line with the disk
+ * counters of the array, followed by one two-line record per path.  A
+ * record is only emitted while at least LEN_DSK bytes of the user buffer
+ * remain; records that do not fit are dropped, because only a read at file
+ * position 0 produces data.
+ *
+ * Returns 0 with *s set to the number of bytes produced, -EACCES on a
+ * write attempt, or -EFAULT if the buffer is too small for the summary
+ * line or cannot be written.
+ */
+static int multipath_proc_read_dev (ctl_table *t, int w, struct file *f,
+		void *b, size_t *s)
+{
+#define LEN_DSK 85	/* room reserved for one per-path record */
+	mddev_t *md;
+	multipath_conf_t *conf;
+	struct multipath_info *info;
+	char line[LEN_DSK + 1];	/* kernel-side staging buffer */
+	size_t len, n;
+	int path;
+
+	if ( w )
+		return -EACCES;	/* readonly */
+
+	if (!t->data || !*s || f->f_pos ) {
+		*s = 0;
+		return 0;
+	}
+
+	md = t->data;
+	conf = mddev_to_conf( md );
+
+	/*
+	 * b is a user-space pointer, so every piece is formatted into line[]
+	 * first and then moved across with copy_to_user().
+	 */
+	snprintf( line, sizeof(line),
+		"nr_disks %3d: raid_disks %3d: working_disks %3d\n",
+		conf->nr_disks&255, conf->raid_disks&255,
+		conf->working_disks&255 );
+	len = strlen( line );
+
+	if ( *s < len )		/* summary line does not fit */
+		return -EFAULT;	/* error code kept from the first version */
+	if ( copy_to_user( b, line, len ) )
+		return -EFAULT;
+
+	for ( path = 0; path < conf->nr_disks; path++ ) {
+		if ( *s < len + LEN_DSK )	/* no room for another record */
+			break;
+
+		info = &conf->multipaths[path];
+
+		snprintf( line, sizeof(line),
+			"%3d: disk %3d: dev %3d.%3d\n"
+			"\tworking %c: spare %c: used %c "
+			"ops %10u\n",
+			info->number&255, info->raid_disk&255,
+			MAJOR(info->dev), MINOR(info->dev),
+			info->operational? 'y' : 'n',
+			info->spare? 'y' : 'n',
+			info->used_slot? 'y' : 'n',
+			info->nr_ops );
+		n = strlen( line );
+
+		if ( copy_to_user( (char *)b + len, line, n ) )
+			return -EFAULT;
+		len += n;
+	}
+
+	*s = len;
+	f->f_pos = len;
+	return 0;
+#undef LEN_DSK
+}
static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
{
@@ -245,10 +505,14 @@
struct buffer_head *bh_req;
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
+ int disk;
if (!buffer_locked(bh))
BUG();
+ for (disk = 0; disk < conf->raid_disks; disk++)
+ if (conf->multipaths[disk].recover)
+ mark_disk_recovered( mddev, disk );
/*
* make_request() can abort the operation when READA is being
* used and no empty request is available.
@@ -277,6 +541,9 @@
/* bh_req->b_rsector = bh->n_rsector; */
bh_req->b_end_io = multipath_end_request;
bh_req->b_private = mp_bh;
+
+ multipath->nr_ops++;
+
generic_make_request (rw, bh_req);
return 0;
}
@@ -305,6 +572,27 @@
"multipath: IO failure on %s, disabling IO path. \n" \
" Operation continuing on %d IO paths.\n"
+#define REG_D_ERROR KERN_ERR \
+"multipath: proc_register failed for disk %d\n"
+
+/*
+ * Act on a pending "recover" request for path @recovered of @mddev.
+ *
+ * The request flag is always consumed.  If the path's superblock descriptor
+ * is marked faulty, the superblock counters are adjusted (one more active
+ * and working disk, one fewer failed disk), the descriptor is marked spare
+ * and the multipath thread is woken; otherwise nothing else happens.
+ */
+static void mark_disk_recovered (mddev_t *mddev, int recovered)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	struct multipath_info *path = &conf->multipaths[recovered];
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *desc = &sb->disks[path->number];
+
+	/* the request is used up whether or not it can be honoured */
+	path->recover = 0;
+
+	/* only disks marked faulty can be recovered */
+	if ( disk_faulty(desc) ) {
+		sb->failed_disks--;
+		sb->working_disks++;
+		sb->active_disks++;
+		mark_disk_spare(desc);
+		md_wakeup_thread(conf->thread);
+	}
+}
+
static void mark_disk_bad (mddev_t *mddev, int failed)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
@@ -312,6 +600,7 @@
mdp_super_t *sb = mddev->sb;
multipath->operational = 0;
+ multipath->recover = 0;
mark_disk_faulty(sb->disks+multipath->number);
mark_disk_nonsync(sb->disks+multipath->number);
mark_disk_inactive(sb->disks+multipath->number);
@@ -399,14 +688,6 @@
int i;
struct multipath_info *tmp;
- printk("MULTIPATH conf printout:\n");
- if (!conf) {
- printk("(conf==NULL)\n");
- return;
- }
- printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
- conf->raid_disks, conf->nr_disks);
-
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->multipaths + i;
if (tmp->spare || tmp->operational || tmp->number ||
@@ -633,6 +914,9 @@
rdisk->dev = MKDEV(0,0);
rdisk->used_slot = 0;
conf->nr_disks--;
+
+ multipath_proc_unregister_disk( rdisk );
+
break;
case DISKOP_HOT_ADD_DISK:
@@ -654,6 +938,9 @@
adisk->used_slot = 1;
conf->nr_disks++;
+ if (multipath_proc_register_disk( conf, adisk, added_desc ) )
+ printk(REG_D_ERROR, adisk->number);
+
break;
default:
@@ -824,6 +1111,9 @@
#define THREAD_ERROR KERN_ERR \
"multipath: couldn't allocate thread for md%d\n"
+#define REG_ERROR KERN_ERR \
+"multipath: proc_register failed for md%d\n"
+
static int multipath_run (mddev_t *mddev)
{
multipath_conf_t *conf;
@@ -855,6 +1145,11 @@
}
memset(conf, 0, sizeof(*conf));
+ if (multipath_proc_register_dev(mddev, conf)) {
+ printk(REG_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+
ITERATE_RDEV(mddev,rdev,tmp) {
if (rdev->faulty) {
/* this is a "should never happen" case and if it */
@@ -908,6 +1203,7 @@
} else
mark_disk_spare(desc);
+ multipath_proc_register_disk( conf, disk, desc );
if(!num_rdevs++) def_rdev = rdev;
}
if(!conf->working_disks && num_rdevs) {
@@ -1031,11 +1327,21 @@
#undef NONE_OPERATIONAL
#undef SB_DIFFERENCES
#undef ARRAY_IS_ACTIVE
+#undef REG_ERROR
+#undef REG_D_ERROR
static int multipath_stop (mddev_t *mddev)
{
multipath_conf_t *conf = mddev_to_conf(mddev);
+ int i, disks = MD_SB_DISKS;
+
+ /* unregister all disks */
+ for (i = 0; i < disks; i++) {
+ if (conf->multipaths[i].used_slot )
+ multipath_proc_unregister_disk( &conf->multipaths[i] );
+ }
+ multipath_proc_unregister_dev( conf );
md_unregister_thread(conf->thread);
multipath_shrink_mpbh(conf);
kfree(conf);
@@ -1057,11 +1363,13 @@
+/*
+ * Module init: register the static sysctl tree (multipath_root_table), then
+ * register the multipath personality with md and return that result.
+ * NOTE(review): the register_sysctl_table() result is not checked, and the
+ * tree stays registered if register_md_personality() fails -- confirm.
+ */
static int md__init multipath_init (void)
{
+ multipath_table_header = register_sysctl_table(multipath_root_table, 1);
return register_md_personality (MULTIPATH, &multipath_personality);
}
+/*
+ * Module exit: drop the static sysctl tree registered by multipath_init(),
+ * then unregister the multipath personality.
+ * NOTE(review): multipath_table_header is passed on without a NULL check.
+ */
static void multipath_exit (void)
{
+ unregister_sysctl_table(multipath_table_header);
unregister_md_personality (MULTIPATH);
}
Only in ./include/linux: modules
diff -u -r --exclude-from=../dontdiff ../linux/include/linux/raid/multipath.h ./include/linux/raid/multipath.h
--- ../linux/include/linux/raid/multipath.h Mon Nov 12 09:51:56 2001
+++ ./include/linux/raid/multipath.h Thu Feb 7 16:10:40 2002
@@ -2,19 +2,43 @@
#define _MULTIPATH_H
#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+
+/*
+ * sysctl tables for one multipath array.  Each ctl_table array is one
+ * level of the directory chain dev -> mp -> dir -> md; the levels are
+ * linked through their child pointers when the array is registered.
+ */
+struct multipath_dev_table {
+ char mdname[8]; /* directory name: the md minor number in decimal */
+ struct ctl_table_header *sysctl_header; /* result of register_sysctl_table() */
+ ctl_table md[3]; /* files of the array directory ("config") */
+ ctl_table dir[2]; /* the array directory itself, named by mdname */
+ ctl_table mp[2]; /* the "multipath" directory */
+ ctl_table dev[2]; /* the "dev" directory, root of the chain */
+};
+
+/*
+ * sysctl tables for one path of a multipath array.  Each ctl_table array is
+ * one level of the directory chain dev -> mp -> dir -> md -> disk; the
+ * levels are linked through their child pointers when the path is
+ * registered.
+ */
+struct multipath_disk_table {
+ char mdname[8]; /* path directory name: the path number in decimal */
+ struct ctl_table_header *sysctl_header; /* result of register_sysctl_table() */
+ ctl_table disk[5]; /* files: state, fault, operational, recover */
+ ctl_table md[2]; /* the path directory itself, named by mdname */
+ ctl_table dir[2]; /* the array directory the path lives in */
+ ctl_table mp[2]; /* the "multipath" directory */
+ ctl_table dev[2]; /* the "dev" directory, root of the chain */
+};
struct multipath_info {
int number;
int raid_disk;
kdev_t dev;
+ struct multipath_disk_table ctl_tbl;
/*
* State bits:
*/
int operational;
int spare;
+ int recover; /* marked for retry after failure */
int used_slot;
+
+ unsigned int nr_ops;
};
struct multipath_private_data {
@@ -37,6 +61,8 @@
int freer1_blocked;
int freer1_cnt;
md_wait_queue_head_t wait_buffer;
+ int last; /* last used route */
+ struct multipath_dev_table ctl_tbl;
};
typedef struct multipath_private_data multipath_conf_t;
diff -u -r --exclude-from=../dontdiff ../linux/include/linux/sysctl.h ./include/linux/sysctl.h
--- ../linux/include/linux/sysctl.h Mon Nov 26 05:29:17 2001
+++ ./include/linux/sysctl.h Mon Feb 4 14:13:46 2002
@@ -553,7 +553,8 @@
DEV_HWMON=2,
DEV_PARPORT=3,
DEV_RAID=4,
- DEV_MAC_HID=5
+ DEV_MAC_HID=5,
+ DEV_MULTIPATH=6
};
/* /proc/sys/dev/cdrom */
@@ -575,6 +576,26 @@
enum {
DEV_RAID_SPEED_LIMIT_MIN=1,
DEV_RAID_SPEED_LIMIT_MAX=2
+};
+
+/* /proc/sys/dev/multipath */
+enum {
+ MULTIPATH_VER=1,
+ MULTIPATH_DEV=2
+};
+
+/* /proc/sys/dev/multipath/md n */
+enum {
+ MULTIPATH_ROUTING=1,
+ MULTIPATH_CONF=2,
+ MULTIPATH_DISK=3
+};
+
+/* /proc/sys/dev/multipath/md n/disk n */
+enum {
+ MULTIPATH_DSTATE=1,
+ MULTIPATH_DFAULT=2,
+ MULTIPATH_OPER=3
};
/* /proc/sys/dev/parport/default */
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/