path: root/drivers/staging/ramster/tmem.c
Diffstat (limited to 'drivers/staging/ramster/tmem.c')
-rw-r--r--  drivers/staging/ramster/tmem.c  313
1 file changed, 178 insertions, 135 deletions
diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c
index 8f2f6892d8d..a2b7e03b606 100644
--- a/drivers/staging/ramster/tmem.c
+++ b/drivers/staging/ramster/tmem.c
@@ -1,33 +1,43 @@
/*
* In-kernel transcendent memory (generic implementation)
*
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
+ * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
*
* The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
* "handles" (triples containing a pool id, an object id, and an index), to
* pages in a page-accessible memory (PAM). Tmem references the PAM pages via
* an abstract "pampd" (PAM page-descriptor), which can be operated on by a
* set of functions (pamops). Each pampd contains some representation of
- * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
- * pages and must be able to insert, find, and delete these pages at a
- * potential frequency of thousands per second concurrently across many CPUs,
- * (and, if used with KVM, across many vcpus across many guests).
- * Tmem is tracked with a hierarchy of data structures, organized by
- * the elements in a handle-tuple: pool_id, object_id, and page index.
- * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
- * Each pool, contains a hash table of rb_trees of tmem_objs. Each
- * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
- * nodes called tmem_objnodes. Each leaf pointer in this tree points to
- * a pampd, which is accessible only through a small set of callbacks
- * registered by the PAM implementation (see tmem_register_pamops). Tmem
- * does all memory allocation via a set of callbacks registered by the tmem
- * host implementation (e.g. see tmem_register_hostops).
+ * PAGE_SIZE bytes worth of data. For those familiar with key-value stores,
+ * the tmem handle is a three-level hierarchical key, and the value is always
+ * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
+ * referenced in the datastore by the pampd. The hierarchy is required
+ * to ensure that certain invalidation functions can be performed efficiently
+ * (i.e. flush all indexes associated with this object_id, or
+ * flush all objects associated with this pool).
+ *
+ * Tmem must support potentially millions of pages and must be able to insert,
+ * find, and delete these pages at a potential frequency of thousands per
+ * second concurrently across many CPUs (and, if used with KVM, across many
+ * vcpus across many guests). Tmem is tracked with a hierarchy of data
+ * structures, organized by the elements in the handle-tuple: pool_id,
+ * object_id, and page index. One or more "clients" (e.g. guests) each
+ * provide one or more tmem_pools. Each pool contains a hash table of
+ * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree
+ * of pointers, with intermediate nodes called tmem_objnodes. Each leaf
+ * pointer in this tree points to a pampd, which is accessible only through
+ * a small set of callbacks registered by the PAM implementation (see
+ * tmem_register_pamops). The only memory tmem needs to allocate is for
+ * objs and objnodes, and this is done via a set of callbacks that must be
+ * registered by the tmem host implementation (e.g. see tmem_register_hostops).
*/
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
+#ifdef CONFIG_RAMSTER
#include <linux/delay.h>
+#endif
#include "tmem.h"
@@ -52,7 +62,7 @@ void tmem_register_hostops(struct tmem_hostops *m)
/*
* A tmem host implementation must use this function to register
- * callbacks for a page-accessible memory (PAM) implementation
+ * callbacks for a page-accessible memory (PAM) implementation.
*/
static struct tmem_pamops tmem_pamops;
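For context on the registration calls above (tmem_register_hostops() and tmem_register_pamops()): the pattern is simply a table of callbacks copied by value into a file-static struct. A generic sketch with stand-in names, not the actual tmem_pamops fields:

#include <linux/types.h>

struct example_ops {
        void *(*create)(void *data, size_t size);
        void (*destroy)(void *handle);
};

static struct example_ops example_ops;

void example_register_ops(struct example_ops *m)
{
        example_ops = *m;       /* keep a private copy of the caller's table */
}

Because the table is copied, the caller's struct need not remain valid after registration.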
@@ -67,42 +77,62 @@ void tmem_register_pamops(struct tmem_pamops *m)
* So an rb_tree is an ideal data structure to manage tmem_objs. But because
* of the potentially huge number of tmem_objs, each pool manages a hashtable
* of rb_trees to reduce search, insert, delete, and rebalancing time.
- * Each hashbucket also has a lock to manage concurrent access.
+ * Each hashbucket also has a lock to manage concurrent access; no
+ * searches, inserts, or deletions can be performed unless the lock is
+ * held. As a result, care must be taken to ensure tmem routines are not
+ * called recursively; a recursive call may appear to work most of the
+ * time, but a small fraction of the time it will deadlock on the
+ * hashbucket lock.
*
- * The following routines manage tmem_objs. When any tmem_obj is accessed,
- * the hashbucket lock must be held.
+ * The following routines manage tmem_objs. In all of these routines,
+ * the hashbucket lock is already held.
*/
-/* searches for object==oid in pool, returns locked object if found */
-static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
- struct tmem_oid *oidp)
+/* Search for object==oid in pool; return the object if found. */
+static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
+ struct tmem_oid *oidp,
+ struct rb_node **parent,
+ struct rb_node ***link)
{
- struct rb_node *rbnode;
- struct tmem_obj *obj;
+ struct rb_node *_parent = NULL, **rbnode;
+ struct tmem_obj *obj = NULL;
- rbnode = hb->obj_rb_root.rb_node;
- while (rbnode) {
- BUG_ON(RB_EMPTY_NODE(rbnode));
- obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
+ rbnode = &hb->obj_rb_root.rb_node;
+ while (*rbnode) {
+ BUG_ON(RB_EMPTY_NODE(*rbnode));
+ _parent = *rbnode;
+ obj = rb_entry(*rbnode, struct tmem_obj,
+ rb_tree_node);
switch (tmem_oid_compare(oidp, &obj->oid)) {
case 0: /* equal */
goto out;
case -1:
- rbnode = rbnode->rb_left;
+ rbnode = &(*rbnode)->rb_left;
break;
case 1:
- rbnode = rbnode->rb_right;
+ rbnode = &(*rbnode)->rb_right;
break;
}
}
+
+ if (parent)
+ *parent = _parent;
+ if (link)
+ *link = rbnode;
obj = NULL;
out:
return obj;
}
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
+static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
+ struct tmem_oid *oidp)
+{
+ return __tmem_obj_find(hb, oidp, NULL, NULL);
+}
-/* free an object that has no more pampds in it */
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);
+
+/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
struct tmem_pool *pool;
@@ -113,7 +143,7 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
pool = obj->pool;
BUG_ON(pool == NULL);
if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
- tmem_pampd_destroy_all_in_obj(obj);
+ tmem_pampd_destroy_all_in_obj(obj, false);
BUG_ON(obj->objnode_tree_root != NULL);
BUG_ON((long)obj->objnode_count != 0);
atomic_dec(&pool->obj_count);
@@ -125,15 +155,14 @@ static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
}
/*
- * initialize, and insert an tmem_object_root (called only if find failed)
+ * Initialize and insert a tmem_object_root (called only if find failed).
*/
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
struct tmem_pool *pool,
struct tmem_oid *oidp)
{
struct rb_root *root = &hb->obj_rb_root;
- struct rb_node **new = &(root->rb_node), *parent = NULL;
- struct tmem_obj *this;
+ struct rb_node **new = NULL, *parent = NULL;
BUG_ON(pool == NULL);
atomic_inc(&pool->obj_count);
@@ -143,24 +172,15 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
obj->oid = *oidp;
obj->objnode_count = 0;
obj->pampd_count = 0;
- (*tmem_pamops.new_obj)(obj);
+#ifdef CONFIG_RAMSTER
+ if (tmem_pamops.new_obj != NULL)
+ (*tmem_pamops.new_obj)(obj);
+#endif
SET_SENTINEL(obj, OBJ);
- while (*new) {
- BUG_ON(RB_EMPTY_NODE(*new));
- this = rb_entry(*new, struct tmem_obj, rb_tree_node);
- parent = *new;
- switch (tmem_oid_compare(oidp, &this->oid)) {
- case 0:
- BUG(); /* already present; should never happen! */
- break;
- case -1:
- new = &(*new)->rb_left;
- break;
- case 1:
- new = &(*new)->rb_right;
- break;
- }
- }
+
+ if (__tmem_obj_find(hb, oidp, &parent, &new))
+ BUG();
+
rb_link_node(&obj->rb_tree_node, parent, new);
rb_insert_color(&obj->rb_tree_node, root);
}
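The __tmem_obj_find() rework in the hunk above follows the standard kernel rb-tree idiom: a single walk that either finds the entry or records the parent node and the link slot where a new node belongs, so that tmem_obj_init() can reuse the same walk for insertion. A stripped-down sketch of the idiom with a plain integer key (generic, not the tmem structures):

#include <linux/rbtree.h>

struct example_node {
        struct rb_node rb;
        unsigned long key;
};

/* Search @root for @key; on a miss, report where a new node would go. */
static struct example_node *example_find(struct rb_root *root,
                                         unsigned long key,
                                         struct rb_node **parent,
                                         struct rb_node ***link)
{
        struct rb_node **p = &root->rb_node, *par = NULL;
        struct example_node *n;

        while (*p) {
                par = *p;
                n = rb_entry(*p, struct example_node, rb);
                if (key == n->key)
                        return n;               /* found */
                p = key < n->key ? &(*p)->rb_left : &(*p)->rb_right;
        }
        if (parent)
                *parent = par;
        if (link)
                *link = p;
        return NULL;    /* not found; *parent and *link describe the slot */
}

On a miss, insertion is then rb_link_node() on the reported slot followed by rb_insert_color(), which is exactly what tmem_obj_init() does with the values handed back by __tmem_obj_find().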
@@ -170,7 +190,7 @@ static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
* "ephemeral" vs "persistent". These attributes apply to all tmem_objs
* and all pampds that belong to a tmem_pool. A tmem_pool is created
* or deleted relatively rarely (for example, when a filesystem is
- * mounted or unmounted.
+ * mounted or unmounted).
*/
/* flush all data from a pool and, optionally, free it */
@@ -188,7 +208,7 @@ static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
while (rbnode != NULL) {
obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
rbnode = rb_next(rbnode);
- tmem_pampd_destroy_all_in_obj(obj);
+ tmem_pampd_destroy_all_in_obj(obj, true);
tmem_obj_free(obj, hb);
(*tmem_hostops.obj_free)(obj, pool);
}
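One detail worth noting in tmem_pool_flush() above: rb_next() is fetched before the current object is destroyed, because the node's links cannot be trusted once the entry is freed. The same pattern in isolation, reusing the example_node type from the earlier rb-tree sketch (kfree() stands in for the host's obj_free callback):

#include <linux/slab.h>

/* Destroy every entry in @root; grab the successor before freeing. */
static void example_destroy_all(struct rb_root *root)
{
        struct rb_node *rbnode = rb_first(root);

        while (rbnode != NULL) {
                struct example_node *n =
                        rb_entry(rbnode, struct example_node, rb);

                rbnode = rb_next(rbnode);       /* must precede the free */
                rb_erase(&n->rb, root);
                kfree(n);
        }
}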
@@ -274,7 +294,7 @@ static void tmem_objnode_free(struct tmem_objnode *objnode)
}
/*
- * lookup index in object and return associated pampd (or NULL if not found)
+ * Lookup index in object and return associated pampd (or NULL if not found).
*/
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
@@ -316,6 +336,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
return slot != NULL ? *slot : NULL;
}
+#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
void *new_pampd, bool no_free)
{
@@ -333,6 +354,7 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
}
return ret;
}
+#endif
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
void *pampd)
@@ -470,7 +492,7 @@ out:
return slot;
}
-/* recursively walk the objnode_tree destroying pampds and objnodes */
+/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
struct tmem_objnode *objnode,
unsigned int ht)
@@ -495,7 +517,8 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,
}
}
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
+static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
+ bool pool_destroy)
{
if (obj->objnode_tree_root == NULL)
return;
@@ -510,7 +533,10 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
obj->objnode_tree_height = 0;
}
obj->objnode_tree_root = NULL;
- (*tmem_pamops.free_obj)(obj->pool, obj);
+#ifdef CONFIG_RAMSTER
+ if (tmem_pamops.free_obj != NULL)
+ (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
+#endif
}
/*
@@ -523,17 +549,16 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
*/
/*
- * "Put" a page, e.g. copy a page from the kernel into newly allocated
- * PAM space (if such space is available). Tmem_put is complicated by
- * a corner case: What if a page with matching handle already exists in
- * tmem? To guarantee coherency, one of two actions is necessary: Either
- * the data for the page must be overwritten, or the page must be
- * "flushed" so that the data is not accessible to a subsequent "get".
- * Since these "duplicate puts" are relatively rare, this implementation
- * always flushes for simplicity.
+ * "Put" a page, e.g. associate the passed pampd with the passed handle.
+ * Tmem_put is complicated by a corner case: What if a page with matching
+ * handle already exists in tmem? To guarantee coherency, one of two
+ * actions is necessary: Either the data for the page must be overwritten,
+ * or the page must be "flushed" so that the data is not accessible to a
+ * subsequent "get". Since these "duplicate puts" are relatively rare,
+ * this implementation always flushes for simplicity.
*/
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
- char *data, size_t size, bool raw, int ephemeral)
+ bool raw, void *pampd_to_use)
{
struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
void *pampd = NULL, *pampd_del = NULL;
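The duplicate-put rule spelled out in the comment above (either overwrite or flush, but never leave stale data reachable by a later get) reduces to the shape below. This is only a schematic of the flush-for-simplicity choice, using a trivial fixed-size table in place of the real tmem structures and pamops callbacks:

#include <linux/slab.h>

#define EXAMPLE_SLOTS 1024

static void *example_table[EXAMPLE_SLOTS];      /* toy stand-in for tmem */

static int example_put(unsigned int index, void *pampd)
{
        if (index >= EXAMPLE_SLOTS)
                return -1;
        if (example_table[index] != NULL) {
                /* duplicate put: flush so no later get can see stale data */
                kfree(example_table[index]);
                example_table[index] = NULL;
        }
        example_table[index] = pampd;
        return 0;
}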
@@ -566,19 +591,17 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
}
BUG_ON(obj == NULL);
BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
- pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
- obj->pool, &obj->oid, index);
- if (unlikely(pampd == NULL))
- goto free;
+ pampd = pampd_to_use;
+ BUG_ON(pampd_to_use == NULL);
ret = tmem_pampd_add_to_obj(obj, index, pampd);
if (unlikely(ret == -ENOMEM))
/* may have partially built objnode tree ("stump") */
goto delete_and_free;
+ (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
goto out;
delete_and_free:
(void)tmem_pampd_delete_from_obj(obj, index);
-free:
if (pampd)
(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
if (objnew) {
@@ -590,6 +613,16 @@ out:
return ret;
}
+#ifdef CONFIG_RAMSTER
+/*
+ * For ramster only: The following routines provide a two-step sequence
+ * to allow the caller to replace a pampd in the tmem data structures with
+ * another pampd. Here, we look up the passed handle and, if found, return
+ * the associated pampd and object, leaving the hashbucket locked and
+ * returning a reference to it. The caller is expected to immediately call
+ * the matching tmem_localify_finish routine, which handles the replacement
+ * and unlocks the hashbucket.
+ */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
uint32_t index, struct tmem_obj **ret_obj,
void **saved_hb)
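The pair of routines introduced here form a two-step handoff: tmem_localify_get_pampd() returns to the caller with the hashbucket lock still held (stashed through *saved_hb), and the matching tmem_localify_finish() call is what installs the replacement pampd and releases that lock. A generic sketch of that shape, with illustrative names rather than the ramster API:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);
static void *example_slot;

/* Step 1: look up under the lock and return with the lock still held. */
static void *example_localify_get(spinlock_t **saved_lock)
{
        spin_lock(&example_lock);
        *saved_lock = &example_lock;    /* the caller now owns the lock */
        return example_slot;
}

/* Step 2: install the replacement and drop the lock taken in step 1. */
static void example_localify_finish(void *new_val, spinlock_t *saved_lock)
{
        example_slot = new_val;
        spin_unlock(saved_lock);
}

Since a spinlock is held across the two calls, the caller must not sleep in between, which is why the comment insists on calling the finish routine immediately.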
@@ -618,6 +651,7 @@ void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
if (pampd != NULL) {
BUG_ON(obj == NULL);
(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
+ (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
} else if (delete) {
BUG_ON(obj == NULL);
(void)tmem_pampd_delete_from_obj(obj, index);
@@ -625,6 +659,9 @@ void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
spin_unlock(&hb->lock);
}
+/*
+ * For ramster only. Helper function to support asynchronous tmem_get.
+ */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
struct tmem_pool *pool, struct tmem_oid *oidp,
uint32_t index, bool free, char *data)
@@ -633,7 +670,6 @@ static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
bool intransit = false;
int ret = 0;
-
if (!is_ephemeral(pool))
new_pampd = (*tmem_pamops.repatriate_preload)(
old_pampd, pool, oidp, index, &intransit);
@@ -646,60 +682,91 @@ static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
if (!intransit)
ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
oidp, index, free, data);
+ if (ret == -EAGAIN) {
+ /* rare I think, but should cond_resched()??? */
+ usleep_range(10, 1000);
+ } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
+ ret = -1;
+ } else if (ret != 0 && ret != -ENOENT) {
+ ret = -1;
+ }
+ /* note hb->lock has now been unlocked */
+ return ret;
+}
+
+/*
+ * For ramster only. If a page in tmem matches the handle, replace the
+ * page so that any subsequent "get" gets the new page. Returns 0 if
+ * there was a page to replace, else returns -1.
+ */
+int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
+ uint32_t index, void *new_pampd)
+{
+ struct tmem_obj *obj;
+ int ret = -1;
+ struct tmem_hashbucket *hb;
+
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
+ /* if we bug here, pamops wasn't properly set up for ramster */
+ BUG_ON(tmem_pamops.replace_in_obj == NULL);
+ ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
+out:
+ spin_unlock(&hb->lock);
return ret;
}
+#endif
/*
- * "Get" a page, e.g. if one can be found, copy the tmem page with the
- * matching handle from PAM space to the kernel. By tmem definition,
- * when a "get" is successful on an ephemeral page, the page is "flushed",
- * and when a "get" is successful on a persistent page, the page is retained
- * in tmem. Note that to preserve
+ * "Get" a page, e.g. if a pampd can be found matching the passed handle,
+ * use a pamops callback to recreated the page from the pampd with the
+ * matching handle. By tmem definition, when a "get" is successful on
+ * an ephemeral page, the page is "flushed", and when a "get" is successful
+ * on a persistent page, the page is retained in tmem. Note that to preserve
* coherency, "get" can never be skipped if tmem contains the data.
* That is, if a get is done with a certain handle and fails, any
* subsequent "get" must also fail (unless of course there is a
* "put" done with the same handle).
-
*/
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
- char *data, size_t *size, bool raw, int get_and_free)
+ char *data, size_t *sizep, bool raw, int get_and_free)
{
struct tmem_obj *obj;
- void *pampd;
+ void *pampd = NULL;
bool ephemeral = is_ephemeral(pool);
int ret = -1;
struct tmem_hashbucket *hb;
bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
- bool lock_held = 0;
+ bool lock_held = false;
void **ppampd;
-again:
- hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- spin_lock(&hb->lock);
- lock_held = 1;
- obj = tmem_obj_find(hb, oidp);
- if (obj == NULL)
- goto out;
- ppampd = __tmem_pampd_lookup_in_obj(obj, index);
- if (ppampd == NULL)
- goto out;
- if (tmem_pamops.is_remote(*ppampd)) {
- ret = tmem_repatriate(ppampd, hb, pool, oidp,
- index, free, data);
- lock_held = 0; /* note hb->lock has been unlocked */
- if (ret == -EAGAIN) {
- /* rare I think, but should cond_resched()??? */
- usleep_range(10, 1000);
- goto again;
- } else if (ret != 0) {
- if (ret != -ENOENT)
- pr_err("UNTESTED case in tmem_get, ret=%d\n",
- ret);
- ret = -1;
+ do {
+ hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+ spin_lock(&hb->lock);
+ lock_held = true;
+ obj = tmem_obj_find(hb, oidp);
+ if (obj == NULL)
+ goto out;
+ ppampd = __tmem_pampd_lookup_in_obj(obj, index);
+ if (ppampd == NULL)
goto out;
+#ifdef CONFIG_RAMSTER
+ if ((tmem_pamops.is_remote != NULL) &&
+ tmem_pamops.is_remote(*ppampd)) {
+ ret = tmem_repatriate(ppampd, hb, pool, oidp,
+ index, free, data);
+ /* tmem_repatriate releases hb->lock */
+ lock_held = false;
+ *sizep = PAGE_SIZE;
+ if (ret != -EAGAIN)
+ goto out;
}
- goto out;
- }
+#endif
+ } while (ret == -EAGAIN);
if (free)
pampd = tmem_pampd_delete_from_obj(obj, index);
else
@@ -715,10 +782,10 @@ again:
}
if (free)
ret = (*tmem_pamops.get_data_and_free)(
- data, size, raw, pampd, pool, oidp, index);
+ data, sizep, raw, pampd, pool, oidp, index);
else
ret = (*tmem_pamops.get_data)(
- data, size, raw, pampd, pool, oidp, index);
+ data, sizep, raw, pampd, pool, oidp, index);
if (ret < 0)
goto out;
ret = 0;
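The do/while added to tmem_get() above replaces the old goto-again label: tmem_repatriate() releases the hashbucket lock, sleeps briefly via usleep_range() when it hits -EAGAIN, and the loop then re-takes the lock and repeats the lookup; other transport errors are collapsed to -1 inside tmem_repatriate(). The same retry shape reduced to its essentials, with the repatriation step passed in as a hypothetical callback:

#include <linux/delay.h>
#include <linux/errno.h>

/* Retry @try_once while it reports -EAGAIN, with a short backoff. */
static int example_get_with_retry(int (*try_once)(void))
{
        int ret;

        do {
                ret = try_once();
                if (ret == -EAGAIN)
                        usleep_range(10, 1000); /* rare; brief backoff */
        } while (ret == -EAGAIN);

        return ret ? -1 : 0;    /* collapse remaining errors, as above */
}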
@@ -762,30 +829,6 @@ out:
}
/*
- * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page. Returns the new page if
- * there was a page to replace, else returns NULL.
- */
-int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
- uint32_t index, void *new_pampd)
-{
- struct tmem_obj *obj;
- int ret = -1;
- struct tmem_hashbucket *hb;
-
- hb = &pool->hashbucket[tmem_oid_hash(oidp)];
- spin_lock(&hb->lock);
- obj = tmem_obj_find(hb, oidp);
- if (obj == NULL)
- goto out;
- new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
- ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
-out:
- spin_unlock(&hb->lock);
- return ret;
-}
-
-/*
* "Flush" all pages in tmem matching this oid.
*/
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
@@ -799,7 +842,7 @@ int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
obj = tmem_obj_find(hb, oidp);
if (obj == NULL)
goto out;
- tmem_pampd_destroy_all_in_obj(obj);
+ tmem_pampd_destroy_all_in_obj(obj, false);
tmem_obj_free(obj, hb);
(*tmem_hostops.obj_free)(obj, pool);
ret = 0;