How to deal with old references to a resized hash table?

问题

I'm currently working on a hash table implementation in C. I'm trying to implement dynamic resizing, but came across a problem.

If resizing a hash table means creating a new one with double (or half) the size, rehashing, and deleting the old one, how can I deal with old references the user may have made to the old table? Example code (I've omitted error checking just for this example):

int main(int argc, char *argv[])
{
    ht = ht_create(5) /* make hashtable with size 5 */
    ht_insert("john", "employee"); /* key-val pair "john -> employee" */
    ht_insert("alice", "employee");
    char *position = ht_get(ht, "alice"); /* get alice's position from hashtable ht */


    ht_insert("bob", "boss"); /* this insert exceeds the load factor, resizes the hash table */

    printf("%s", position); /* returns NULL because the previous hashtable that was resized was freed */

    return 0;
}

In this case position pointed to alice's value which was found in the hashtable. When it was resized, we freed the hash table and lost it. How can I fix this problem, so the user won't have to worry that a previously defined pointer was freed?

EDIT: my current hash table implementation

hash.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hash.h"

#define LOADFACTOR 0.75

typedef struct tableentry /* hashtab entry */
{
    struct tableentry *next;
    char *key;
    void *val;
} tableentry_t;

typedef struct hashtable
{
    datatype_t type;
    size_t size;
    size_t load; /* number of keys filled */
    struct tableentry **tab;
} hashtable_t;

/* creates hashtable */
/* NOTE: dynamically allocated, remember to ht_free() */
hashtable_t *ht_create(size_t size, datatype_t type)
{
    hashtable_t *ht = NULL;
    if ((ht = malloc(sizeof(hashtable_t))) == NULL)
        return NULL;
    /* allocate ht's table */
    if ((ht->tab = malloc(sizeof(tableentry_t) * size)) == NULL)
        return NULL;
    /* null-initialize table */
    size_t i;
    for (i = 0; i < size; i++)
        ht->tab[i] = NULL;
    ht->size = size;
    ht->type = type;
    return ht;
}

/* creates hash for a hashtab */
static unsigned hash(char *s)
{
    unsigned hashval;
    for (hashval = 0; *s != '\0'; s++)
        hashval = *s + 31 * hashval;
    return hashval;
}

static int *intdup(int *i)
{
    int *new;
    if ((new = malloc(sizeof(int))) == NULL)
        return NULL;
    *new = *i;
    return new;
}

static void free_te(tableentry_t *te)
{
    free(te->key);
    free(te->val);
    free(te);
}

/* loops through linked list freeing */
static void free_te_list(tableentry_t *te)
{
    tableentry_t *next;
    while (te != NULL)
    {
        next = te->next;
        free_te(te);
        te = next;
    }
}

/* creates a key-val pair */
static tableentry_t *alloc_te(char *k, void *v, datatype_t type)
{
    tableentry_t *te = NULL;
    int status = 0;
    /* alloc struct */
    if ((te = calloc(1, sizeof(*te))) == NULL)
        status = -1;
    /* alloc key */
    if ((te->key = strdup(k)) == NULL)
        status = -1;
    /* alloc value */
    int *d;
    char *s;
    switch (type)
    {
        case STRING:
            s = (char *) v;
            if ((te->val = strdup(s)) == NULL)
                status = -1;
            break;
        case INTEGER:
            d = (int *) v;
            if ((te->val = intdup(d)) == NULL)
                status = -1;
            break;
        default:
            status = -1;
    }
    if (status < 0)
    {
        free_te_list(te);
        return NULL;
    }
    te->next = NULL;
    return te;
}

static tableentry_t *lookup(hashtable_t *ht, char *k)
{
    tableentry_t *te;
    /* step through linked list */
    for (te = ht->tab[hash(k) % ht->size]; te != NULL; te = te->next)
        if (strcmp(te->key, k) == 0)
            return te; /* found */
    return NULL; /* not found */
}

/* inserts the key-val pair */
hashtable_t *ht_insert(hashtable_t *ht, char *k, void *v)
{
    tableentry_t *te;
    /* unique entry */
    if ((te = lookup(ht, k)) == NULL)
    {
        te = alloc_te(k, v, ht->type);
        unsigned hashval = hash(k) % ht->size;
        /* insert at beginning of linked list */
        te->next = ht->tab[hashval]; 
        ht->tab[hashval] = te;
        ht->load++;
    }
    /* replace val of previous entry */
    else
    {
        free(te->val);
        switch (ht->type)
        {
            case STRING:
                if ((te->val = strdup(v)) == NULL)
                    return NULL;
                break;
            case INTEGER:
                if ((te->val = intdup(v)) == NULL)
                    return NULL;
                break;
            default:
                return NULL;
        }
    }
    return ht;
}

static void delete_te(hashtable_t *ht, char *k)
{
    tableentry_t *te, *prev;
    unsigned hashval = hash(k) % ht->size;
    te = ht->tab[hashval];
    /* point head to next element if deleting head */
    if (strcmp(te->key, k) == 0)
    {
        ht->tab[hashval] = te->next;
        free_te(te);
        ht->load--;
        return;
    }
    /* otherwise look through, keeping track of prev to reassign its ->next */
    for (; te != NULL; te = te->next)
    {
        if (strcmp(te->key, k) == 0)
        {
            prev->next = te->next;
            free_te(te);
            ht->load--;
            return;
        }
        prev = te;
    }   
}

hashtable_t *ht_delete(hashtable_t *ht, char *k)
{
    size_t i;
    if (lookup(ht, k) == NULL)
        return NULL;
    else
        delete_te(ht, k);

}

/* retrieve value from key */
void *ht_get(hashtable_t *ht, char *k)
{
    tableentry_t *te;
    if ((te = lookup(ht, k)) == NULL)
        return NULL;
    return te->val;
}

/* frees hashtable created from ht_create() */
void ht_free(hashtable_t *ht)
{
    size_t i;
    if (ht)
    {
        for (i = 0; i < ht->size; i++)
            if (ht->tab[i] != NULL)
                free_te_list(ht->tab[i]);
        free(ht);
    }
}

/* resizes hashtable, returns new hashtable and frees old */
static hashtable_t *resize(hashtable_t *oht, size_t size)
{
    hashtable_t *nht; /* new hashtable */
    nht = ht_create(size, oht->type);
    /* rehash */
    size_t i;
    tableentry_t *te;
    /* loop through hashtable */
    for (i = 0; i < oht->size; i++)
        /* loop through linked list */
        for (te = oht->tab[i]; te != NULL; te = te->next)
            /* insert & rehash old vals into new ht */
            if (ht_insert(nht, te->key, te->val) == NULL)
                return NULL;
    ht_free(oht);
    return nht;
}

hash.h

/* a hash-table implementation in c */
/*
hashing algorithm: hashval = *s + 31 * hashval
resolves collisions using linked lists
*/

#ifndef HASH
#define HASH

typedef struct hashtable hashtable_t;

typedef enum datatype {STRING, INTEGER} datatype_t;

/* inserts the key-val pair */
hashtable_t *ht_insert(hashtable_t *ht, char *k, void *v);

/* creates hashtable */
/* NOTE: dynamically allocated, remember to ht_free() */
hashtable_t *ht_create(size_t size, datatype_t type);

/* frees hashtable created from ht_create() */
void ht_free(hashtable_t *ht);

/* retrive value from key */
void *ht_get(hashtable_t *ht, char *k);

hashtable_t *ht_delete(hashtable_t *ht, char *k);

#endif

回答1:

Do not use the hash table as the container for the data; only use it to refer to the data, and you won't have that problem.

For example, let's say you have key-value pairs, using a structure with the actual data in the C99 flexible array member:

struct pair {
    struct pair  *next; /* For hash chaining */
    size_t        hash; /* For the raw key hash */

    /* Payload: */
    size_t        offset; /* value starts at (data + offset) */
    char          data[]; /* key starts at (data) */
};

static inline const char *pair_key(struct pair *ref)
{
    return (const char *)(ref->data);
}

static inline const char *pair_value(struct pair *ref)
{
    return (const char *)(ref->data + ref->offset);
}

Your hash table can then be simply

struct pair_hash_table {
    size_t        size;
    struct pair **entry;
};

If you have struct pair_hash_table *ht, and struct pair *foo with foo->hash containing the hash of the key, then foo should be in the singly-linked list hanging off ht->entry[foo->hash % ht->size];.

Let's say you wish to resize the hash table ht. You choose a new size, and allocate enough memory for that many struct pair *. Then, you go through each singly-linked list in each old hash entry, detaching them from the old list, and prepending them to the lists in correct hash table entries in the new hash table. Then you just free the old hash table entry array, replacing it with the new one:

int resize_pair_hash_table(struct pair_hash_table *ht, const size_t new_size)
{
    struct pair **entry, *curr, *next;
    size_t        i, k;

    if (!ht || new_size < 1)
        return -1; /* Invalid parameters */

    entry = malloc(new_size * sizeof entry[0]);
    if (!entry)
        return -1; /* Out of memory */

    /* Initialize new entry array to empty. */
    for (i = 0; i < new_size; i++)
        entry[i] = NULL;

    for (i = 0; i < ht->size; i++) {

        /* Detach the singly-linked list. */
        next = ht->entry[i];
        ht->entry[i] = NULL;

        while (next) {
            /* Detach the next element, as 'curr' */
            curr = next;
            next = next->next;

            /* k is the index to this hash in the new array */
            k = curr->hash % new_size;

            /* Prepend to the list in the new array */
            curr->next = entry[k];
            entry[k] = curr;
        }
    }

    /* Old array is no longer needed, */
    free(ht->entry);

    /* so replace it with the new one. */
    ht->entry = entry;
    ht->size = size;

    return 0; /* Success */
}

Note that the hash field in struct pair is not modified, nor recalculated.

Having the raw hash (as opposed to modulo table-size), means you can speed up the key search even when different keys use the same slot:

struct pair *find_key(struct pair_hash_table *ht,
                      const char *key, const size_t key_hash)
{
    struct pair *curr = ht->entry[key_hash % ht->size];

    while (curr)
        if (curr->hash == key_hash && !strcmp(key, pair_key(next)))
            return curr;
        else
            curr = curr->next;

    return NULL; /* Not found. */
}

In C, the logical and operator, &&, is short-circuiting. If the left side is not true, the right side is not evaluated at all, because the entire expression can never be true in that case.

Above, this means that the raw hash value of the key is compared, and only when they do match, the actual strings are compared. If your hash algorithm is even halfway good, this means that if the key already exists, typically only one string comparison is done; and if the key does not exist in the table, typically no string comparisons are done.

回答2:

You can deal with them the same way the standard library (C++) deals with this exact problem:

Some operations on containers (e.g. insertion, erasing, resizing) invalidate iterators.

For instance std::unordered_map which is basically a hash table implemented with buckets has these rules:

insertion

unordered_[multi]{set,map}: all iterators invalidated when rehashing occurs, but references unaffected [23.2.5/8]. Rehashing does not occur if the insertion does not cause the container's size to exceed z * B where z is the maximum load factor and B the current number of buckets. [23.2.5/14]

erasure

unordered_[multi]{set,map}: only iterators and references to the erased elements are invalidated [23.2.5/13]

Iterator invalidation rules

The C++ concept of iterators is a generalization of pointers. So this concept can be applied to C.

Your only other alternative is that instead of holding the objects directly into the container you add another level of indirection and hold some sort of proxy. And so the elements always stay at the same position in memory. It's the proxies that move around on resizing/inserting etc. But you need to analize this scenario: are the added double indirection (which will surely affect performance in a negative way) and increase implementation complexity worth it? Is is that important to have persistent pointers?

来源：https://stackoverflow.com/questions/46391702/how-to-deal-with-old-references-to-a-resized-hash-table

标签

hashtable