Nutshell

Thoughts come and go, words stay eternal

09 Sep 2022

[python internal] Implementation of Type Dict

Abstract

  1. using char[] store data, using Open Addressing handle hash collision
  2. hash table will be expanded only there are no valid solt or hash table isn’t combined (dict always combined ?)
  3. deleting item won’t decrease memory usage, only update matching solt with value DKIX_DUMMY
  4. during resizing, all old data will be copy into new hash table, then free old hash table memory usage
  5. in python3, keys() and values() create view object, not list object (python2)

Metrics

Operation Average Case Amortized Worst Case
copy O(n) O(n)
k in d O(1) O(n)
get O(1) O(n)
set O(1) O(n)
delete O(1) O(n)
iteration O(n) O(n)

Source Code

background:

  1. python-3.10.6

Basic Data Structure

Dict Type
// Objects/dictobject.c
//
PyTypeObject PyDict_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "dict",
    sizeof(PyDictObject),
    0,
    (destructor)dict_dealloc,                   /* tp_dealloc */
    0,                                          /* tp_vectorcall_offset */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_as_async */
    (reprfunc)dict_repr,                        /* tp_repr */
    &dict_as_number,                            /* tp_as_number */
    &dict_as_sequence,                          /* tp_as_sequence */
    &dict_as_mapping,                           /* tp_as_mapping */
    PyObject_HashNotImplemented,                /* tp_hash */
    0,                                          /* tp_call */
    0,                                          /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
        Py_TPFLAGS_BASETYPE | Py_TPFLAGS_DICT_SUBCLASS |
        _Py_TPFLAGS_MATCH_SELF | Py_TPFLAGS_MAPPING,  /* tp_flags */
    dictionary_doc,                             /* tp_doc */
    dict_traverse,                              /* tp_traverse */
    dict_tp_clear,                              /* tp_clear */
    dict_richcompare,                           /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    (getiterfunc)dict_iter,                     /* tp_iter */
    0,                                          /* tp_iternext */
    mapp_methods,                               /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    0,                                          /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    dict_init,                                  /* tp_init */
    PyType_GenericAlloc,                        /* tp_alloc */
    dict_new,                                   /* tp_new */
    PyObject_GC_Del,                            /* tp_free */
    .tp_vectorcall = dict_vectorcall,
};
Dict Method Mapping
// Objects/dictobject.c
// 
// object's method mapping, 
// used by bytecode LOAD_METHOD and CALL_METHOD
//
static PyMethodDef mapp_methods[] = {
    DICT___CONTAINS___METHODDEF			// key in dict
    {"__getitem__", (PyCFunction)(void(*)(void))dict_subscript,        METH_O | METH_COEXIST,
     getitem__doc__},
    {"__sizeof__",      (PyCFunction)(void(*)(void))dict_sizeof,       METH_NOARGS,
     sizeof__doc__},
    DICT_GET_METHODDEF                  // dict.get(k)
    DICT_SETDEFAULT_METHODDEF           // dict.setdefault(v)
    DICT_POP_METHODDEF                  // dict.pop(k)
    DICT_POPITEM_METHODDEF				// dict.popitem()
    {"keys",            dictkeys_new,                   METH_NOARGS,
    keys__doc__},						// dict.keys()
    {"items",           dictitems_new,                  METH_NOARGS,
    items__doc__},						// dict.items()
    {"values",          dictvalues_new,                 METH_NOARGS,
    values__doc__},						// dict.values()
    {"update",          (PyCFunction)(void(*)(void))dict_update, METH_VARARGS | METH_KEYWORDS,
     update__doc__},					// dict.update()
    DICT_FROMKEYS_METHODDEF				// dict.fromkeys()
    {"clear",           (PyCFunction)dict_clear,        METH_NOARGS,
     clear__doc__},						// dict.clear()
    {"copy",            (PyCFunction)dict_copy,         METH_NOARGS,
     copy__doc__},						// dict.copy()
    DICT___REVERSED___METHODDEF
    {"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS, PyDoc_STR("See PEP 585")},
    {NULL,              NULL}   /* sentinel */
};

New

Python code:

  • d = dict(d1): create new empty Dictobject, then merge d1 into d, equal call d = dict() then d.update(d1)
  • d = {}: create new dictobject
// Objects/dictobject.c
//
// d = dict()
//
static PyObject *
dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *self;
    PyDictObject *d;

    assert(type != NULL && type->tp_alloc != NULL);
    self = type->tp_alloc(type, 0);
    if (self == NULL)
        return NULL;
    d = (PyDictObject *)self;

    /* The object has been implicitly tracked by tp_alloc */
    if (type == &PyDict_Type) {
        _PyObject_GC_UNTRACK(d);
    }

    d->ma_used = 0;
    d->ma_version_tag = DICT_NEXT_VERSION();
    dictkeys_incref(Py_EMPTY_KEYS);
    d->ma_keys = Py_EMPTY_KEYS;
    d->ma_values = empty_values;
    ASSERT_CONSISTENT(d);
    return self;
}
// Objects/dictobject.c
//
// d = {}
//
PyObject *
_PyDict_NewPresized(Py_ssize_t minused)
{
    const Py_ssize_t max_presize = 128 * 1024;
    Py_ssize_t newsize;
    PyDictKeysObject *new_keys;

    if (minused <= USABLE_FRACTION(PyDict_MINSIZE)) {
        return PyDict_New();
    }
    /* There are no strict guarantee that returned dict can contain minused
     * items without resize.  So we create medium size dict instead of very
     * large dict or MemoryError.
     */
    if (minused > USABLE_FRACTION(max_presize)) {
        newsize = max_presize;
    }
    else {
        newsize = estimate_keysize(minused);
    }

    new_keys = new_keys_object(newsize);
    if (new_keys == NULL)
        return NULL;
    return new_dict(new_keys, NULL);
}


/* Consumes a reference to the keys object */
static PyObject *
new_dict(PyDictKeysObject *keys, PyObject **values)
{
    PyDictObject *mp;
    assert(keys != NULL);
    struct _Py_dict_state *state = get_dict_state();
#ifdef Py_DEBUG
    // new_dict() must not be called after _PyDict_Fini()
    assert(state->numfree != -1);
#endif
    if (state->numfree) {
        mp = state->free_list[--state->numfree];
        assert (mp != NULL);
        assert (Py_IS_TYPE(mp, &PyDict_Type));
        _Py_NewReference((PyObject *)mp);
    }
    else {
        mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
        if (mp == NULL) {
            dictkeys_decref(keys);
            if (values != empty_values) {
                free_values(values);
            }
            return NULL;
        }
    }
    mp->ma_keys = keys;
    mp->ma_values = values;
    mp->ma_used = 0;
    mp->ma_version_tag = DICT_NEXT_VERSION();
    ASSERT_CONSISTENT(mp);
    return (PyObject *)mp;
}
// Objects/dictobject.c
//
// d = dict(d1) or d.__init__(d1)
//
static int
dict_init(PyObject *self, PyObject *args, PyObject *kwds)
{
    return dict_update_common(self, args, kwds, "dict");
}

Set

Python code:

  • d[k] = v: at the end, call PyDict_SetItem
  • d.setitem(k, v): at the end, call PyDict_SetItem

Time to Resize:

  • can’t found valid(unuse) solt for new item
  • hash table is split

How to Handle Hash Collision:

  • Open Addressing
// Objects/dictobject.c
static PyMappingMethods dict_as_mapping = {
    (lenfunc)dict_length, /*mp_length*/
    (binaryfunc)dict_subscript, /*mp_subscript*/
    (objobjargproc)dict_ass_sub, /*mp_ass_subscript*/
};

//
static int
dict_ass_sub(PyDictObject *mp, PyObject *v, PyObject *w)
{
    if (w == NULL)
        return PyDict_DelItem((PyObject *)mp, v);
    else
        return PyDict_SetItem((PyObject *)mp, v, w);
}


/* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
 * dictionary if it's merely replacing the value for an existing key.
 * This means that it's safe to loop over a dictionary with PyDict_Next()
 * and occasionally replace a value -- but you can't insert new keys or
 * remove them.
 */
int
PyDict_SetItem(PyObject *op, PyObject *key, PyObject *value)
{
    PyDictObject *mp;
    Py_hash_t hash;
    if (!PyDict_Check(op)) {
        PyErr_BadInternalCall();
        return -1;
    }
    assert(key);
    assert(value);
    mp = (PyDictObject *)op;
    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1)
    {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }

    if (mp->ma_keys == Py_EMPTY_KEYS) {
        return insert_to_emptydict(mp, key, hash, value);
    }
    /* insertdict() handles any resizing that might be necessary */
    return insertdict(mp, key, hash, value);
}

Get

python code:

  1. d.get(k): if k not exist, return default_value
  2. d[k]: if k not exist, raise KeyError
// Objects/dictobject.c
//
// d.get(k, default_value) or d.get(k)
//
static PyObject *
dict_get(PyDictObject *self, PyObject *const *args, Py_ssize_t nargs)
{
    PyObject *return_value = NULL;
    PyObject *key;
    PyObject *default_value = Py_None;

    if (!_PyArg_CheckPositional("get", nargs, 1, 2)) {
        goto exit;
    }
    key = args[0];
    if (nargs < 2) {
        goto skip_optional;
    }
    default_value = args[1];
skip_optional:
    return_value = dict_get_impl(self, key, default_value);

exit:
    return return_value;
}


/*[clinic input]
dict.get

    key: object
    default: object = None
    /

Return the value for key if key is in the dictionary, else default.
[clinic start generated code]*/

static PyObject *
dict_get_impl(PyDictObject *self, PyObject *key, PyObject *default_value)
/*[clinic end generated code: output=bba707729dee05bf input=279ddb5790b6b107]*/
{
    PyObject *val = NULL;
    Py_hash_t hash;
    Py_ssize_t ix;

    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return NULL;
    }
    ix = (self->ma_keys->dk_lookup) (self, key, hash, &val);
    if (ix == DKIX_ERROR)
        return NULL;
    if (ix == DKIX_EMPTY || val == NULL) {
        val = default_value;
    }
    Py_INCREF(val);
    return val;
}
// Objects/dictobject.c
//
// d[k] equal d.__getitem__(k)
//
static PyObject *
dict_subscript(PyDictObject *mp, PyObject *key)
{
    Py_ssize_t ix;
    Py_hash_t hash;
    PyObject *value;

    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return NULL;
    }
    ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &value);
    if (ix == DKIX_ERROR)
        return NULL;
    if (ix == DKIX_EMPTY || value == NULL) {
        if (!PyDict_CheckExact(mp)) {
            /* Look up __missing__ method if we're a subclass. */
            PyObject *missing, *res;
            _Py_IDENTIFIER(__missing__);
            missing = _PyObject_LookupSpecial((PyObject *)mp, &PyId___missing__);
            if (missing != NULL) {
                res = PyObject_CallOneArg(missing, key);
                Py_DECREF(missing);
                return res;
            }
            else if (PyErr_Occurred())
                return NULL;
        }
        _PyErr_SetKeyError(key);
        return NULL;
    }
    Py_INCREF(value);
    return value;
}

Delete

Python code:

  • del d[k] equal with d.delitem(k), if k not exist, raise KeyError

Notice:

  • delete item won’t resize dict
  • delete item only update the solt into DKIX_DUMMY
// Objects/dictobject.c
//
// del d[k] or d.__delitem__(k)
//
int
PyDict_DelItem(PyObject *op, PyObject *key)
{
    Py_hash_t hash;
    assert(key);
    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }

    return _PyDict_DelItem_KnownHash(op, key, hash);
}

int
_PyDict_DelItem_KnownHash(PyObject *op, PyObject *key, Py_hash_t hash)
{
    Py_ssize_t ix;
    PyDictObject *mp;
    PyObject *old_value;

    if (!PyDict_Check(op)) {
        PyErr_BadInternalCall();
        return -1;
    }
    assert(key);
    assert(hash != -1);
    mp = (PyDictObject *)op;
    ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &old_value);
    if (ix == DKIX_ERROR)
        return -1;
    if (ix == DKIX_EMPTY || old_value == NULL) {
        _PyErr_SetKeyError(key);
        return -1;
    }

    // Split table doesn't allow deletion.  Combine it.
    if (_PyDict_HasSplitTable(mp)) {
        if (dictresize(mp, DK_SIZE(mp->ma_keys))) {
            return -1;
        }
        ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &old_value);
        assert(ix >= 0);
    }

    return delitem_common(mp, hash, ix, old_value);
}


static int
delitem_common(PyDictObject *mp, Py_hash_t hash, Py_ssize_t ix,
               PyObject *old_value)
{
    PyObject *old_key;
    PyDictKeyEntry *ep;

    Py_ssize_t hashpos = lookdict_index(mp->ma_keys, hash, ix);
    assert(hashpos >= 0);

    mp->ma_used--;
    mp->ma_version_tag = DICT_NEXT_VERSION();
    ep = &DK_ENTRIES(mp->ma_keys)[ix];
    dictkeys_set_index(mp->ma_keys, hashpos, DKIX_DUMMY);
    ENSURE_ALLOWS_DELETIONS(mp);
    old_key = ep->me_key;
    ep->me_key = NULL;
    ep->me_value = NULL;
    Py_DECREF(old_key);
    Py_DECREF(old_value);

    ASSERT_CONSISTENT(mp);
    return 0;
}

Resize

Time to Resize:

  • hash table is split
  • new item can’t find valid solt (hash collision)

How:

  • calculate new hash table size
  • allocate new hash table
  • copy all old data in old hash table into new hash table
  • free old hash table
// Objects/dictobject.c
//
//
/*
Restructure the table by allocating a new table and reinserting all
items again.  When entries have been deleted, the new table may
actually be smaller than the old one.
If a table is split (its keys and hashes are shared, its values are not),
then the values are temporarily copied into the table, it is resized as
a combined table, then the me_value slots in the old table are NULLed out.
After resizing a table is always combined,
but can be resplit by make_keys_shared().
*/
static int
dictresize(PyDictObject *mp, Py_ssize_t newsize)
{
    Py_ssize_t numentries;
    PyDictKeysObject *oldkeys;
    PyObject **oldvalues;
    PyDictKeyEntry *oldentries, *newentries;

    if (newsize <= 0) {
        PyErr_NoMemory();
        return -1;
    }
    assert(IS_POWER_OF_2(newsize));
    assert(newsize >= PyDict_MINSIZE);

    oldkeys = mp->ma_keys;

    /* NOTE: Current odict checks mp->ma_keys to detect resize happen.
     * So we can't reuse oldkeys even if oldkeys->dk_size == newsize.
     * TODO: Try reusing oldkeys when reimplement odict.
     */

    /* Allocate a new table. */
    mp->ma_keys = new_keys_object(newsize);
    if (mp->ma_keys == NULL) {
        mp->ma_keys = oldkeys;
        return -1;
    }
    // New table must be large enough.
    assert(mp->ma_keys->dk_usable >= mp->ma_used);
    if (oldkeys->dk_lookup == lookdict)
        mp->ma_keys->dk_lookup = lookdict;

    numentries = mp->ma_used;
    oldentries = DK_ENTRIES(oldkeys);
    newentries = DK_ENTRIES(mp->ma_keys);
    oldvalues = mp->ma_values;
    if (oldvalues != NULL) {
        /* Convert split table into new combined table.
         * We must incref keys; we can transfer values.
         * Note that values of split table is always dense.
         */
        for (Py_ssize_t i = 0; i < numentries; i++) {
            assert(oldvalues[i] != NULL);
            PyDictKeyEntry *ep = &oldentries[i];
            PyObject *key = ep->me_key;
            Py_INCREF(key);
            newentries[i].me_key = key;
            newentries[i].me_hash = ep->me_hash;
            newentries[i].me_value = oldvalues[i];
        }

        dictkeys_decref(oldkeys);
        mp->ma_values = NULL;
        if (oldvalues != empty_values) {
            free_values(oldvalues);
        }
    }
    else {  // combined table.
        if (oldkeys->dk_nentries == numentries) {
            memcpy(newentries, oldentries, numentries * sizeof(PyDictKeyEntry));
        }
        else {
            PyDictKeyEntry *ep = oldentries;
            for (Py_ssize_t i = 0; i < numentries; i++) {
                while (ep->me_value == NULL)
                    ep++;
                newentries[i] = *ep++;
            }
        }

        assert(oldkeys->dk_lookup != lookdict_split);
        assert(oldkeys->dk_refcnt == 1);
#ifdef Py_REF_DEBUG
        _Py_RefTotal--;
#endif
        struct _Py_dict_state *state = get_dict_state();
#ifdef Py_DEBUG
        // dictresize() must not be called after _PyDict_Fini()
        assert(state->keys_numfree != -1);
#endif
        if (oldkeys->dk_size == PyDict_MINSIZE &&
            state->keys_numfree < PyDict_MAXFREELIST)
        {
            state->keys_free_list[state->keys_numfree++] = oldkeys;
        }
        else {
            PyObject_Free(oldkeys);
        }
    }

    build_indices(mp->ma_keys, newentries, numentries);
    mp->ma_keys->dk_usable -= numentries;
    mp->ma_keys->dk_nentries = numentries;
    return 0;
}

d = {}
n = 1000000
used = set()
for k in range(n):
    mem_size = d.__sizeof__()
    d[k] = k
    mem_resize = d.__sizeof__()
    resize_ratio = mem_resize / mem_size
    length = len(d)
    if mem_size in used and mem_resize in used and k != n - 1:
        continue
    used.add(mem_size)
    used.add(mem_resize)
    fs = f"length = {length:-10}, mem_size = {mem_size:-10}, mem_resize = {mem_resize:-10}, resize_ratio = {resize_ratio:-10.5f}"
    print(fs)


### Output ###
length =          1, mem_size =         48, mem_resize =        216, resize_ratio =    4.50000
length =          6, mem_size =        216, mem_resize =        344, resize_ratio =    1.59259
length =         11, mem_size =        344, mem_resize =        624, resize_ratio =    1.81395
length =         22, mem_size =        624, mem_resize =       1160, resize_ratio =    1.85897
length =         43, mem_size =       1160, mem_resize =       2256, resize_ratio =    1.94483
length =         86, mem_size =       2256, mem_resize =       4680, resize_ratio =    2.07447
length =        171, mem_size =       4680, mem_resize =       9296, resize_ratio =    1.98632
length =        342, mem_size =       9296, mem_resize =      18504, resize_ratio =    1.99053
length =        683, mem_size =      18504, mem_resize =      36944, resize_ratio =    1.99654
length =       1366, mem_size =      36944, mem_resize =      73800, resize_ratio =    1.99762
length =       2731, mem_size =      73800, mem_resize =     147536, resize_ratio =    1.99913
length =       5462, mem_size =     147536, mem_resize =     294984, resize_ratio =    1.99940
length =      10923, mem_size =     294984, mem_resize =     589904, resize_ratio =    1.99978
length =      21846, mem_size =     589904, mem_resize =    1310792, resize_ratio =    2.22204
length =      43691, mem_size =    1310792, mem_resize =    2621520, resize_ratio =    1.99995
length =      87382, mem_size =    2621520, mem_resize =    5242952, resize_ratio =    1.99997
length =     174763, mem_size =    5242952, mem_resize =   10485840, resize_ratio =    1.99999
length =     349526, mem_size =   10485840, mem_resize =   20971592, resize_ratio =    1.99999
length =     699051, mem_size =   20971592, mem_resize =   41943120, resize_ratio =    2.00000
length =    1000000, mem_size =   41943120, mem_resize =   41943120, resize_ratio =    1.00000

Clear

// Objects/dictobject.c
//
// clear dict by creating new one
//
void
PyDict_Clear(PyObject *op)
{
    PyDictObject *mp;
    PyDictKeysObject *oldkeys;
    PyObject **oldvalues;
    Py_ssize_t i, n;

    if (!PyDict_Check(op))
        return;
    mp = ((PyDictObject *)op);
    oldkeys = mp->ma_keys;
    oldvalues = mp->ma_values;
    if (oldvalues == empty_values)
        return;
    /* Empty the dict... */
    dictkeys_incref(Py_EMPTY_KEYS);
    mp->ma_keys = Py_EMPTY_KEYS;
    mp->ma_values = empty_values;
    mp->ma_used = 0;
    mp->ma_version_tag = DICT_NEXT_VERSION();
    /* ...then clear the keys and values */
    if (oldvalues != NULL) {
        n = oldkeys->dk_nentries;
        for (i = 0; i < n; i++)
            Py_CLEAR(oldvalues[i]);
        free_values(oldvalues);
        dictkeys_decref(oldkeys);
    }
    else {
       assert(oldkeys->dk_refcnt == 1);
       dictkeys_decref(oldkeys);
    }
    ASSERT_CONSISTENT(mp);
}

Real World Python

Notice:

  • not thread-safe.
  • iterable object that created by dict object can be used after dict size (len(d)) changed.
  • the only way to decrease dict object size (sizeof) is create new one, then copy old into new created.
  • when resizing hash table, memory usage will be double.

Bug Code

>>> d = {1: 1, 2: 2, 3: 3}
>>> for k in d:
...     del d[k]
...
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
RuntimeError: dictionary changed size during iteration
>>>

References

  1. dis - Disassembler for Python bytecode
  2. python - sys.getsizeof()
  3. python - TimeComplexity
  4. python - PerformanceTips
  5. web - Compiler Explorer
  6. Hash_table - Collision_resolution
  7. Hash_table - Open_addressing