[python internal] Implementation of Type Dict
Abstract
- using char[] store data, using Open Addressing handle hash collision
- hash table will be expanded only there are no valid solt or hash table isn’t combined (dict always combined ?)
- deleting item won’t decrease memory usage, only update matching solt with value DKIX_DUMMY
- during resizing, all old data will be copy into new hash table, then free old hash table memory usage
- in python3, keys() and values() create view object, not list object (python2)
Metrics
Operation | Average Case | Amortized Worst Case |
---|---|---|
copy | O(n) | O(n) |
k in d | O(1) | O(n) |
get | O(1) | O(n) |
set | O(1) | O(n) |
delete | O(1) | O(n) |
iteration | O(n) | O(n) |
Source Code
background:
- python-3.10.6
Basic Data Structure
Dict Type
// Objects/dictobject.c
//
PyTypeObject PyDict_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"dict",
sizeof(PyDictObject),
0,
(destructor)dict_dealloc, /* tp_dealloc */
0, /* tp_vectorcall_offset */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_as_async */
(reprfunc)dict_repr, /* tp_repr */
&dict_as_number, /* tp_as_number */
&dict_as_sequence, /* tp_as_sequence */
&dict_as_mapping, /* tp_as_mapping */
PyObject_HashNotImplemented, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE | Py_TPFLAGS_DICT_SUBCLASS |
_Py_TPFLAGS_MATCH_SELF | Py_TPFLAGS_MAPPING, /* tp_flags */
dictionary_doc, /* tp_doc */
dict_traverse, /* tp_traverse */
dict_tp_clear, /* tp_clear */
dict_richcompare, /* tp_richcompare */
0, /* tp_weaklistoffset */
(getiterfunc)dict_iter, /* tp_iter */
0, /* tp_iternext */
mapp_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
dict_init, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
dict_new, /* tp_new */
PyObject_GC_Del, /* tp_free */
.tp_vectorcall = dict_vectorcall,
};
Dict Method Mapping
// Objects/dictobject.c
//
// object's method mapping,
// used by bytecode LOAD_METHOD and CALL_METHOD
//
static PyMethodDef mapp_methods[] = {
DICT___CONTAINS___METHODDEF // key in dict
{"__getitem__", (PyCFunction)(void(*)(void))dict_subscript, METH_O | METH_COEXIST,
getitem__doc__},
{"__sizeof__", (PyCFunction)(void(*)(void))dict_sizeof, METH_NOARGS,
sizeof__doc__},
DICT_GET_METHODDEF // dict.get(k)
DICT_SETDEFAULT_METHODDEF // dict.setdefault(v)
DICT_POP_METHODDEF // dict.pop(k)
DICT_POPITEM_METHODDEF // dict.popitem()
{"keys", dictkeys_new, METH_NOARGS,
keys__doc__}, // dict.keys()
{"items", dictitems_new, METH_NOARGS,
items__doc__}, // dict.items()
{"values", dictvalues_new, METH_NOARGS,
values__doc__}, // dict.values()
{"update", (PyCFunction)(void(*)(void))dict_update, METH_VARARGS | METH_KEYWORDS,
update__doc__}, // dict.update()
DICT_FROMKEYS_METHODDEF // dict.fromkeys()
{"clear", (PyCFunction)dict_clear, METH_NOARGS,
clear__doc__}, // dict.clear()
{"copy", (PyCFunction)dict_copy, METH_NOARGS,
copy__doc__}, // dict.copy()
DICT___REVERSED___METHODDEF
{"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS, PyDoc_STR("See PEP 585")},
{NULL, NULL} /* sentinel */
};
New
Python code:
- d = dict(d1): create new empty Dictobject, then merge d1 into d, equal call d = dict() then d.update(d1)
- d = {}: create new dictobject
// Objects/dictobject.c
//
// d = dict()
//
static PyObject *
dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
PyObject *self;
PyDictObject *d;
assert(type != NULL && type->tp_alloc != NULL);
self = type->tp_alloc(type, 0);
if (self == NULL)
return NULL;
d = (PyDictObject *)self;
/* The object has been implicitly tracked by tp_alloc */
if (type == &PyDict_Type) {
_PyObject_GC_UNTRACK(d);
}
d->ma_used = 0;
d->ma_version_tag = DICT_NEXT_VERSION();
dictkeys_incref(Py_EMPTY_KEYS);
d->ma_keys = Py_EMPTY_KEYS;
d->ma_values = empty_values;
ASSERT_CONSISTENT(d);
return self;
}
// Objects/dictobject.c
//
// d = {}
//
PyObject *
_PyDict_NewPresized(Py_ssize_t minused)
{
const Py_ssize_t max_presize = 128 * 1024;
Py_ssize_t newsize;
PyDictKeysObject *new_keys;
if (minused <= USABLE_FRACTION(PyDict_MINSIZE)) {
return PyDict_New();
}
/* There are no strict guarantee that returned dict can contain minused
* items without resize. So we create medium size dict instead of very
* large dict or MemoryError.
*/
if (minused > USABLE_FRACTION(max_presize)) {
newsize = max_presize;
}
else {
newsize = estimate_keysize(minused);
}
new_keys = new_keys_object(newsize);
if (new_keys == NULL)
return NULL;
return new_dict(new_keys, NULL);
}
/* Consumes a reference to the keys object */
static PyObject *
new_dict(PyDictKeysObject *keys, PyObject **values)
{
PyDictObject *mp;
assert(keys != NULL);
struct _Py_dict_state *state = get_dict_state();
#ifdef Py_DEBUG
// new_dict() must not be called after _PyDict_Fini()
assert(state->numfree != -1);
#endif
if (state->numfree) {
mp = state->free_list[--state->numfree];
assert (mp != NULL);
assert (Py_IS_TYPE(mp, &PyDict_Type));
_Py_NewReference((PyObject *)mp);
}
else {
mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
if (mp == NULL) {
dictkeys_decref(keys);
if (values != empty_values) {
free_values(values);
}
return NULL;
}
}
mp->ma_keys = keys;
mp->ma_values = values;
mp->ma_used = 0;
mp->ma_version_tag = DICT_NEXT_VERSION();
ASSERT_CONSISTENT(mp);
return (PyObject *)mp;
}
// Objects/dictobject.c
//
// d = dict(d1) or d.__init__(d1)
//
static int
dict_init(PyObject *self, PyObject *args, PyObject *kwds)
{
return dict_update_common(self, args, kwds, "dict");
}
Set
Python code:
- d[k] = v: at the end, call PyDict_SetItem
- d.setitem(k, v): at the end, call PyDict_SetItem
Time to Resize:
- can’t found valid(unuse) solt for new item
- hash table is split
How to Handle Hash Collision:
- Open Addressing
// Objects/dictobject.c
static PyMappingMethods dict_as_mapping = {
(lenfunc)dict_length, /*mp_length*/
(binaryfunc)dict_subscript, /*mp_subscript*/
(objobjargproc)dict_ass_sub, /*mp_ass_subscript*/
};
//
static int
dict_ass_sub(PyDictObject *mp, PyObject *v, PyObject *w)
{
if (w == NULL)
return PyDict_DelItem((PyObject *)mp, v);
else
return PyDict_SetItem((PyObject *)mp, v, w);
}
/* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
* dictionary if it's merely replacing the value for an existing key.
* This means that it's safe to loop over a dictionary with PyDict_Next()
* and occasionally replace a value -- but you can't insert new keys or
* remove them.
*/
int
PyDict_SetItem(PyObject *op, PyObject *key, PyObject *value)
{
PyDictObject *mp;
Py_hash_t hash;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
assert(value);
mp = (PyDictObject *)op;
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1)
{
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
if (mp->ma_keys == Py_EMPTY_KEYS) {
return insert_to_emptydict(mp, key, hash, value);
}
/* insertdict() handles any resizing that might be necessary */
return insertdict(mp, key, hash, value);
}
Get
python code:
- d.get(k): if k not exist, return default_value
- d[k]: if k not exist, raise KeyError
// Objects/dictobject.c
//
// d.get(k, default_value) or d.get(k)
//
static PyObject *
dict_get(PyDictObject *self, PyObject *const *args, Py_ssize_t nargs)
{
PyObject *return_value = NULL;
PyObject *key;
PyObject *default_value = Py_None;
if (!_PyArg_CheckPositional("get", nargs, 1, 2)) {
goto exit;
}
key = args[0];
if (nargs < 2) {
goto skip_optional;
}
default_value = args[1];
skip_optional:
return_value = dict_get_impl(self, key, default_value);
exit:
return return_value;
}
/*[clinic input]
dict.get
key: object
default: object = None
/
Return the value for key if key is in the dictionary, else default.
[clinic start generated code]*/
static PyObject *
dict_get_impl(PyDictObject *self, PyObject *key, PyObject *default_value)
/*[clinic end generated code: output=bba707729dee05bf input=279ddb5790b6b107]*/
{
PyObject *val = NULL;
Py_hash_t hash;
Py_ssize_t ix;
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return NULL;
}
ix = (self->ma_keys->dk_lookup) (self, key, hash, &val);
if (ix == DKIX_ERROR)
return NULL;
if (ix == DKIX_EMPTY || val == NULL) {
val = default_value;
}
Py_INCREF(val);
return val;
}
// Objects/dictobject.c
//
// d[k] equal d.__getitem__(k)
//
static PyObject *
dict_subscript(PyDictObject *mp, PyObject *key)
{
Py_ssize_t ix;
Py_hash_t hash;
PyObject *value;
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return NULL;
}
ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &value);
if (ix == DKIX_ERROR)
return NULL;
if (ix == DKIX_EMPTY || value == NULL) {
if (!PyDict_CheckExact(mp)) {
/* Look up __missing__ method if we're a subclass. */
PyObject *missing, *res;
_Py_IDENTIFIER(__missing__);
missing = _PyObject_LookupSpecial((PyObject *)mp, &PyId___missing__);
if (missing != NULL) {
res = PyObject_CallOneArg(missing, key);
Py_DECREF(missing);
return res;
}
else if (PyErr_Occurred())
return NULL;
}
_PyErr_SetKeyError(key);
return NULL;
}
Py_INCREF(value);
return value;
}
Delete
Python code:
- del d[k] equal with d.delitem(k), if k not exist, raise KeyError
Notice:
- delete item won’t resize dict
- delete item only update the solt into DKIX_DUMMY
// Objects/dictobject.c
//
// del d[k] or d.__delitem__(k)
//
int
PyDict_DelItem(PyObject *op, PyObject *key)
{
Py_hash_t hash;
assert(key);
if (!PyUnicode_CheckExact(key) ||
(hash = ((PyASCIIObject *) key)->hash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
return _PyDict_DelItem_KnownHash(op, key, hash);
}
int
_PyDict_DelItem_KnownHash(PyObject *op, PyObject *key, Py_hash_t hash)
{
Py_ssize_t ix;
PyDictObject *mp;
PyObject *old_value;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
assert(hash != -1);
mp = (PyDictObject *)op;
ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &old_value);
if (ix == DKIX_ERROR)
return -1;
if (ix == DKIX_EMPTY || old_value == NULL) {
_PyErr_SetKeyError(key);
return -1;
}
// Split table doesn't allow deletion. Combine it.
if (_PyDict_HasSplitTable(mp)) {
if (dictresize(mp, DK_SIZE(mp->ma_keys))) {
return -1;
}
ix = (mp->ma_keys->dk_lookup)(mp, key, hash, &old_value);
assert(ix >= 0);
}
return delitem_common(mp, hash, ix, old_value);
}
static int
delitem_common(PyDictObject *mp, Py_hash_t hash, Py_ssize_t ix,
PyObject *old_value)
{
PyObject *old_key;
PyDictKeyEntry *ep;
Py_ssize_t hashpos = lookdict_index(mp->ma_keys, hash, ix);
assert(hashpos >= 0);
mp->ma_used--;
mp->ma_version_tag = DICT_NEXT_VERSION();
ep = &DK_ENTRIES(mp->ma_keys)[ix];
dictkeys_set_index(mp->ma_keys, hashpos, DKIX_DUMMY);
ENSURE_ALLOWS_DELETIONS(mp);
old_key = ep->me_key;
ep->me_key = NULL;
ep->me_value = NULL;
Py_DECREF(old_key);
Py_DECREF(old_value);
ASSERT_CONSISTENT(mp);
return 0;
}
Resize
Time to Resize:
- hash table is split
- new item can’t find valid solt (hash collision)
How:
- calculate new hash table size
- allocate new hash table
- copy all old data in old hash table into new hash table
- free old hash table
// Objects/dictobject.c
//
//
/*
Restructure the table by allocating a new table and reinserting all
items again. When entries have been deleted, the new table may
actually be smaller than the old one.
If a table is split (its keys and hashes are shared, its values are not),
then the values are temporarily copied into the table, it is resized as
a combined table, then the me_value slots in the old table are NULLed out.
After resizing a table is always combined,
but can be resplit by make_keys_shared().
*/
static int
dictresize(PyDictObject *mp, Py_ssize_t newsize)
{
Py_ssize_t numentries;
PyDictKeysObject *oldkeys;
PyObject **oldvalues;
PyDictKeyEntry *oldentries, *newentries;
if (newsize <= 0) {
PyErr_NoMemory();
return -1;
}
assert(IS_POWER_OF_2(newsize));
assert(newsize >= PyDict_MINSIZE);
oldkeys = mp->ma_keys;
/* NOTE: Current odict checks mp->ma_keys to detect resize happen.
* So we can't reuse oldkeys even if oldkeys->dk_size == newsize.
* TODO: Try reusing oldkeys when reimplement odict.
*/
/* Allocate a new table. */
mp->ma_keys = new_keys_object(newsize);
if (mp->ma_keys == NULL) {
mp->ma_keys = oldkeys;
return -1;
}
// New table must be large enough.
assert(mp->ma_keys->dk_usable >= mp->ma_used);
if (oldkeys->dk_lookup == lookdict)
mp->ma_keys->dk_lookup = lookdict;
numentries = mp->ma_used;
oldentries = DK_ENTRIES(oldkeys);
newentries = DK_ENTRIES(mp->ma_keys);
oldvalues = mp->ma_values;
if (oldvalues != NULL) {
/* Convert split table into new combined table.
* We must incref keys; we can transfer values.
* Note that values of split table is always dense.
*/
for (Py_ssize_t i = 0; i < numentries; i++) {
assert(oldvalues[i] != NULL);
PyDictKeyEntry *ep = &oldentries[i];
PyObject *key = ep->me_key;
Py_INCREF(key);
newentries[i].me_key = key;
newentries[i].me_hash = ep->me_hash;
newentries[i].me_value = oldvalues[i];
}
dictkeys_decref(oldkeys);
mp->ma_values = NULL;
if (oldvalues != empty_values) {
free_values(oldvalues);
}
}
else { // combined table.
if (oldkeys->dk_nentries == numentries) {
memcpy(newentries, oldentries, numentries * sizeof(PyDictKeyEntry));
}
else {
PyDictKeyEntry *ep = oldentries;
for (Py_ssize_t i = 0; i < numentries; i++) {
while (ep->me_value == NULL)
ep++;
newentries[i] = *ep++;
}
}
assert(oldkeys->dk_lookup != lookdict_split);
assert(oldkeys->dk_refcnt == 1);
#ifdef Py_REF_DEBUG
_Py_RefTotal--;
#endif
struct _Py_dict_state *state = get_dict_state();
#ifdef Py_DEBUG
// dictresize() must not be called after _PyDict_Fini()
assert(state->keys_numfree != -1);
#endif
if (oldkeys->dk_size == PyDict_MINSIZE &&
state->keys_numfree < PyDict_MAXFREELIST)
{
state->keys_free_list[state->keys_numfree++] = oldkeys;
}
else {
PyObject_Free(oldkeys);
}
}
build_indices(mp->ma_keys, newentries, numentries);
mp->ma_keys->dk_usable -= numentries;
mp->ma_keys->dk_nentries = numentries;
return 0;
}
d = {}
n = 1000000
used = set()
for k in range(n):
mem_size = d.__sizeof__()
d[k] = k
mem_resize = d.__sizeof__()
resize_ratio = mem_resize / mem_size
length = len(d)
if mem_size in used and mem_resize in used and k != n - 1:
continue
used.add(mem_size)
used.add(mem_resize)
fs = f"length = {length:-10}, mem_size = {mem_size:-10}, mem_resize = {mem_resize:-10}, resize_ratio = {resize_ratio:-10.5f}"
print(fs)
### Output ###
length = 1, mem_size = 48, mem_resize = 216, resize_ratio = 4.50000
length = 6, mem_size = 216, mem_resize = 344, resize_ratio = 1.59259
length = 11, mem_size = 344, mem_resize = 624, resize_ratio = 1.81395
length = 22, mem_size = 624, mem_resize = 1160, resize_ratio = 1.85897
length = 43, mem_size = 1160, mem_resize = 2256, resize_ratio = 1.94483
length = 86, mem_size = 2256, mem_resize = 4680, resize_ratio = 2.07447
length = 171, mem_size = 4680, mem_resize = 9296, resize_ratio = 1.98632
length = 342, mem_size = 9296, mem_resize = 18504, resize_ratio = 1.99053
length = 683, mem_size = 18504, mem_resize = 36944, resize_ratio = 1.99654
length = 1366, mem_size = 36944, mem_resize = 73800, resize_ratio = 1.99762
length = 2731, mem_size = 73800, mem_resize = 147536, resize_ratio = 1.99913
length = 5462, mem_size = 147536, mem_resize = 294984, resize_ratio = 1.99940
length = 10923, mem_size = 294984, mem_resize = 589904, resize_ratio = 1.99978
length = 21846, mem_size = 589904, mem_resize = 1310792, resize_ratio = 2.22204
length = 43691, mem_size = 1310792, mem_resize = 2621520, resize_ratio = 1.99995
length = 87382, mem_size = 2621520, mem_resize = 5242952, resize_ratio = 1.99997
length = 174763, mem_size = 5242952, mem_resize = 10485840, resize_ratio = 1.99999
length = 349526, mem_size = 10485840, mem_resize = 20971592, resize_ratio = 1.99999
length = 699051, mem_size = 20971592, mem_resize = 41943120, resize_ratio = 2.00000
length = 1000000, mem_size = 41943120, mem_resize = 41943120, resize_ratio = 1.00000
Clear
// Objects/dictobject.c
//
// clear dict by creating new one
//
void
PyDict_Clear(PyObject *op)
{
PyDictObject *mp;
PyDictKeysObject *oldkeys;
PyObject **oldvalues;
Py_ssize_t i, n;
if (!PyDict_Check(op))
return;
mp = ((PyDictObject *)op);
oldkeys = mp->ma_keys;
oldvalues = mp->ma_values;
if (oldvalues == empty_values)
return;
/* Empty the dict... */
dictkeys_incref(Py_EMPTY_KEYS);
mp->ma_keys = Py_EMPTY_KEYS;
mp->ma_values = empty_values;
mp->ma_used = 0;
mp->ma_version_tag = DICT_NEXT_VERSION();
/* ...then clear the keys and values */
if (oldvalues != NULL) {
n = oldkeys->dk_nentries;
for (i = 0; i < n; i++)
Py_CLEAR(oldvalues[i]);
free_values(oldvalues);
dictkeys_decref(oldkeys);
}
else {
assert(oldkeys->dk_refcnt == 1);
dictkeys_decref(oldkeys);
}
ASSERT_CONSISTENT(mp);
}
Real World Python
Notice:
- not thread-safe.
- iterable object that created by dict object can be used after dict size (len(d)) changed.
- the only way to decrease dict object size (sizeof) is create new one, then copy old into new created.
- when resizing hash table, memory usage will be double.
Bug Code
>>> d = {1: 1, 2: 2, 3: 3}
>>> for k in d:
... del d[k]
...
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
RuntimeError: dictionary changed size during iteration
>>>