[python internal] Implementation of Type Set
Abstract
- Uses an array to store elements and open addressing to resolve hash collisions.
- When resizing, a new table is allocated and all live (non-dummy) elements are copied into it.
- When a new element is added, the set tries to expand if (used + dummy) / total >= 0.6.
- When an element is deleted (for example by remove() or pop()), its slot is only marked as dummy: the memory is not freed and memory usage does not shrink. The exceptions are difference_update() and symmetric_difference_update().
- When there are too many deleted (dummy) elements, there are two ways to reduce memory usage: create a new set with copy(), or call difference_update() or symmetric_difference_update(), which resize the set if dummy > total / 4.
- A valid set size is the smallest value of 8 * (1 << n) that is strictly greater than the required size; the required size is normally used > 50000 ? 2 * used : 4 * used.
- A frozenset can't be changed after it has been created.
- Set uses linear probing combined with randomized (perturbed) probing to search for a key, so the worst case is O(n).
Metrics
Operation | Average Case | Amortized Worst Case |
---|---|---|
add | O(1) | |
clear | O(n) | |
copy | O(n) | |
difference s - t | O(len(s)) | |
difference_update s -= t | O(len(t)) | |
discard | O(1) | |
intersection s&t | O(min(len(s),len(t))) | O(len(s) * len(t)) |
intersection_update s &= t | O(min(len(s),len(t))) | O(len(s) * len(t)) |
isdisjoint - not (s >= t or t <= s) | O(len(t)) | |
issubset s1 <= t | O(len(t)) | |
issuperset s1 >= t | O(len(s)) | |
pop | O(1) | |
remove | O(1) | |
symmetric_difference s^t | O(len(s)) | |
symmetric_difference_update s ^= t | O(len(t)) | |
union s|t | O(len(t)) | |
update | O(len(t)) | |
x in s | O(1) | |
s1 == s2 | O(n) | |
Source Code
background:
- python-3.10.6
Basic Data Structure
PyTypeObject PySet_Type
// Objects/setobject.c
//
/* Type object for the built-in `set`.
   The initializer is positional: every value is matched to its slot purely by
   order, and the trailing comment on each line names that slot.  Entries worth
   noticing here: tp_hash is PyObject_HashNotImplemented, tp_new is set_new,
   tp_init is set_init, and set_as_number / set_as_sequence / set_methods
   supply the operator and method tables. */
PyTypeObject PySet_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
"set", /* tp_name */
sizeof(PySetObject), /* tp_basicsize */
0, /* tp_itemsize */
/* methods */
(destructor)set_dealloc, /* tp_dealloc */
0, /* tp_vectorcall_offset */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_as_async */
(reprfunc)set_repr, /* tp_repr */
&set_as_number, /* tp_as_number */
&set_as_sequence, /* tp_as_sequence */
0, /* tp_as_mapping */
PyObject_HashNotImplemented, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
Py_TPFLAGS_BASETYPE |
_Py_TPFLAGS_MATCH_SELF, /* tp_flags */
set_doc, /* tp_doc */
(traverseproc)set_traverse, /* tp_traverse */
(inquiry)set_clear_internal, /* tp_clear */
(richcmpfunc)set_richcompare, /* tp_richcompare */
offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
(getiterfunc)set_iter, /* tp_iter */
0, /* tp_iternext */
set_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)set_init, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
set_new, /* tp_new */
PyObject_GC_Del, /* tp_free */
/* The only designated initializer: stores set_vectorcall in tp_vectorcall. */
.tp_vectorcall = set_vectorcall,
};
Set Method Mapping
// Objects/setobject.c
//
// object's method mapping,
// used by bytecode LOAD_METHOD and CALL_METHOD
//
/* Method table for set objects.  Each row has the form
   {Python-visible name, C implementation, calling-convention flags, docstring};
   the final {NULL, NULL} row marks the end of the table. */
static PyMethodDef set_methods[] = {
{"add", (PyCFunction)set_add, METH_O,
add_doc},
{"clear", (PyCFunction)set_clear, METH_NOARGS,
clear_doc},
{"__contains__",(PyCFunction)set_direct_contains, METH_O | METH_COEXIST,
contains_doc},
{"copy", (PyCFunction)set_copy, METH_NOARGS,
copy_doc},
{"discard", (PyCFunction)set_discard, METH_O,
discard_doc},
{"difference", (PyCFunction)set_difference_multi, METH_VARARGS,
difference_doc},
{"difference_update", (PyCFunction)set_difference_update, METH_VARARGS,
difference_update_doc},
{"intersection",(PyCFunction)set_intersection_multi, METH_VARARGS,
intersection_doc},
{"intersection_update",(PyCFunction)set_intersection_update_multi, METH_VARARGS,
intersection_update_doc},
{"isdisjoint", (PyCFunction)set_isdisjoint, METH_O,
isdisjoint_doc},
{"issubset", (PyCFunction)set_issubset, METH_O,
issubset_doc},
{"issuperset", (PyCFunction)set_issuperset, METH_O,
issuperset_doc},
{"pop", (PyCFunction)set_pop, METH_NOARGS,
pop_doc},
{"__reduce__", (PyCFunction)set_reduce, METH_NOARGS,
reduce_doc},
{"remove", (PyCFunction)set_remove, METH_O,
remove_doc},
{"__sizeof__", (PyCFunction)set_sizeof, METH_NOARGS,
sizeof_doc},
{"symmetric_difference",(PyCFunction)set_symmetric_difference, METH_O,
symmetric_difference_doc},
{"symmetric_difference_update",(PyCFunction)set_symmetric_difference_update, METH_O,
symmetric_difference_update_doc},
/* test_c_api is only registered when Py_DEBUG is defined. */
#ifdef Py_DEBUG
{"test_c_api", (PyCFunction)test_c_api, METH_NOARGS,
test_c_api_doc},
#endif
{"union", (PyCFunction)set_union, METH_VARARGS,
union_doc},
{"update", (PyCFunction)set_update, METH_VARARGS,
update_doc},
{"__class_getitem__", (PyCFunction)Py_GenericAlias, METH_O|METH_CLASS, PyDoc_STR("See PEP 585")},
{NULL, NULL} /* sentinel */
};
/* Number-protocol slots for set objects.  The initializer is positional, so
   the order of the rows matters.  Only subtract, and, xor, or and their
   in-place counterparts are filled in; every other slot is 0. */
static PyNumberMethods set_as_number = {
0, /*nb_add*/
(binaryfunc)set_sub, /*nb_subtract*/
0, /*nb_multiply*/
0, /*nb_remainder*/
0, /*nb_divmod*/
0, /*nb_power*/
0, /*nb_negative*/
0, /*nb_positive*/
0, /*nb_absolute*/
0, /*nb_bool*/
0, /*nb_invert*/
0, /*nb_lshift*/
0, /*nb_rshift*/
(binaryfunc)set_and, /*nb_and*/
(binaryfunc)set_xor, /*nb_xor*/
(binaryfunc)set_or, /*nb_or*/
0, /*nb_int*/
0, /*nb_reserved*/
0, /*nb_float*/
0, /*nb_inplace_add*/
(binaryfunc)set_isub, /*nb_inplace_subtract*/
0, /*nb_inplace_multiply*/
0, /*nb_inplace_remainder*/
0, /*nb_inplace_power*/
0, /*nb_inplace_lshift*/
0, /*nb_inplace_rshift*/
(binaryfunc)set_iand, /*nb_inplace_and*/
(binaryfunc)set_ixor, /*nb_inplace_xor*/
(binaryfunc)set_ior, /*nb_inplace_or*/
};
New - Create New Set Object
Calls: PySet_Type.tp_new -> set_new -> make_new_set (-> set_update_internal only when an iterable is passed; set_new itself passes NULL)
// Objects/setobject.c
//
// python code: set(iterable_object), for example, set([1,2,3])
// Time: O(n), n = len(iterable_object)
//
/* tp_new slot for set: builds an empty set object of `type`.
   `args` and `kwds` are not looked at here; no iterable is forwarded to
   make_new_set().  Returns the new object, or NULL if make_new_set() fails. */
static PyObject *
set_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *fresh = make_new_set(type, NULL);

    return fresh;
}
...
/* Allocate a set object of `type`, put it into the empty state, and, when
   `iterable` is not NULL, fill it through set_update_internal().
   Returns the new object, or NULL if allocation or the update fails (the
   partially built object is released in the latter case). */
static PyObject *
make_new_set(PyTypeObject *type, PyObject *iterable)
{
    assert(PyType_Check(type));

    PySetObject *so = (PySetObject *)type->tp_alloc(type, 0);
    if (so == NULL) {
        return NULL;
    }

    /* Empty state: no entries, backed by the table embedded in the object. */
    so->table = so->smalltable;
    so->mask = PySet_MINSIZE - 1;
    so->used = 0;
    so->fill = 0;
    so->finger = 0;
    so->hash = -1;
    so->weakreflist = NULL;

    if (iterable == NULL) {
        return (PyObject *)so;
    }

    if (set_update_internal(so, iterable) != 0) {
        Py_DECREF(so);
        return NULL;
    }
    return (PyObject *)so;
}
Add - Add or Update Element to Set
Calls: set_add -> set_add_key -> set_add_entry
Conclusion:
- when (used + dummy) / total >= 0.6, the set tries to resize based on used
- used: the number of keys currently stored, equal to len(s)
- dummy: the number of keys that were added but later removed after the set was constructed
/* Implementation of set.add(key).
   Returns None when set_add_key() succeeds, NULL when it reports failure. */
static PyObject *
set_add(PySetObject *so, PyObject *key)
{
    if (set_add_key(so, key) == 0) {
        Py_RETURN_NONE;
    }
    return NULL;
}
/* Obtain the hash of `key` and hand the pair to set_add_entry().
   Returns -1 if hashing fails, otherwise the result of set_add_entry(). */
static int
set_add_key(PySetObject *so, PyObject *key)
{
    Py_hash_t hash = -1;

    /* For an exact str, read the hash stored in the object; -1 there means
       it is not available and has to be computed like for any other key. */
    if (PyUnicode_CheckExact(key)) {
        hash = ((PyASCIIObject *)key)->hash;
    }
    if (hash == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1) {
            return -1;
        }
    }
    return set_add_entry(so, key, hash);
}
/* Insert `key`, whose hash has already been computed as `hash`, into `so`.
   Returns 0 when the key was stored or was already present and -1 when a key
   comparison reports an error.  When storing the key into a never-used slot
   pushes the table to its load limit, the result of set_table_resize() is
   returned instead. */
static int
set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
setentry *table;
setentry *freeslot;
setentry *entry;
size_t perturb;
size_t mask;
size_t i; /* Unsigned for defined overflow behavior */
int probes;
int cmp;
/* Pre-increment is necessary to prevent arbitrary code in the rich
comparison from deallocating the key just before the insertion. */
Py_INCREF(key);
/* (Re)start the search from the slot selected by the low bits of the hash. */
restart:
mask = so->mask;
i = (size_t)hash & mask;
freeslot = NULL;
perturb = hash;
while (1) {
entry = &so->table[i];
/* Inspect this entry plus LINEAR_PROBES following ones when they all lie
inside the table; otherwise inspect this entry only. */
probes = (i + LINEAR_PROBES <= mask) ? LINEAR_PROBES: 0;
do {
/* Never-used slot: the search ends here, the key is not in the table. */
if (entry->hash == 0 && entry->key == NULL)
goto found_unused_or_dummy;
if (entry->hash == hash) {
/* Same hash: test identity first, then exact-str equality, and only
then fall back to a general rich comparison. */
PyObject *startkey = entry->key;
assert(startkey != dummy);
if (startkey == key)
goto found_active;
if (PyUnicode_CheckExact(startkey)
&& PyUnicode_CheckExact(key)
&& _PyUnicode_EQ(startkey, key))
goto found_active;
/* Remember the table so a change made during the comparison is noticed. */
table = so->table;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp > 0)
goto found_active;
if (cmp < 0)
goto comparison_error;
/* The table or this entry changed while comparing: start over. */
if (table != so->table || entry->key != startkey)
goto restart;
mask = so->mask;
}
else if (entry->hash == -1) {
/* Dummy (deleted) entry: remember it as a reusable slot, keep searching. */
assert (entry->key == dummy);
freeslot = entry;
}
entry++;
} while (probes--);
/* Move to the next probe position, mixing in further bits of the hash. */
perturb >>= PERTURB_SHIFT;
i = (i * 5 + 1 + perturb) & mask;
}
found_unused_or_dummy:
if (freeslot == NULL)
goto found_unused;
/* Reuse the remembered dummy slot: used grows, fill stays unchanged. */
so->used++;
freeslot->key = key;
freeslot->hash = hash;
return 0;
found_unused:
/* A never-used slot is consumed, so both fill and used grow. */
so->fill++;
so->used++;
entry->key = key;
entry->hash = hash;
// Annotation:
// 1. when fill * 5 >= mask * 3, i.e. roughly (used + dummy) / total >= 0.6,
//    the table is resized based on used
if ((size_t)so->fill*5 < mask*3)
return 0;
return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
/* Key already present: drop the reference taken at the top. */
found_active:
Py_DECREF(key);
return 0;
/* Comparison failed: drop the reference taken at the top and report -1. */
comparison_error:
Py_DECREF(key);
return -1;
}
Delete - Remove Element from Set
Calls: set_remove -> set_discard_key -> set_discard_entry -> set_lookkey
The key's slot is marked as dummy; the memory is never freed and the set is never resized.
//
/* Implementation of set.remove(key).
   Returns None when the key was removed.  Returns NULL when the lookup fails
   with an error, or, after _PyErr_SetKeyError(key), when the key is absent.
   If the first attempt fails with TypeError and `key` is itself a set, the
   lookup is retried once with a temporary frozenset built from `key`. */
static PyObject *
set_remove(PySetObject *so, PyObject *key)
{
    int rv = set_discard_key(so, key);

    if (rv < 0) {
        /* Only a TypeError raised for a set-typed key qualifies for a retry. */
        if (!(PySet_Check(key) && PyErr_ExceptionMatches(PyExc_TypeError))) {
            return NULL;
        }
        PyErr_Clear();

        PyObject *tmpkey = make_new_set(&PyFrozenSet_Type, key);
        if (tmpkey == NULL) {
            return NULL;
        }
        rv = set_discard_key(so, tmpkey);
        Py_DECREF(tmpkey);
        if (rv < 0) {
            return NULL;
        }
    }

    if (rv != DISCARD_NOTFOUND) {
        Py_RETURN_NONE;
    }
    _PyErr_SetKeyError(key);
    return NULL;
}
/* Obtain the hash of `key` and hand the pair to set_discard_entry().
   Returns -1 if hashing fails, otherwise the result of set_discard_entry(). */
static int
set_discard_key(PySetObject *so, PyObject *key)
{
    Py_hash_t hash = -1;

    /* For an exact str, read the hash stored in the object; -1 there means
       it is not available and has to be computed like for any other key. */
    if (PyUnicode_CheckExact(key)) {
        hash = ((PyASCIIObject *)key)->hash;
    }
    if (hash == -1) {
        hash = PyObject_Hash(key);
        if (hash == -1) {
            return -1;
        }
    }
    return set_discard_entry(so, key, hash);
}
/* Remove the entry matching `key`/`hash` from `so`, if there is one.
   Returns -1 when set_lookkey() returns NULL, DISCARD_NOTFOUND when the
   located slot holds no key, and DISCARD_FOUND after the slot has been turned
   into a dummy.  The slot itself is kept; only `used` is decremented. */
static int
set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
{
    setentry *slot = set_lookkey(so, key, hash);
    if (slot == NULL) {
        return -1;
    }

    PyObject *removed = slot->key;
    if (removed == NULL) {
        return DISCARD_NOTFOUND;
    }

    /* Mark the slot as deleted before releasing the stored key. */
    slot->key = dummy;
    slot->hash = -1;
    so->used--;
    Py_DECREF(removed);
    return DISCARD_FOUND;
}
Resize - Realloc Memory
Notice:
- used: the number of keys existed, equals with len(s)
- dummy: how many keys were added but later removed after a set was constructed
- when resizing, a new memory area is allocated, the old one is freed, and the old data (excluding dummies) is copied into the new one, but id(s) is kept.
When Resizing Happens:
- expansion:
- (used + dummy) / total >= 0.6 and a new element is added
- shrinking:
- when dummy > total / 4 and difference_update() or symmetric_difference_update() is called
- rare case: add() of a new element triggers a resize to a smaller table; this only happens when there are too many dummy slots and the new element does not land in a dummy slot
/*
Restructure the table by allocating a new table and reinserting all
keys again. When entries have been deleted, the new table may
actually be smaller than the old one.

The new table size is PySet_MINSIZE doubled until it exceeds `minused`.
Returns 0 on success, or -1 after calling PyErr_NoMemory() when the new
table cannot be allocated.
*/
static int
set_table_resize(PySetObject *so, Py_ssize_t minused)
{
setentry *oldtable, *newtable, *entry;
Py_ssize_t oldmask = so->mask;
size_t newmask;
int is_oldtable_malloced;
/* Scratch copy of the small table, used when it is rebuilt in place. */
setentry small_copy[PySet_MINSIZE];
assert(minused >= 0);
/* Find the smallest table size > minused. */
/* XXX speed-up with intrinsics */
size_t newsize = PySet_MINSIZE;
while (newsize <= (size_t)minused) {
newsize <<= 1; // The largest possible value is PY_SSIZE_T_MAX + 1.
}
/* Get space for a new table. */
oldtable = so->table;
assert(oldtable != NULL);
/* Recorded before oldtable may be redirected to small_copy below. */
is_oldtable_malloced = oldtable != so->smalltable;
if (newsize == PySet_MINSIZE) {
/* A large table is shrinking, or we can't get any smaller. */
newtable = so->smalltable;
if (newtable == oldtable) {
if (so->fill == so->used) {
/* No dummies, so no point doing anything. */
return 0;
}
/* We're not going to resize it, but rebuild the
table anyway to purge old dummy entries.
Subtle: This is *necessary* if fill==size,
as set_lookkey needs at least one virgin slot to
terminate failing searches. If fill < size, it's
merely desirable, as dummies slow searches. */
assert(so->fill > so->used);
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
else {
newtable = PyMem_NEW(setentry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
/* Make the set empty, using the new table. */
assert(newtable != oldtable);
memset(newtable, 0, sizeof(setentry) * newsize);
so->mask = newsize - 1;
so->table = newtable;
/* Copy the data over; this is refcount-neutral for active entries;
dummy entries aren't copied over, of course */
newmask = (size_t)so->mask;
if (so->fill == so->used) {
/* No dummies in the old table: every non-NULL key is an active entry. */
for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
if (entry->key != NULL) {
set_insert_clean(newtable, newmask, entry->key, entry->hash);
}
}
} else {
/* Dummies are dropped, so afterwards fill equals used. */
so->fill = so->used;
for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
if (entry->key != NULL && entry->key != dummy) {
set_insert_clean(newtable, newmask, entry->key, entry->hash);
}
}
}
/* Only a heap-allocated old table is freed; the embedded small table and
its stack copy are not. */
if (is_oldtable_malloced)
PyMem_Free(oldtable);
return 0;
}
# Grow a set one element at a time and print a line whenever its memory
# footprint (as reported by __sizeof__) takes a value not seen before,
# together with the element count and the growth ratio at that moment.
d = set()
n = 1000000
used = set()  # footprint values that have already been reported
for k in range(n):
    mem_size = d.__sizeof__()    # footprint before the insertion
    d.add(k)
    mem_resize = d.__sizeof__()  # footprint after the insertion
    resize_ratio = mem_resize / mem_size
    length = len(d)
    # Skip iterations that show nothing new; the final one is always printed.
    if mem_size in used and mem_resize in used and k != n - 1:
        continue
    used.add(mem_size)
    used.add(mem_resize)
    fs = f"length = {length:-10}, mem_size = {mem_size:-10}, mem_resize = {mem_resize:-10}, resize_ratio = {resize_ratio:-10.5f}"
    print(fs)
###### Output ######
length = 1, mem_size = 200, mem_resize = 200, resize_ratio = 1.00000
length = 5, mem_size = 200, mem_resize = 712, resize_ratio = 3.56000
length = 19, mem_size = 712, mem_resize = 2248, resize_ratio = 3.15730
length = 77, mem_size = 2248, mem_resize = 8392, resize_ratio = 3.73310
length = 307, mem_size = 8392, mem_resize = 32968, resize_ratio = 3.92850
length = 1229, mem_size = 32968, mem_resize = 131272, resize_ratio = 3.98180
length = 4915, mem_size = 131272, mem_resize = 524488, resize_ratio = 3.99543
length = 19661, mem_size = 524488, mem_resize = 2097352, resize_ratio = 3.99886
length = 78643, mem_size = 2097352, mem_resize = 4194504, resize_ratio = 1.99990
length = 157286, mem_size = 4194504, mem_resize = 8388808, resize_ratio = 1.99995
length = 314573, mem_size = 8388808, mem_resize = 16777416, resize_ratio = 1.99998
length = 629145, mem_size = 16777416, mem_resize = 33554632, resize_ratio = 1.99999
length = 1000000, mem_size = 33554632, mem_resize = 33554632, resize_ratio = 1.00000
Real World Python
Notice:
- not thread-safe.
- during iteration, the length of set can’t be changed.
Bug Code
>>> for v in s:
...     s.remove(v)
...
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
RuntimeError: Set changed size during iteration
>>>
>>> for v in s:
...     s.add(99)
...
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
RuntimeError: Set changed size during iteration
>>>