Nutshell

Thoughts come and go, words stay eternal

03 Sep 2022

[python internal] From Python to Bytecode until C

Abstract

  1. Use module dis to get the minimize assembly code.
  2. Found the entry from Python/ceval.c with the bytecode that disassembled before.
  3. Now the door is open and the road is at your feet.

Python -> Bytecode -> C

Vesion: Python-3.10.6

1. Example - Python

>>> def f():
...     d = {}
...     d[1] = 1
...

2. Python -> Bytecode

>>> import dis
>>>
>>> def f():
...     d = {}
...     d[1] = 1
...
>>>
>>> dis.dis(f)
  2           0 BUILD_MAP                0
              2 STORE_FAST               0 (d)

  3           4 LOAD_CONST               1 (1)
              6 LOAD_FAST                0 (d)
              8 LOAD_CONST               1 (1)
             10 STORE_SUBSCR
             12 LOAD_CONST               0 (None)
             14 RETURN_VALUE
>>>

3. Bytecode -> C

3.0 bytecode entry function
// All bytecode's entries are in Python/ceval.c
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);

	...
	
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

3.1 BUILD_MAP

bytecode: BUILD_MAP 0
python: d = {}

// Python/ceval.c
//
// opcode = BUILD_MAP
// oparg = 0
//
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);
	...

        switch (opcode) {
		...

        case TARGET(BUILD_MAP): {
            Py_ssize_t i;
            PyObject *map = _PyDict_NewPresized((Py_ssize_t)oparg);
            if (map == NULL)
                goto error;
            for (i = oparg; i > 0; i--) {
                int err;
                PyObject *key = PEEK(2*i);
                PyObject *value = PEEK(2*i - 1);
                err = PyDict_SetItem(map, key, value);
                if (err != 0) {
                    Py_DECREF(map);
                    goto error;
                }
            }

            while (oparg--) {
                Py_DECREF(POP());
                Py_DECREF(POP());
            }
            PUSH(map);
            DISPATCH();
        }
		
		...
		}
	...
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

// Objects/dictobject.c
//
// create a dict object
//
PyObject *
_PyDict_NewPresized(Py_ssize_t minused)
{
    const Py_ssize_t max_presize = 128 * 1024;
    Py_ssize_t newsize;
    PyDictKeysObject *new_keys;

    if (minused <= USABLE_FRACTION(PyDict_MINSIZE)) {
        return PyDict_New();
    }
    /* There are no strict guarantee that returned dict can contain minused
     * items without resize.  So we create medium size dict instead of very
     * large dict or MemoryError.
     */
    if (minused > USABLE_FRACTION(max_presize)) {
        newsize = max_presize;
    }
    else {
        newsize = estimate_keysize(minused);
    }

    new_keys = new_keys_object(newsize);
    if (new_keys == NULL)
        return NULL;
    return new_dict(new_keys, NULL);
}

3.2 STORE_FAST

bytecode: STORE_FAST 0 (d)

// Python/ceval.c
//
// give a variable name for the new created object
//
//
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);
	...

        switch (opcode) {
		...

        case TARGET(STORE_FAST): {
            PREDICTED(STORE_FAST);
            PyObject *value = POP();
            SETLOCAL(oparg, value);
            DISPATCH();
        }
		
		...
		}
	...
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

3.3 LOAD_CONST

bytecode: LOAD_CONST 1 (1)

// Python/ceval.c
//
// Pushes `co_consts[consti]` onto the stack.
//
//
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);
	...

        switch (opcode) {
		...

        case TARGET(LOAD_CONST): {
            PREDICTED(LOAD_CONST);
            PyObject *value = GETITEM(consts, oparg);
            Py_INCREF(value);
            PUSH(value);
            DISPATCH();
        }
		
		...
		}
	...
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

3.4 LOAD_FAST

bytecode: LOAD_FAST 0 (d)

// Python/ceval.c
//
// Pushes a reference to the local `co_varnames[var_num]` onto the stack.
//
//
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);
	...

        switch (opcode) {
		...

        case TARGET(LOAD_FAST): {
            PyObject *value = GETLOCAL(oparg);
            if (value == NULL) {
                format_exc_check_arg(tstate, PyExc_UnboundLocalError,
                                     UNBOUNDLOCAL_ERROR_MSG,
                                     PyTuple_GetItem(co->co_varnames, oparg));
                goto error;
            }
            Py_INCREF(value);
            PUSH(value);
            DISPATCH();
        }
		
		...
		}
	...
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

3.5 STORE_SUBSCR

bytecode: STORE_SUBSCR

  3           4 LOAD_CONST               1 (1) // TOS = 1
              6 LOAD_FAST                0 (d) // TOS1 = d
              8 LOAD_CONST               1 (1) // TOS2 = 1
             10 STORE_SUBSCR                   // TOS1[TOS] = TOS2 => d[1] = 1

Chain: _PyEval_EvalFrameDefault -> PyObject_SetItem -> dict_ass_sub -> PyDict_SetItem -> insertdict

// Python/ceval.c
//
// Implements `TOS1[TOS] = TOS2`.
//
//
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    _Py_EnsureTstateNotNULL(tstate);
	...

        switch (opcode) {
		...

        case TARGET(STORE_SUBSCR): {
            PyObject *sub = TOP();
            PyObject *container = SECOND();
            PyObject *v = THIRD();
            int err;
            STACK_SHRINK(3);
            /* container[sub] = v */
            err = PyObject_SetItem(container, sub, v);
            Py_DECREF(v);
            Py_DECREF(container);
            Py_DECREF(sub);
            if (err != 0)
                goto error;
            DISPATCH();
        }
		
		...
		}
	...
    return _Py_CheckFunctionResult(tstate, NULL, retval, __func__);
}

// Objects/abstract.c
//
// abstract function implement SetItem, container[k] = v
// type(container) is dict or list
//
int
PyObject_SetItem(PyObject *o, PyObject *key, PyObject *value)
{
    if (o == NULL || key == NULL || value == NULL) {
        null_error();
        return -1;
    }

	// Annotation:
	// 1. check object is mapping object or not, like dict()
    PyMappingMethods *m = Py_TYPE(o)->tp_as_mapping;
    if (m && m->mp_ass_subscript) {
        int res = m->mp_ass_subscript(o, key, value);
        assert(_Py_CheckSlotResult(o, "__setitem__", res >= 0));
        return res;
    }

	// Annotation:
	// 1. check object is sequence object or not, like list()
    if (Py_TYPE(o)->tp_as_sequence) {
        if (_PyIndex_Check(key)) {
            Py_ssize_t key_value;
            key_value = PyNumber_AsSsize_t(key, PyExc_IndexError);
            if (key_value == -1 && PyErr_Occurred())
                return -1;
            return PySequence_SetItem(o, key_value, value);
        }
        else if (Py_TYPE(o)->tp_as_sequence->sq_ass_item) {
            type_error("sequence index must be "
                       "integer, not '%.200s'", key);
            return -1;
        }
    }

    type_error("'%.200s' object does not support item assignment", o);
    return -1;
}

// include/object.h
// define all attrs that the typeObject supported
//
/* PyTypeObject structure is defined in cpython/object.h.
   In Py_LIMITED_API, PyTypeObject is an opaque structure. */
typedef struct _typeobject PyTypeObject;
// include/cpython/object.h
// If this structure is modified, Doc/includes/typestruct.h should be updated
// as well.
struct _typeobject {
    PyObject_VAR_HEAD
    const char *tp_name; /* For printing, in format "<module>.<name>" */
    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */

    /* Methods to implement standard operations */

    destructor tp_dealloc;
    Py_ssize_t tp_vectorcall_offset;
    getattrfunc tp_getattr;
    setattrfunc tp_setattr;
    PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2)
                                    or tp_reserved (Python 3) */
    reprfunc tp_repr;

    /* Method suites for standard classes */

    PyNumberMethods *tp_as_number;
    PySequenceMethods *tp_as_sequence;
    PyMappingMethods *tp_as_mapping;

    /* More standard operations (here for binary compatibility) */

    hashfunc tp_hash;
    ternaryfunc tp_call;
    reprfunc tp_str;
    getattrofunc tp_getattro;
    setattrofunc tp_setattro;

    /* Functions to access object as input/output buffer */
    PyBufferProcs *tp_as_buffer;

    /* Flags to define presence of optional/expanded features */
    unsigned long tp_flags;

    const char *tp_doc; /* Documentation string */

    /* Assigned meaning in release 2.0 */
    /* call function for all accessible objects */
    traverseproc tp_traverse;

    /* delete references to contained objects */
    inquiry tp_clear;

    /* Assigned meaning in release 2.1 */
    /* rich comparisons */
    richcmpfunc tp_richcompare;

    /* weak reference enabler */
    Py_ssize_t tp_weaklistoffset;

    /* Iterators */
    getiterfunc tp_iter;
    iternextfunc tp_iternext;

    /* Attribute descriptor and subclassing stuff */
    struct PyMethodDef *tp_methods; // object's supported methods, part of dir(o) show
    struct PyMemberDef *tp_members;
    struct PyGetSetDef *tp_getset;
    // Strong reference on a heap type, borrowed reference on a static type
    struct _typeobject *tp_base;
    PyObject *tp_dict;
    descrgetfunc tp_descr_get;
    descrsetfunc tp_descr_set;
    Py_ssize_t tp_dictoffset;
    initproc tp_init;
    allocfunc tp_alloc;
    newfunc tp_new;
    freefunc tp_free; /* Low-level free-memory routine */
    inquiry tp_is_gc; /* For PyObject_IS_GC */
    PyObject *tp_bases;
    PyObject *tp_mro; /* method resolution order */
    PyObject *tp_cache;
    PyObject *tp_subclasses;
    PyObject *tp_weaklist;
    destructor tp_del;

    /* Type attribute cache version tag. Added in version 2.6 */
    unsigned int tp_version_tag;

    destructor tp_finalize;
    vectorcallfunc tp_vectorcall;
};

// Include/cpython/object.h
typedef struct {
    lenfunc mp_length;
    binaryfunc mp_subscript;
    objobjargproc mp_ass_subscript;
} PyMappingMethods;

// Objects/dictobject.c
//

// math rule: PyTypeObject->tp_as_mapping != NULL
PyTypeObject PyDict_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "dict",
	
	...
	
    &dict_as_mapping,                           /* tp_as_mapping */

	...
};

// match rule: PyMappingMethods->mp_ass_subscript(PyDictObject *mp, PyObject *v, PyObject *w)
static PyMappingMethods dict_as_mapping = {
    (lenfunc)dict_length, /*mp_length*/
    (binaryfunc)dict_subscript, /*mp_subscript*/
    (objobjargproc)dict_ass_sub, /*mp_ass_subscript*/
};


// DELETE_SUBSCR also using this function, when "w" is NULL, it means delete key
// when "w" is NOT NULL, it means set key
static int
dict_ass_sub(PyDictObject *mp, PyObject *v, PyObject *w)
{
    if (w == NULL)
        return PyDict_DelItem((PyObject *)mp, v);
    else
        return PyDict_SetItem((PyObject *)mp, v, w);
}

/* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
 * dictionary if it's merely replacing the value for an existing key.
 * This means that it's safe to loop over a dictionary with PyDict_Next()
 * and occasionally replace a value -- but you can't insert new keys or
 * remove them.
 */
int
PyDict_SetItem(PyObject *op, PyObject *key, PyObject *value)
{
    PyDictObject *mp;
    Py_hash_t hash;
    if (!PyDict_Check(op)) {
        PyErr_BadInternalCall();
        return -1;
    }
    assert(key);
    assert(value);
    mp = (PyDictObject *)op;
    if (!PyUnicode_CheckExact(key) ||
        (hash = ((PyASCIIObject *) key)->hash) == -1)
    {
        hash = PyObject_Hash(key);
        if (hash == -1)
            return -1;
    }

    if (mp->ma_keys == Py_EMPTY_KEYS) {
        return insert_to_emptydict(mp, key, hash, value);
    }
    /* insertdict() handles any resizing that might be necessary */
    return insertdict(mp, key, hash, value);
}

References

  1. dis - Disassembler for Python bytecode
  2. python3 - Type Objects