Graeme Winter

Hybrid Programming

Extending MicroPython with native modules is well established for modules written in C, but of course that is really not so interesting. It would be much more interesting to write the module in assembly. So, let’s do that.

Started from the cookie-cutter example which had a few limitations -

The former was the annoyance, since I wanted to do “proper” assembly, i.e. not just inlining a few bits. I wanted to take my Mandelbrot set calculation and write it in sane assembly, since that had already been shown to be correct. To that end I dug into one of my other repos, where I had written the same calculation a few times over in different variants of ARM assembly, and started with the ARM7 variant.

Key points:

Starting from this, let’s do some work.

Project layout

We are writing MicroPython extension modules here so cannot avoid some dependency thereon - export MPY_DIR to point at where the source code is checked out and built (we depend on the mpy compiler stuff as part of the tool chain). Also depends on having the ARM compiler suite installed, but if you just built µPython then you have that already.

File list

Following files:

General Makefile

# Makefile fragment for generating native .mpy files from C (.c), assembly (.S)
# and Python (.py) sources.
#
# Variables the including Makefile (or the environment) must provide:
#   MPY_DIR - top of the MicroPython source tree
#   MOD     - module name; the final output is $(MOD).mpy
#   SRC     - list of .c / .S / .py source files
#   ARCH    - target architecture, checked in the architecture section

# Fail early with a clear message instead of resolving the tools relative to
# "/" or building a file called ".mpy".  Skipped when only cleaning, which
# needs neither variable.
ifeq ($(filter clean,$(MAKECMDGOALS)),)
ifeq ($(strip $(MPY_DIR)),)
$(error MPY_DIR must be set to the top of the MicroPython source tree)
endif
ifeq ($(strip $(MOD)),)
$(error MOD must be set to the name of the module to build)
endif
endif

# Output directory for all intermediate files
BUILD ?= build

# Host tools
ECHO = @echo
RM = /bin/rm
MKDIR = /bin/mkdir
PYTHON = python3
MPY_CROSS = $(MPY_DIR)/mpy-cross/build/mpy-cross
MPY_TOOL = $(PYTHON) $(MPY_DIR)/tools/mpy-tool.py
MPY_LD = $(PYTHON) $(MPY_DIR)/tools/mpy_ld.py

# Recipes are silenced with $(Q) unless V=1 is given on the command line, in
# which case commands are echoed and mpy_ld runs with -vvv
Q = @
ifeq ("$(origin V)", "command line")
ifeq ($(V),1)
Q =
MPY_LD += '-vvv'
endif
endif

ARCH_UPPER = $(shell echo $(ARCH) | tr '[:lower:]' '[:upper:]')

# Header generated by mpy_ld --preprocess; handed to the C compiler as
# MP_CONFIGFILE and to the link step via --qstrs
CONFIG_H = $(BUILD)/$(MOD).config.h

CFLAGS += -I. -I$(MPY_DIR)
CFLAGS += -std=c99
CFLAGS += -Os
CFLAGS += -Wall -Werror -DNDEBUG
CFLAGS += -DNO_QSTR
CFLAGS += -DMICROPY_ENABLE_DYNRUNTIME
CFLAGS += -DMP_CONFIGFILE='<$(CONFIG_H)>'
CFLAGS += -fpic -fno-common
# prevent use of __*_chk libc functions
CFLAGS += -U _FORTIFY_SOURCE
#CFLAGS += -fdata-sections -ffunction-sections

MPY_CROSS_FLAGS += -march=$(ARCH)

# Derived file lists: one object per .c / .S source and one .mpy per .py
# source, all placed under $(BUILD)
SRC_O += $(addprefix $(BUILD)/, $(patsubst %.c,%.o,$(filter %.c,$(SRC))) $(patsubst %.S,%.o,$(filter %.S,$(SRC))))
SRC_MPY += $(addprefix $(BUILD)/, $(patsubst %.py,%.mpy,$(filter %.py,$(SRC))))

################################################################################
# Architecture configuration
#
# For the selected ARCH each branch sets:
#   CROSS              - toolchain prefix ('' for the host toolchain)
#   CFLAGS             - C compiler target flags (appended)
#   AFLAGS             - assembler target flags for .S sources
#   MICROPY_FLOAT_IMPL - float implementation (none / float / double), only
#                        if not already set by the caller
# An unrecognised ARCH aborts the build.

ifeq ($(ARCH),x86)

# x86
CROSS =
CFLAGS += -m32 -fno-stack-protector
MICROPY_FLOAT_IMPL ?= double
AFLAGS =

else ifeq ($(ARCH),x64)

# x64
CROSS =
CFLAGS += -fno-stack-protector
MICROPY_FLOAT_IMPL ?= double
AFLAGS =

else ifeq ($(ARCH),armv6m)

# thumb
CROSS = arm-none-eabi-
CFLAGS += -mthumb -mcpu=cortex-m0
MICROPY_FLOAT_IMPL ?= none
AFLAGS = -mthumb -mcpu=cortex-m0

else ifeq ($(ARCH),armv7m)

# thumb
CROSS = arm-none-eabi-
CFLAGS += -mthumb -mcpu=cortex-m3
MICROPY_FLOAT_IMPL ?= none
AFLAGS = -mthumb -mcpu=cortex-m3

else ifeq ($(ARCH),armv7emsp)

# thumb, single-precision hardware float
CROSS = arm-none-eabi-
CFLAGS += -mthumb -mcpu=cortex-m4
CFLAGS += -mfpu=fpv4-sp-d16 -mfloat-abi=hard
MICROPY_FLOAT_IMPL ?= float
# Mirror the FPU / float ABI given to the C compiler so that .S sources may
# use VFP instructions and are assembled for the same ABI as the C objects
AFLAGS = -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard

else ifeq ($(ARCH),armv7emdp)

# thumb, double-precision hardware float
CROSS = arm-none-eabi-
CFLAGS += -mthumb -mcpu=cortex-m7
CFLAGS += -mfpu=fpv5-d16 -mfloat-abi=hard
MICROPY_FLOAT_IMPL ?= double
# Mirror the FPU / float ABI given to the C compiler (see armv7emsp)
AFLAGS = -mthumb -mcpu=cortex-m7 -mfpu=fpv5-d16 -mfloat-abi=hard

else ifeq ($(ARCH),xtensa)

# xtensa
CROSS = xtensa-lx106-elf-
CFLAGS += -mforce-l32
MICROPY_FLOAT_IMPL ?= none
AFLAGS =

else ifeq ($(ARCH),xtensawin)

# xtensawin
CROSS = xtensa-esp32-elf-
CFLAGS +=
MICROPY_FLOAT_IMPL ?= float
AFLAGS =

else
$(error architecture '$(ARCH)' not supported)
endif

# Turn the chosen float implementation into the matching
# MICROPY_FLOAT_IMPL_<NONE|FLOAT|DOUBLE> macro name for the C sources
MICROPY_FLOAT_IMPL_UPPER = $(shell echo $(MICROPY_FLOAT_IMPL) | tr '[:lower:]' '[:upper:]')
CFLAGS += -DMICROPY_FLOAT_IMPL=MICROPY_FLOAT_IMPL_$(MICROPY_FLOAT_IMPL_UPPER)

# Caller-supplied extra flags go last so they can override the defaults above
CFLAGS += $(CFLAGS_EXTRA)

################################################################################
# Build rules
#
# Dependency chain:
#   $(SRC)  --mpy_ld --preprocess-->  $(CONFIG_H)
#   %.c / %.S  -->  $(BUILD)/%.o  --mpy_ld-->  $(BUILD)/$(MOD).native.mpy
#   %.py       -->  $(BUILD)/%.mpy
#   native .mpy + Python .mpy  --mpy-tool --merge-->  $(MOD).mpy

.PHONY: all clean

# If a recipe fails, delete its target so that a truncated output file is
# never mistaken for an up-to-date one on the next run
.DELETE_ON_ERROR:

all: $(MOD).mpy

# Remove the build tree, the final module (which lives outside $(BUILD)) and
# anything extra the including Makefile listed in CLEAN_EXTRA
clean:
	$(RM) -rf $(BUILD) $(MOD).mpy $(CLEAN_EXTRA)

# Create build destination directories first
# (order-only prerequisite: the directories must exist, but their timestamps
# never trigger a rebuild)
BUILD_DIRS = $(sort $(dir $(CONFIG_H) $(SRC_O) $(SRC_MPY)))
$(CONFIG_H) $(SRC_O) $(SRC_MPY): | $(BUILD_DIRS)
$(BUILD_DIRS):
	$(Q)$(MKDIR) -p $@

# Preprocess all source files to generate $(CONFIG_H)
$(CONFIG_H): $(SRC)
	$(ECHO) "GEN $@"
	$(Q)$(MPY_LD) --arch $(ARCH) --preprocess -o $@ $^

# Build .o from .c source files
# (CONFIG_H is a prerequisite because CFLAGS pass it as MP_CONFIGFILE)
$(BUILD)/%.o: %.c $(CONFIG_H) Makefile
	$(ECHO) "CC $<"
	$(Q)$(CROSS)gcc $(CFLAGS) -o $@ -c $<

# Build .o from .S source files
# (run through the assembler directly, with the per-architecture AFLAGS)
$(BUILD)/%.o: %.S $(CONFIG_H) Makefile
	$(ECHO) "AS $<"
	$(Q)$(CROSS)as $(AFLAGS) -c -o $@ $<

# Build .mpy from .py source files
$(BUILD)/%.mpy: %.py
	$(ECHO) "MPY $<"
	$(Q)$(MPY_CROSS) $(MPY_CROSS_FLAGS) -o $@ $<

# Build native .mpy from object files
$(BUILD)/$(MOD).native.mpy: $(SRC_O)
	$(ECHO) "LINK $<"
	$(Q)$(MPY_LD) --arch $(ARCH) --qstrs $(CONFIG_H) -o $@ $^

# Build final .mpy from all intermediate .mpy files
$(MOD).mpy: $(BUILD)/$(MOD).native.mpy $(SRC_MPY)
	$(ECHO) "GEN $@"
	$(Q)$(MPY_TOOL) --merge -o $@ $^

This is general for all projects and will be re-used in future, so it lives at the root of my extension module repo as mpy.mk (each module’s own Makefile pulls it in with include ../mpy.mk).

Specific Makefile

Really simple, point at source, name output, set architecture:

# Module name (the build produces $(MOD).mpy) and the sources it is made of
MOD := mandelbrot
SRC := mandelbrot.c mandelrow_impl.S

# Default target architecture; `make ARCH=<arch>` on the command line
# takes precedence over this assignment
ARCH := armv7emsp

# Shared native-module build rules from the repository root
include ../mpy.mk

C Binding

This is a very slight variation on the cookie cutter example - bridged to the external symbol for mandelrow_impl (Mandelbrot calculation, for one row) - lots of macros are working in here.

#include "py/dynruntime.h"

// Assembly routine (mandelrow_impl.S): computes one row of the Mandelbrot set.
//   addr - address of the output buffer, passed as an integer
//   Ci   - imaginary part of c for this row, as a fixed point value
// The assembly returns the Ci value it was given.
extern int mandelrow_impl(int addr, int Ci);

// Python-callable wrapper: unpack the two integer arguments, run the
// assembly routine and hand its return value back as a Python int.
STATIC mp_obj_t mandelrow(mp_obj_t addr_obj, mp_obj_t ci_obj) {
  const mp_int_t row_addr = mp_obj_get_int(addr_obj);
  const mp_int_t imag_c = mp_obj_get_int(ci_obj);
  return mp_obj_new_int((mp_int_t)mandelrow_impl(row_addr, imag_c));
}

// Function object for the two-argument wrapper above
STATIC MP_DEFINE_CONST_FUN_OBJ_2(mandelrow_obj, mandelrow);

// Module initialisation function.
// NOTE(review): by the MicroPython native-module convention this is the entry
// point invoked when the .mpy file is imported - confirm against the natmod
// documentation for the MicroPython version in use.
mp_obj_t mpy_init(mp_obj_fun_bc_t *self, size_t n_args, size_t n_kw,
                  mp_obj_t *args) {
  // Entry / exit macros come from py/dynruntime.h and bracket the body
  MP_DYNRUNTIME_INIT_ENTRY

  // Expose the mandelrow function object under the global name `mandelbrot`
  // (note: the Python-visible name differs from the C function name)
  mp_store_global(MP_QSTR_mandelbrot, MP_OBJ_FROM_PTR(&mandelrow_obj));

  MP_DYNRUNTIME_INIT_EXIT
}

Assembly Code

Warn assembler this is thumb / unified syntax, inputs are R0 => pointer to row data area, R1 => Ci value as before.

.thumb
.syntax unified

.text
.global mandelrow_impl
	// mark the symbol as a Thumb function so that its symbol table entry
	// matches what the compiler emits for C functions
.type mandelrow_impl, %function
.thumb_func

	// int mandelrow_impl(int addr, int Ci)
	//
	// Computes one row of the Mandelbrot set: for a fixed imaginary part
	// Ci, walks the real axis and stores one iteration count per point.
	//
	// register allocation on input
	// r0 pointer to data area, assumed to be uint32_t
	// r1 7.24 signed fixed point value for Ci
	//
	// output
	// one 32 bit iteration count (0 ... 0x1000) is stored per point; the
	// real axis runs from -2 + 0x4000 in steps of 0x8000 while < 0x800000,
	// i.e. 1280 words (5120 bytes) are written starting at the pointer
	// r0 on return holds the Ci value that was passed in

	// register allocations
	// r0 iter counter
	// r1/r2 real / imag c
	// r3/r4 real / imag z
	// r5 tmp / scratch to alias zr
	// r8, r9 for zr2, zi2
	// r6, r7 for working space for SMULL instruction
	// r10 pointer to next word to write

	// data are stored in 7.24 format fixed point
	// domain in real is -2 to 0.5, imag -1.25 to +1.25

mandelrow_impl:
	// save callee-saved registers used below, plus the return address
	stmdb sp!, {R4-R10, lr}

	// move the arguments into their working registers
	mov R10, R0
	mov R2, R1

	// initial values for cr - origin as above + 0.5 x box
	mov R1, #-2
	lsl R1, #24
	add R1, R1, #0x4000

real:
	// set up for iter - count, zr, zi
	mov R0, #0
	mov R3, #0
	mov R4, #0

iter:
	// zr^2
	// 64 bit product in r7:r6; bits 24..55 are the 7.24 result
	smull R6, R7, R3, R3
	lsr R6, #24
	orr R8, R6, R7, lsl #8

	// zi^2
	smull R6, R7, R4, R4
	lsr R6, #24
	orr R9, R6, R7, lsl #8

	// sum and cmp for zr^2 + zi^2
	// 0x4000000 is 4.0 in 7.24, i.e. escape once |z|^2 >= 4
	add R5, R8, R9
	cmp R5, #0x4000000
	bge end

	// next zr
	// keep the old zr in r5 for the zi update: zr' = zr^2 - zi^2 + cr
	mov R5, R3
	sub R6, R8, R9
	add R3, R6, R1

	// next zi
	// zi' = 2 * zr * zi + ci
	smull R6, R7, R5, R4
	lsr R6, #24
	orr R5, R6, R7, lsl #8
	add R4, R2, R5, lsl #1

	// count the iteration, give up after 0x1000
	add R0, #1
	cmp R0, #0x1000
	bne iter

end:
	// save value
	str R0, [R10, #0]
	add R10, #4

	// increment real, continue
	// step is 0x8000; stop once cr reaches 0x800000 (0.5 in 7.24)
	add R1, R1, #0x8000
	cmp R1, #0x800000
	blt real

	// return Ci, restore registers and return via pc
	mov R0, R2
	ldmia.w sp!, {R4-R10, pc}

Results

Ran in same Python process as previous version mentioned above, gave bitwise identical results, win.