zephyr: ARM: FPU: using Unshared FP Services mode can still result in corrupted floating point registers

Describe the bug Unshared FP Services mode is supposed to work properly when a single thread uses the FPU. However, it looks like GCC (version 9 and above) generates code that temporarily stores integer data to FPU registers, even if no FPU math operations are carried out within a function. The values in FPU registers can be overwritten by other threads, then such modified values are loaded into original function and cause program corruption. This happens because under unshared FP registers mode, the Cortex-M (and perhaps other architectures) do not save/restore the FP context during thread context-switch or even during exception entry and return.

A straightforward workaround to this problem is to force Shared FP Registers mode for Cortex-M whenever CONFIG_FPU is enabled. This could be done with the following work-around:

  1. Force CONFIG_FPU_SHARING=y. This will be enabled FP context preservation.
  2. Force K_FP_REGS for all threads, so when HW_STACK_PROTECTION is enabled, privileged stack overflows can be detected. Alternatively, the flag could be ignored for Cortex-M (it is only used to set up the right stack guard for privileged threads)

To Reproduce An out of tree closed application code was used on nRF52840 with CONFIG_FPU=y and CONFIG_FPU_SHARING=n. Issue reproduced with some toolchains only, look at “Environment” below.

Additionally, the issue is reproduced by in-tree code, see #31472

Expected behavior Using Unshared FP Services mode should be either safe to use (assuming only one thread operates on floats) or disallowed (not being possible to select by Kconfig) if not supported. (As described above, this is not sufficient: we need to ensure FP Sharing mode is also working as expected.)

Impact Using Unshared FP Services mode with ARM Cortex-M4F (and possibly others) results in undefined behavior.

Code listing Here is the function that stores temporarily into FPU register (vmov s16, r3) and then loads from it later (vmov r1, s16).

00028014 <at_send>:
{
   28014:	e92d 4ff0 	stmdb	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
   28018:	ed2d 8b02 	vpush	{d8}
   2801c:	b087      	sub	sp, #28
   2801e:	4680      	mov	r8, r0
   28020:	ee08 3a10 	vmov	s16, r3
   28024:	468a      	mov	sl, r1
   28026:	9203      	str	r2, [sp, #12]
	int64_t ts = k_uptime_get();
   28028:	f022 ffb4 	bl	4af94 <k_uptime_get>
	int64_t remaining_timeout = timeout;
   2802c:	9c12      	ldr	r4, [sp, #72]	; 0x48
	int64_t ts = k_uptime_get();
   2802e:	4606      	mov	r6, r0
   28030:	460f      	mov	r7, r1
	int64_t remaining_timeout = timeout;
   28032:	17e5      	asrs	r5, r4, #31
	while (k_msgq_get(&modem->at.msgq, &c,
   28034:	f508 7ba4 	add.w	fp, r8, #328	; 0x148
	return z_impl_k_msgq_get(msgq, data, timeout);
   28038:	f640 42cd 	movw	r2, #3277	; 0xccd
   2803c:	2300      	movs	r3, #0
   2803e:	f10d 0117 	add.w	r1, sp, #23
   28042:	4658      	mov	r0, fp
   28044:	f00b fafc 	bl	33640 <z_impl_k_msgq_get>
   28048:	2800      	cmp	r0, #0
   2804a:	d052      	beq.n	280f2 <at_send+0xde>
	uart_push_at(modem->uart, request);
   2804c:	f8d8 0008 	ldr.w	r0, [r8, #8]
	uart_push_str(uart_dev, str);
   28050:	9001      	str	r0, [sp, #4]
   28052:	4651      	mov	r1, sl
   28054:	f022 ff41 	bl	4aeda <uart_push_str>
	uart_push_str(uart_dev, "\r");
   28058:	493a      	ldr	r1, [pc, #232]	; (28144 <at_send+0x130>)
   2805a:	9801      	ldr	r0, [sp, #4]
   2805c:	f022 ff3d 	bl	4aeda <uart_push_str>
   28060:	4b39      	ldr	r3, [pc, #228]	; (28148 <at_send+0x134>)
   28062:	4a3a      	ldr	r2, [pc, #232]	; (2814c <at_send+0x138>)
   28064:	1a9b      	subs	r3, r3, r2
	uint8_t *recv_ptr = modem->at.recv_buf;
   28066:	f508 79b8 	add.w	r9, r8, #368	; 0x170
   2806a:	08db      	lsrs	r3, r3, #3
   2806c:	9302      	str	r3, [sp, #8]
   2806e:	46c8      	mov	r8, r9
	uptime = k_uptime_get();
   28070:	f022 ff90 	bl	4af94 <k_uptime_get>
   28074:	19a6      	adds	r6, r4, r6
   28076:	eb45 0707 	adc.w	r7, r5, r7
		remaining_timeout -= k_uptime_delta(&ts);
   2807a:	1a34      	subs	r4, r6, r0
   2807c:	eb67 0501 	sbc.w	r5, r7, r1
		if (remaining_timeout < 0) {
   28080:	2c00      	cmp	r4, #0
   28082:	f175 0300 	sbcs.w	r3, r5, #0
   28086:	9001      	str	r0, [sp, #4]
   28088:	468a      	mov	sl, r1
   2808a:	db54      	blt.n	28136 <at_send+0x122>
   2808c:	03e9      	lsls	r1, r5, #15
   2808e:	03e0      	lsls	r0, r4, #15
   28090:	f240 36e7 	movw	r6, #999	; 0x3e7
   28094:	1980      	adds	r0, r0, r6
   28096:	ea41 4154 	orr.w	r1, r1, r4, lsr #17
   2809a:	f44f 727a 	mov.w	r2, #1000	; 0x3e8
   2809e:	f04f 0300 	mov.w	r3, #0
   280a2:	f141 0100 	adc.w	r1, r1, #0
   280a6:	f7d8 fd31 	bl	b0c <__aeabi_uldivmod>
   280aa:	4602      	mov	r2, r0
   280ac:	460b      	mov	r3, r1
   280ae:	f10d 0117 	add.w	r1, sp, #23
   280b2:	4658      	mov	r0, fp
   280b4:	f00b fac4 	bl	33640 <z_impl_k_msgq_get>
		if (k_msgq_get(&modem->at.msgq, &c,
   280b8:	2800      	cmp	r0, #0
   280ba:	d13c      	bne.n	28136 <at_send+0x122>
		if (c == '\r' || c == '\n' || c == '\0') {
   280bc:	f89d 2017 	ldrb.w	r2, [sp, #23]
   280c0:	2a0d      	cmp	r2, #13
   280c2:	d825      	bhi.n	28110 <at_send+0xfc>
   280c4:	f242 4301 	movw	r3, #9217	; 0x2401
   280c8:	40d3      	lsrs	r3, r2
   280ca:	43db      	mvns	r3, r3
   280cc:	f013 0301 	ands.w	r3, r3, #1
   280d0:	d11e      	bne.n	28110 <at_send+0xfc>
			if (recv_ptr == modem->at.recv_buf) {
   280d2:	45c8      	cmp	r8, r9
   280d4:	d00a      	beq.n	280ec <at_send+0xd8>
			*recv_ptr = '\0';
   280d6:	f888 3000 	strb.w	r3, [r8]
			ret = line_cb(modem->at.recv_buf, user_data);
   280da:	ee18 1a10 	vmov	r1, s16
   280de:	9b03      	ldr	r3, [sp, #12]
   280e0:	4648      	mov	r0, r9
   280e2:	4798      	blx	r3
			if (ret != -EAGAIN) {
   280e4:	f110 0f0b 	cmn.w	r0, #11
   280e8:	d127      	bne.n	2813a <at_send+0x126>
	uint8_t *recv_ptr = modem->at.recv_buf;
   280ea:	46c8      	mov	r8, r9
   280ec:	9e01      	ldr	r6, [sp, #4]
   280ee:	4657      	mov	r7, sl
   280f0:	e7be      	b.n	28070 <at_send+0x5c>
   280f2:	f022 ff4f 	bl	4af94 <k_uptime_get>
		remaining_timeout -= k_uptime_delta(&ts);
   280f6:	1a36      	subs	r6, r6, r0
   280f8:	eb67 0701 	sbc.w	r7, r7, r1
   280fc:	19a4      	adds	r4, r4, r6
   280fe:	eb47 0505 	adc.w	r5, r7, r5
		if (remaining_timeout < 0) {
   28102:	2c00      	cmp	r4, #0
   28104:	f175 0300 	sbcs.w	r3, r5, #0
   28108:	db15      	blt.n	28136 <at_send+0x122>
	*reftime = uptime;
   2810a:	4606      	mov	r6, r0
   2810c:	460f      	mov	r7, r1
   2810e:	e793      	b.n	28038 <at_send+0x24>
			*recv_ptr = c;
   28110:	f808 2b01 	strb.w	r2, [r8], #1
			if (recv_ptr - modem->at.recv_buf >=
   28114:	eba8 0309 	sub.w	r3, r8, r9
   28118:	2b3f      	cmp	r3, #63	; 0x3f
   2811a:	dde7      	ble.n	280ec <at_send+0xd8>
				LOG_WRN("AT response overflow");
   2811c:	4b0c      	ldr	r3, [pc, #48]	; (28150 <at_send+0x13c>)
   2811e:	681b      	ldr	r3, [r3, #0]
   28120:	f013 0f06 	tst.w	r3, #6
   28124:	d0e1      	beq.n	280ea <at_send+0xd6>
   28126:	9b02      	ldr	r3, [sp, #8]
   28128:	480a      	ldr	r0, [pc, #40]	; (28154 <at_send+0x140>)
   2812a:	0199      	lsls	r1, r3, #6
   2812c:	f041 0102 	orr.w	r1, r1, #2
   28130:	f01a fba9 	bl	42886 <log_0>
   28134:	e7d9      	b.n	280ea <at_send+0xd6>
			return -ETIMEDOUT;
   28136:	f06f 0073 	mvn.w	r0, #115	; 0x73
}
   2813a:	b007      	add	sp, #28
   2813c:	ecbd 8b02 	vpop	{d8}
   28140:	e8bd 8ff0 	ldmia.w	sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
   28144:	00060bb6 	.word	0x00060bb6
   28148:	0005183c 	.word	0x0005183c
   2814c:	000516ac 	.word	0x000516ac
   28150:	20002ecc 	.word	0x20002ecc
   28154:	0006523e 	.word	0x0006523e

Environment

  • OS: Linux
  • Toolchain: Zephyr SDK 0.11.2, 0.12.0 Beta1, GNU ARM Embedded 9.2 2019 Q4 major (but not reproducible with GNU ARM Embedded 8.3 2019 Q3 update)
  • Commit SHA or Version used: 966015f503d1438c25d59793762495452be5ebbc

Additional context Similar issue was reported here: https://answers.launchpad.net/gcc-arm-embedded/+question/691604

About this issue

  • Original URL
  • State: closed
  • Created 4 years ago
  • Comments: 24 (21 by maintainers)

Most upvoted comments

@ioannisg x86 does not seem to be affected by this. FPU/SSE have to be explicitly enabled before those registers can be used on 32-bit. On 64-bit, FPU/SSE are always available so we are saving those registers anyway.

So, I am still thinking around this issue. I am involving @tejlmand (Toolchain responsible and build-system maintainer) to get his view.

What I can tell about the workaround is:

  1. We can enable FPU_SHARING unconditionally, when we use FPU (this means we enforce sharing registers mode). This enables FP register stacking when needed. This is not enough though: we need to treat all threads as having the K_FP_REGS flag set. This adds some extra memory overhead, cause the stack area wasted for the stack guard is 128 instead of 32 bytes.
  2. If ISRs use the FP registers, arm context stacking and unstacking should do the work. Am I missing something @mniestroj @ABOSTM ?