zephyr: ARM: FPU: using Unshared FP Services mode can still result in corrupted floating point registers
Describe the bug Unshared FP Services mode is supposed to work properly when a single thread uses the FPU. However, it looks like GCC (version 9 and above) generates code that temporarily stores integer data to FPU registers, even if no FPU math operations are carried out within a function. The values in FPU registers can be overwritten by other threads, then such modified values are loaded into original function and cause program corruption. This happens because under unshared FP registers mode, the Cortex-M (and perhaps other architectures) do not save/restore the FP context during thread context-switch or even during exception entry and return.
A straightforward workaround to this problem is to force Shared FP Registers mode for Cortex-M whenever CONFIG_FPU is enabled. This could be done with the following work-around:
- Force
CONFIG_FPU_SHARING=y. This will be enabled FP context preservation. - Force K_FP_REGS for all threads, so when HW_STACK_PROTECTION is enabled, privileged stack overflows can be detected. Alternatively, the flag could be ignored for Cortex-M (it is only used to set up the right stack guard for privileged threads)
To Reproduce
An out of tree closed application code was used on nRF52840 with CONFIG_FPU=y and CONFIG_FPU_SHARING=n. Issue reproduced with some toolchains only, look at “Environment” below.
Additionally, the issue is reproduced by in-tree code, see #31472
Expected behavior Using Unshared FP Services mode should be either safe to use (assuming only one thread operates on floats) or disallowed (not being possible to select by Kconfig) if not supported. (As described above, this is not sufficient: we need to ensure FP Sharing mode is also working as expected.)
Impact Using Unshared FP Services mode with ARM Cortex-M4F (and possibly others) results in undefined behavior.
Code listing
Here is the function that stores temporarily into FPU register (vmov s16, r3) and then loads from it later (vmov r1, s16).
00028014 <at_send>:
{
28014: e92d 4ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
28018: ed2d 8b02 vpush {d8}
2801c: b087 sub sp, #28
2801e: 4680 mov r8, r0
28020: ee08 3a10 vmov s16, r3
28024: 468a mov sl, r1
28026: 9203 str r2, [sp, #12]
int64_t ts = k_uptime_get();
28028: f022 ffb4 bl 4af94 <k_uptime_get>
int64_t remaining_timeout = timeout;
2802c: 9c12 ldr r4, [sp, #72] ; 0x48
int64_t ts = k_uptime_get();
2802e: 4606 mov r6, r0
28030: 460f mov r7, r1
int64_t remaining_timeout = timeout;
28032: 17e5 asrs r5, r4, #31
while (k_msgq_get(&modem->at.msgq, &c,
28034: f508 7ba4 add.w fp, r8, #328 ; 0x148
return z_impl_k_msgq_get(msgq, data, timeout);
28038: f640 42cd movw r2, #3277 ; 0xccd
2803c: 2300 movs r3, #0
2803e: f10d 0117 add.w r1, sp, #23
28042: 4658 mov r0, fp
28044: f00b fafc bl 33640 <z_impl_k_msgq_get>
28048: 2800 cmp r0, #0
2804a: d052 beq.n 280f2 <at_send+0xde>
uart_push_at(modem->uart, request);
2804c: f8d8 0008 ldr.w r0, [r8, #8]
uart_push_str(uart_dev, str);
28050: 9001 str r0, [sp, #4]
28052: 4651 mov r1, sl
28054: f022 ff41 bl 4aeda <uart_push_str>
uart_push_str(uart_dev, "\r");
28058: 493a ldr r1, [pc, #232] ; (28144 <at_send+0x130>)
2805a: 9801 ldr r0, [sp, #4]
2805c: f022 ff3d bl 4aeda <uart_push_str>
28060: 4b39 ldr r3, [pc, #228] ; (28148 <at_send+0x134>)
28062: 4a3a ldr r2, [pc, #232] ; (2814c <at_send+0x138>)
28064: 1a9b subs r3, r3, r2
uint8_t *recv_ptr = modem->at.recv_buf;
28066: f508 79b8 add.w r9, r8, #368 ; 0x170
2806a: 08db lsrs r3, r3, #3
2806c: 9302 str r3, [sp, #8]
2806e: 46c8 mov r8, r9
uptime = k_uptime_get();
28070: f022 ff90 bl 4af94 <k_uptime_get>
28074: 19a6 adds r6, r4, r6
28076: eb45 0707 adc.w r7, r5, r7
remaining_timeout -= k_uptime_delta(&ts);
2807a: 1a34 subs r4, r6, r0
2807c: eb67 0501 sbc.w r5, r7, r1
if (remaining_timeout < 0) {
28080: 2c00 cmp r4, #0
28082: f175 0300 sbcs.w r3, r5, #0
28086: 9001 str r0, [sp, #4]
28088: 468a mov sl, r1
2808a: db54 blt.n 28136 <at_send+0x122>
2808c: 03e9 lsls r1, r5, #15
2808e: 03e0 lsls r0, r4, #15
28090: f240 36e7 movw r6, #999 ; 0x3e7
28094: 1980 adds r0, r0, r6
28096: ea41 4154 orr.w r1, r1, r4, lsr #17
2809a: f44f 727a mov.w r2, #1000 ; 0x3e8
2809e: f04f 0300 mov.w r3, #0
280a2: f141 0100 adc.w r1, r1, #0
280a6: f7d8 fd31 bl b0c <__aeabi_uldivmod>
280aa: 4602 mov r2, r0
280ac: 460b mov r3, r1
280ae: f10d 0117 add.w r1, sp, #23
280b2: 4658 mov r0, fp
280b4: f00b fac4 bl 33640 <z_impl_k_msgq_get>
if (k_msgq_get(&modem->at.msgq, &c,
280b8: 2800 cmp r0, #0
280ba: d13c bne.n 28136 <at_send+0x122>
if (c == '\r' || c == '\n' || c == '\0') {
280bc: f89d 2017 ldrb.w r2, [sp, #23]
280c0: 2a0d cmp r2, #13
280c2: d825 bhi.n 28110 <at_send+0xfc>
280c4: f242 4301 movw r3, #9217 ; 0x2401
280c8: 40d3 lsrs r3, r2
280ca: 43db mvns r3, r3
280cc: f013 0301 ands.w r3, r3, #1
280d0: d11e bne.n 28110 <at_send+0xfc>
if (recv_ptr == modem->at.recv_buf) {
280d2: 45c8 cmp r8, r9
280d4: d00a beq.n 280ec <at_send+0xd8>
*recv_ptr = '\0';
280d6: f888 3000 strb.w r3, [r8]
ret = line_cb(modem->at.recv_buf, user_data);
280da: ee18 1a10 vmov r1, s16
280de: 9b03 ldr r3, [sp, #12]
280e0: 4648 mov r0, r9
280e2: 4798 blx r3
if (ret != -EAGAIN) {
280e4: f110 0f0b cmn.w r0, #11
280e8: d127 bne.n 2813a <at_send+0x126>
uint8_t *recv_ptr = modem->at.recv_buf;
280ea: 46c8 mov r8, r9
280ec: 9e01 ldr r6, [sp, #4]
280ee: 4657 mov r7, sl
280f0: e7be b.n 28070 <at_send+0x5c>
280f2: f022 ff4f bl 4af94 <k_uptime_get>
remaining_timeout -= k_uptime_delta(&ts);
280f6: 1a36 subs r6, r6, r0
280f8: eb67 0701 sbc.w r7, r7, r1
280fc: 19a4 adds r4, r4, r6
280fe: eb47 0505 adc.w r5, r7, r5
if (remaining_timeout < 0) {
28102: 2c00 cmp r4, #0
28104: f175 0300 sbcs.w r3, r5, #0
28108: db15 blt.n 28136 <at_send+0x122>
*reftime = uptime;
2810a: 4606 mov r6, r0
2810c: 460f mov r7, r1
2810e: e793 b.n 28038 <at_send+0x24>
*recv_ptr = c;
28110: f808 2b01 strb.w r2, [r8], #1
if (recv_ptr - modem->at.recv_buf >=
28114: eba8 0309 sub.w r3, r8, r9
28118: 2b3f cmp r3, #63 ; 0x3f
2811a: dde7 ble.n 280ec <at_send+0xd8>
LOG_WRN("AT response overflow");
2811c: 4b0c ldr r3, [pc, #48] ; (28150 <at_send+0x13c>)
2811e: 681b ldr r3, [r3, #0]
28120: f013 0f06 tst.w r3, #6
28124: d0e1 beq.n 280ea <at_send+0xd6>
28126: 9b02 ldr r3, [sp, #8]
28128: 480a ldr r0, [pc, #40] ; (28154 <at_send+0x140>)
2812a: 0199 lsls r1, r3, #6
2812c: f041 0102 orr.w r1, r1, #2
28130: f01a fba9 bl 42886 <log_0>
28134: e7d9 b.n 280ea <at_send+0xd6>
return -ETIMEDOUT;
28136: f06f 0073 mvn.w r0, #115 ; 0x73
}
2813a: b007 add sp, #28
2813c: ecbd 8b02 vpop {d8}
28140: e8bd 8ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
28144: 00060bb6 .word 0x00060bb6
28148: 0005183c .word 0x0005183c
2814c: 000516ac .word 0x000516ac
28150: 20002ecc .word 0x20002ecc
28154: 0006523e .word 0x0006523e
Environment
- OS: Linux
- Toolchain: Zephyr SDK 0.11.2, 0.12.0 Beta1, GNU ARM Embedded 9.2 2019 Q4 major (but not reproducible with GNU ARM Embedded 8.3 2019 Q3 update)
- Commit SHA or Version used: 966015f503d1438c25d59793762495452be5ebbc
Additional context Similar issue was reported here: https://answers.launchpad.net/gcc-arm-embedded/+question/691604
About this issue
- Original URL
- State: closed
- Created 4 years ago
- Comments: 24 (21 by maintainers)
@ioannisg x86 does not seem to be affected by this. FPU/SSE have to be explicitly enabled before those registers can be used on 32-bit. On 64-bit, FPU/SSE are always available so we are saving those registers anyway.
So, I am still thinking around this issue. I am involving @tejlmand (Toolchain responsible and build-system maintainer) to get his view.
What I can tell about the workaround is: