From 03d97c104f2d68cffd1bfc48cd62727e13a64712 Mon Sep 17 00:00:00 2001
From: Rui Miguel Silva <rui.silva@linaro.org>
Date: Fri, 14 Oct 2022 17:42:52 +0100
Subject: [PATCH] newlib: memcpy: remove optimized version

When creating messages to be sent over OpenAMP we may need to
copy to or from unaligned addresses. Because of that we cannot
always use the assembler-optimized memcpy, which raises a data
abort (alignment fault) on such accesses.

Instead, use the generic C version from libc/string (the same
one used in optee-os), which checks the given source and
destination addresses and falls back to byte-wise copies when
they are not suitably aligned.
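
As an illustration (hypothetical code, not part of this patch): a
packed message layout like the one below leaves its payload at an
unaligned offset. The optimized memcpy uses ldp/stp pairs, and when
alignment checking is in effect (e.g. running with the MMU off,
where memory is treated as Device memory, or with SCTLR_ELx.A set),
copying into such a buffer traps:

  #include <stdint.h>
  #include <string.h>

  /* Hypothetical layout: the 5-byte header leaves `payload'
     without 2-, 4- or 8-byte alignment. */
  struct __attribute__((packed)) msg {
      uint8_t hdr[5];
      uint8_t payload[64];
  };

  void pack(struct msg *m, const void *data, size_t len)
  {
      /* The optimized memcpy's ldp/stp on the unaligned
         destination can raise a data abort under strict
         alignment checking.  The generic C memcpy checks both
         pointers and falls back to byte-wise copies. */
      memcpy(m->payload, data, len);
  }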

Upstream-Status: Pending
Signed-off-by: Rui Miguel Silva <rui.silva@linaro.org>
---
 newlib/libc/machine/aarch64/memcpy-stub.c |   2 +-
 newlib/libc/machine/aarch64/memcpy.S      | 166 ----------------------
 2 files changed, 1 insertion(+), 167 deletions(-)

diff --git a/newlib/libc/machine/aarch64/memcpy-stub.c b/newlib/libc/machine/aarch64/memcpy-stub.c
index cd6d72a8b8af..5f2b7968c7fc 100644
--- a/newlib/libc/machine/aarch64/memcpy-stub.c
+++ b/newlib/libc/machine/aarch64/memcpy-stub.c
@@ -27,5 +27,5 @@
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 # include "../../string/memcpy.c"
 #else
-/* See memcpy.S */
+# include "../../string/memcpy.c"
 #endif
diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
index 463bad0a1816..2a1460546374 100644
--- a/newlib/libc/machine/aarch64/memcpy.S
+++ b/newlib/libc/machine/aarch64/memcpy.S
@@ -61,170 +61,4 @@
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See memcpy-stub.c */
 #else
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define tmp1 x9
-
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Small and medium copies read all data before writing, allowing any
- kind of overlap, and memmove tailcalls memcpy for these cases as
- well as non-overlapping copies.
-*/
-
-def_fn memcpy p2align=6
- prfm PLDL1KEEP, [src]
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
- b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
- b.lo 1f
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-1:
- tbz count, 2, 1f
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
-
- .p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
- ret
-
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- .p2align 4
-L(copy_long):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
-1:
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
- .size memcpy, . - memcpy
 #endif
-- 
2.38.0
