From 03d97c104f2d68cffd1bfc48cd62727e13a64712 Mon Sep 17 00:00:00 2001
From: Rui Miguel Silva <rui.silva@linaro.org>
Date: Fri, 14 Oct 2022 17:42:52 +0100
Subject: [PATCH] newlib: memcpy: remove optimized version

When creating packed messages to send over OpenAMP we may need to
copy to or from unaligned addresses. Because of that, we cannot
always use the assembler-optimized version, which will throw a
data abort on an alignment fault.

So, just use the generic C version from libc/string (the same one
used in optee-os), which takes care to check alignment and pick the
copy strategy based on the given source and destination addresses,
as sketched below.

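For illustration, a minimal sketch of that alignment-checking
approach (sketch_memcpy and WORD_UNALIGNED are hypothetical names,
not the verbatim newlib source):

  #include <stddef.h>
  #include <stdint.h>

  /* Wide copies are only attempted when src and dst share word
     alignment; the optimized memcpy.S issues LDP/STP regardless,
     which can data-abort when strict alignment is enforced. */
  #define WORD_UNALIGNED(a, b) \
    ((((uintptr_t)(a) | (uintptr_t)(b)) & (sizeof(long) - 1)) != 0)

  void *sketch_memcpy(void *dst, const void *src, size_t n)
  {
    char *d = dst;
    const char *s = src;

    if (n >= sizeof(long) && !WORD_UNALIGNED(s, d)) {
      long *ld = (long *)d;
      const long *ls = (const long *)s;
      while (n >= sizeof(long)) {   /* aligned: copy a word at a time */
        *ld++ = *ls++;
        n -= sizeof(long);
      }
      d = (char *)ld;
      s = (const char *)ls;
    }
    while (n--)                     /* tail, or all bytes if unaligned */
      *d++ = *s++;
    return dst;
  }
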
Upstream-Status: Pending
Signed-off-by: Rui Miguel Silva <rui.silva@linaro.org>
---
 newlib/libc/machine/aarch64/memcpy-stub.c | 2 +-
 newlib/libc/machine/aarch64/memcpy.S | 166 ----------------------
 2 files changed, 1 insertion(+), 167 deletions(-)

diff --git a/newlib/libc/machine/aarch64/memcpy-stub.c b/newlib/libc/machine/aarch64/memcpy-stub.c
index cd6d72a8b8af..5f2b7968c7fc 100644
--- a/newlib/libc/machine/aarch64/memcpy-stub.c
+++ b/newlib/libc/machine/aarch64/memcpy-stub.c
@@ -27,5 +27,5 @@
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 # include "../../string/memcpy.c"
 #else
-/* See memcpy.S */
+# include "../../string/memcpy.c"
 #endif
diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S
index 463bad0a1816..2a1460546374 100644
--- a/newlib/libc/machine/aarch64/memcpy.S
+++ b/newlib/libc/machine/aarch64/memcpy.S
@@ -61,170 +61,4 @@
 #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
 /* See memcpy-stub.c */
 #else
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_l x10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l src
-#define E_h count
-#define F_l srcend
-#define F_h dst
-#define tmp1 x9
-
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Small and medium copies read all data before writing, allowing any
- kind of overlap, and memmove tailcalls memcpy for these cases as
- well as non-overlapping copies.
-*/
-
-def_fn memcpy p2align=6
- prfm PLDL1KEEP, [src]
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
- b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
- b.lo 1f
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-1:
- tbz count, 2, 1f
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
-
- .p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
- ret
-
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- .p2align 4
-L(copy_long):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
-1:
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-2:
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
- .size memcpy, . - memcpy
 #endif
-- 
2.38.0
