Andrew Geissler | 84ad7c5 | 2020-06-27 00:00:16 -0500 | [diff] [blame] | 1 | From b35b582ef3f6575447097585174302fde1761078 Mon Sep 17 00:00:00 2001 |
| 2 | From: Nagaraju <nmekala@xilinx.com> |
| 3 | Date: Wed, 24 Apr 2019 23:29:21 +0530 |
| 4 | Subject: [PATCH 11/11] Remove the assembly implementations of the 64-bit string |
| 5 | functions. Revisit in the next release and fix them. |
| 6 | |
| 7 | --- |
| 8 | newlib/libc/machine/microblaze/mb_endian.h | 4 ++ |
| 9 | newlib/libc/machine/microblaze/strcmp.c | 93 ++++++++++-------------------- |
| 10 | newlib/libc/machine/microblaze/strcpy.c | 82 ++++++++------------------ |
| 11 | newlib/libc/machine/microblaze/strlen.c | 59 +++++++------------ |
| 12 | 4 files changed, 81 insertions(+), 157 deletions(-) |
| 13 | |
| 14 | diff --git a/newlib/libc/machine/microblaze/mb_endian.h b/newlib/libc/machine/microblaze/mb_endian.h |
| 15 | index fb217ec..17772c8 100644 |
| 16 | --- a/newlib/libc/machine/microblaze/mb_endian.h |
| 17 | +++ b/newlib/libc/machine/microblaze/mb_endian.h |
| 18 | @@ -8,8 +8,12 @@ |
| 19 | #ifdef __LITTLE_ENDIAN__ |
| 20 | #define LOAD4BYTES(rD,rA,rB) "\tlwr\t" rD ", " rA ", " rB "\n" |
| 21 | #define STORE4BYTES(rD,rA,rB) "\tswr\t" rD ", " rA ", " rB "\n" |
| 22 | +#define LOAD8BYTES(rD,rA,rB) "\tllr\t" rD ", " rA ", " rB "\n" |
| 23 | +#define STORE8BYTES(rD,rA,rB) "\tslr\t" rD ", " rA ", " rB "\n" |
| 24 | #else |
| 25 | #define LOAD4BYTES(rD,rA,rB) "\tlw\t" rD ", " rA ", " rB "\n" |
| 26 | #define STORE4BYTES(rD,rA,rB) "\tsw\t" rD ", " rA ", " rB "\n" |
| 27 | +#define LOAD8BYTES(rD,rA,rB) "\tll\t" rD ", " rA ", " rB "\n" |
| 28 | +#define STORE8BYTES(rD,rA,rB) "\tsl\t" rD ", " rA ", " rB "\n" |
| 29 | #endif |
| 30 | #endif |
| 31 | diff --git a/newlib/libc/machine/microblaze/strcmp.c b/newlib/libc/machine/microblaze/strcmp.c |
| 32 | index acfe4cd..e34c64a 100644 |
| 33 | --- a/newlib/libc/machine/microblaze/strcmp.c |
| 34 | +++ b/newlib/libc/machine/microblaze/strcmp.c |
| 35 | @@ -129,70 +129,42 @@ strcmp (const char *s1, |
| 36 | return (*(unsigned char *) s1) - (*(unsigned char *) s2); |
| 37 | #endif /* not PREFER_SIZE_OVER_SPEED */ |
| 38 | |
| 39 | +#elif __arch64__ |
| 40 | + unsigned int *a1; |
| 41 | + unsigned int *a2; |
| 42 | + |
| 43 | + /* If s1 or s2 are unaligned, then compare bytes. */ |
| 44 | + if (!UNALIGNED (s1, s2)) |
| 45 | + { |
| 46 | + /* If s1 and s2 are word-aligned, compare them a word at a time. */ |
| 47 | + a1 = (unsigned int*)s1; |
| 48 | + a2 = (unsigned int*)s2; |
| 49 | + while (*a1 == *a2) |
| 50 | + { |
| 51 | + /* To get here, *a1 == *a2, thus if we find a null in *a1, |
| 52 | + then the strings must be equal, so return zero. */ |
| 53 | + if (DETECTNULL (*a1)) |
| 54 | + return 0; |
| 55 | + |
| 56 | + a1++; |
| 57 | + a2++; |
| 58 | + } |
| 59 | + |
| 60 | + /* A difference was detected in last few bytes of s1, so search bytewise */ |
| 61 | + s1 = (char*)a1; |
| 62 | + s2 = (char*)a2; |
| 63 | + } |
| 64 | + |
| 65 | + while (*s1 != '\0' && *s1 == *s2) |
| 66 | + { |
| 67 | + s1++; |
| 68 | + s2++; |
| 69 | + } |
| 70 | + return (*(unsigned char *) s1) - (*(unsigned char *) s2); |
| 71 | #else |
| 72 | |
| 73 | #include "mb_endian.h" |
| 74 | |
| 75 | -#ifdef __arch64__ |
| 76 | - asm volatile (" \n\ |
| 77 | - orl r9, r0, r0 /* Index register */ \n\ |
| 78 | -check_alignment: \n\ |
| 79 | - andli r3, r5, 3 \n\ |
| 80 | - andli r4, r6, 3 \n\ |
| 81 | - beanei r3, try_align_args \n\ |
| 82 | - beanei r4, regular_strcmp /* At this point we don't have a choice */ \n\ |
| 83 | -cmp_loop: \n" |
| 84 | - LOAD4BYTES("r3", "r5", "r9") |
| 85 | - LOAD4BYTES("r4", "r6", "r9") |
| 86 | -" \n\ |
| 87 | - pcmplbf r7, r3, r0 /* See if there is Null byte */ \n\ |
| 88 | - beanei r7, end_cmp_loop /* IF yes (r7 > 0) use byte compares in end_cmp_loop */ \n\ |
| 89 | - cmplu r7, r4, r3 /* ELSE compare whole word */ \n\ |
| 90 | - beanei r7, end_cmp \n\ |
| 91 | - addlik r9, r9, 4 /* delay slot */ \n\ |
| 92 | - breaid cmp_loop \n\ |
| 93 | - nop /* delay slot */ \n\ |
| 94 | -end_cmp_loop: \n\ |
| 95 | - lbu r3, r5, r9 /* byte compare loop */ \n\ |
| 96 | - lbu r4, r6, r9 \n\ |
| 97 | - cmplu r7, r4, r3 /* Compare bytes */ \n\ |
| 98 | - beanei r7, end_cmp_early \n\ |
| 99 | - addlik r9, r9, 1 /* delay slot */ \n\ |
| 100 | - beaneid r3, end_cmp_loop /* If reached null on one string, terminate */ \n\ |
| 101 | - nop \n\ |
| 102 | -end_cmp_early: \n\ |
| 103 | - orl r3, r0, r7 /* delay slot */ \n\ |
| 104 | - rtsd r15, 8 \n\ |
| 105 | - nop \n\ |
| 106 | -try_align_args: \n\ |
| 107 | - xorl r7, r4, r3 \n\ |
| 108 | - beanei r7, regular_strcmp /* cannot align args */ \n\ |
| 109 | - rsublik r10, r3, 4 /* Number of initial bytes to align */ \n\ |
| 110 | -align_loop: \n\ |
| 111 | - lbu r3, r5, r9 \n\ |
| 112 | - lbu r4, r6, r9 \n\ |
| 113 | - cmplu r7, r4, r3 \n\ |
| 114 | - beanei r7, end_cmp \n\ |
| 115 | - beaeqi r3, end_cmp \n\ |
| 116 | - addlik r10, r10, -1 \n\ |
| 117 | - addlik r9, r9, 1 \n\ |
| 118 | - beaeqid r10, cmp_loop \n\ |
| 119 | - nop \n\ |
| 120 | - breai align_loop \n\ |
| 121 | -regular_strcmp: \n\ |
| 122 | - lbu r3, r5, r9 \n\ |
| 123 | - lbu r4, r6, r9 \n\ |
| 124 | - cmplu r7, r4, r3 \n\ |
| 125 | - beanei r7, end_cmp \n\ |
| 126 | - beaeqi r3, end_cmp \n\ |
| 127 | - addlik r9, r9, 1 \n\ |
| 128 | - breaid regular_strcmp \n\ |
| 129 | - nop \n\ |
| 130 | -end_cmp: \n\ |
| 131 | - orl r3, r0, r7 \n\ |
| 132 | - rtsd r15, 8 \n\ |
| 133 | - nop /* Return strcmp result */"); |
| 134 | -#else |
| 135 | asm volatile (" \n\ |
| 136 | or r9, r0, r0 /* Index register */\n\ |
| 137 | check_alignment: \n\ |
| 138 | @@ -246,7 +218,6 @@ end_cmp: |
| 139 | rtsd r15, 8 \n\ |
| 140 | or r3, r0, r7 /* Return strcmp result */"); |
| 141 | |
| 142 | -#endif |
| 143 | #endif /* ! HAVE_HW_PCMP */ |
| 144 | } |
| 145 | |
| 146 | diff --git a/newlib/libc/machine/microblaze/strcpy.c b/newlib/libc/machine/microblaze/strcpy.c |
| 147 | index 6dbc60d..ddb6922 100644 |
| 148 | --- a/newlib/libc/machine/microblaze/strcpy.c |
| 149 | +++ b/newlib/libc/machine/microblaze/strcpy.c |
| 150 | @@ -121,67 +121,36 @@ strcpy (char *__restrict dst0, |
| 151 | ; |
| 152 | return dst0; |
| 153 | #endif /* not PREFER_SIZE_OVER_SPEED */ |
| 154 | +#elif __arch64__ |
| 155 | + char *dst = dst0; |
| 156 | + const char *src = src0; |
| 157 | + long *aligned_dst; |
| 158 | + const long *aligned_src; |
| 159 | |
| 160 | -#else |
| 161 | + /* If SRC or DEST is unaligned, then copy bytes. */ |
| 162 | + if (!UNALIGNED (src, dst)) |
| 163 | + { |
| 164 | + aligned_dst = (long*)dst; |
| 165 | + aligned_src = (long*)src; |
| 166 | |
| 167 | -#include "mb_endian.h" |
| 168 | -#ifdef __arch64__ |
| 169 | + /* SRC and DEST are both "long int" aligned, try to do "long int" |
| 170 | + sized copies. */ |
| 171 | + while (!DETECTNULL(*aligned_src)) |
| 172 | + { |
| 173 | + *aligned_dst++ = *aligned_src++; |
| 174 | + } |
| 175 | |
| 176 | - asm volatile (" \n\ |
| 177 | - orl r9, r0, r0 /* Index register */ \n\ |
| 178 | -check_alignment: \n\ |
| 179 | - andli r3, r5, 3 \n\ |
| 180 | - andli r4, r6, 3 \n\ |
| 181 | - beanei r3, try_align_args \n\ |
| 182 | - beanei r4, regular_strcpy /* At this point we dont have a choice */ \n\ |
| 183 | -cpy_loop: \n" |
| 184 | - LOAD4BYTES("r3", "r6", "r9") |
| 185 | -" \n\ |
| 186 | - pcmplbf r4, r0, r3 \n\ |
| 187 | - beanei r4, cpy_bytes /* If r4 != 0, then null present within string */\n" |
| 188 | - STORE4BYTES("r3", "r5", "r9") |
| 189 | -" \n\ |
| 190 | - addlik r9, r9, 4 \n\ |
| 191 | - breaid cpy_loop \n\ |
| 192 | - nop \n\ |
| 193 | -cpy_bytes: \n\ |
| 194 | - lbu r3, r6, r9 \n\ |
| 195 | - sb r3, r5, r9 \n\ |
| 196 | - addlik r4, r4, -1 \n\ |
| 197 | - addlik r9, r9, 1 /* delay slot */\n\ |
| 198 | - beaneid r4, cpy_bytes \n\ |
| 199 | - nop \n\ |
| 200 | -cpy_null: \n\ |
| 201 | - orl r3, r0, r5 /* Return strcpy result */\n\ |
| 202 | - rtsd r15, 8 \n\ |
| 203 | - nop \n\ |
| 204 | -try_align_args: \n\ |
| 205 | - xorl r7, r4, r3 \n\ |
| 206 | - beanei r7, regular_strcpy /* cannot align args */\n\ |
| 207 | - rsublik r10, r3, 4 /* Number of initial bytes to align */\n\ |
| 208 | -align_loop: \n\ |
| 209 | - lbu r3, r6, r9 \n\ |
| 210 | - sb r3, r5, r9 \n\ |
| 211 | - addlik r10, r10, -1 \n\ |
| 212 | - beaeqid r3, end_cpy /* Break if we have seen null character */\n\ |
| 213 | - nop \n\ |
| 214 | - addlik r9, r9, 1 \n\ |
| 215 | - beaneid r10, align_loop \n\ |
| 216 | - nop \n\ |
| 217 | - breai cpy_loop \n\ |
| 218 | -regular_strcpy: \n\ |
| 219 | - lbu r3, r6, r9 \n\ |
| 220 | - sb r3, r5, r9 \n\ |
| 221 | - addlik r9, r9, 1 \n\ |
| 222 | - beaneid r3, regular_strcpy \n\ |
| 223 | - nop \n\ |
| 224 | -end_cpy: \n\ |
| 225 | - orl r3, r0, r5 \n\ |
| 226 | - rtsd r15, 8 \n\ |
| 227 | - nop /* Return strcpy result */"); |
| 228 | + dst = (char*)aligned_dst; |
| 229 | + src = (char*)aligned_src; |
| 230 | + } |
| 231 | |
| 232 | -#else |
| 233 | + while (*dst++ = *src++) |
| 234 | + ; |
| 235 | + return dst0; |
| 236 | + |
| 237 | +#else |
| 238 | |
| 239 | +#include "mb_endian.h" |
| 240 | asm volatile (" \n\ |
| 241 | or r9, r0, r0 /* Index register */ \n\ |
| 242 | check_alignment: \n\ |
| 243 | @@ -227,7 +196,6 @@ regular_strcpy: \n\ |
| 244 | end_cpy: \n\ |
| 245 | rtsd r15, 8 \n\ |
| 246 | or r3, r0, r5 /* Return strcpy result */"); |
| 247 | -#endif |
| 248 | #endif /* ! HAVE_HW_PCMP */ |
| 249 | } |
| 250 | |
| 251 | diff --git a/newlib/libc/machine/microblaze/strlen.c b/newlib/libc/machine/microblaze/strlen.c |
| 252 | index b6f2d3c..9407539 100644 |
| 253 | --- a/newlib/libc/machine/microblaze/strlen.c |
| 254 | +++ b/newlib/libc/machine/microblaze/strlen.c |
| 255 | @@ -112,47 +112,29 @@ strlen (const char *str) |
| 256 | return str - start; |
| 257 | #endif /* not PREFER_SIZE_OVER_SPEED */ |
| 258 | |
| 259 | -#else |
| 260 | - |
| 261 | -#include "mb_endian.h" |
| 262 | +#elif __arch64__ |
| 263 | + const char *start = str; |
| 264 | + unsigned long *aligned_addr; |
| 265 | |
| 266 | -#ifdef __arch64__ |
| 267 | - asm volatile (" \n\ |
| 268 | - orl r9, r0, r0 /* Index register */ \n\ |
| 269 | -check_alignment: \n\ |
| 270 | - andli r3, r5, 3 \n\ |
| 271 | - beanei r3, align_arg \n\ |
| 272 | -len_loop: \n" |
| 273 | - LOAD4BYTES("r3", "r5", "r9") |
| 274 | -" \n\ |
| 275 | - pcmplbf r4, r3, r0 \n\ |
| 276 | - beanei r4, end_len \n\ |
| 277 | - addlik r9, r9, 4 \n\ |
| 278 | - breaid len_loop \n\ |
| 279 | - nop \n\ |
| 280 | -end_len: \n\ |
| 281 | - lbu r3, r5, r9 \n\ |
| 282 | - beaeqi r3, done_len \n\ |
| 283 | - addlik r9, r9, 1 \n\ |
| 284 | - breaid end_len \n\ |
| 285 | - nop \n\ |
| 286 | -done_len: \n\ |
| 287 | - orl r3, r0, r9 /* Return len */ \n\ |
| 288 | - rtsd r15, 8 \n\ |
| 289 | - nop \n\ |
| 290 | -align_arg: \n\ |
| 291 | - rsublik r10, r3, 4 \n\ |
| 292 | -align_loop: \n\ |
| 293 | - lbu r3, r5, r9 \n\ |
| 294 | - addlik r10, r10, -1 \n\ |
| 295 | - beaeqid r3, done_len \n\ |
| 296 | - nop \n\ |
| 297 | - addlik r9, r9, 1 \n\ |
| 298 | - beaneid r10, align_loop \n\ |
| 299 | - nop \n\ |
| 300 | - breai len_loop"); |
| 301 | + if (!UNALIGNED (str)) |
| 302 | + { |
| 303 | + /* If the string is word-aligned, we can check for the presence of |
| 304 | + a null in each word-sized block. */ |
| 305 | + aligned_addr = (unsigned long*)str; |
| 306 | + while (!DETECTNULL (*aligned_addr)) |
| 307 | + aligned_addr++; |
| 308 | |
| 309 | + /* Once a null is detected, we check each byte in that block for a |
| 310 | + precise position of the null. */ |
| 311 | + str = (char*)aligned_addr; |
| 312 | + } |
| 313 | + |
| 314 | + while (*str) |
| 315 | + str++; |
| 316 | + return str - start; |
| 317 | #else |
| 318 | + |
| 319 | +#include "mb_endian.h" |
| 320 | asm volatile (" \n\ |
| 321 | or r9, r0, r0 /* Index register */ \n\ |
| 322 | check_alignment: \n\ |
| 323 | @@ -183,6 +165,5 @@ align_loop: \n\ |
| 324 | addik r9, r9, 1 \n\ |
| 325 | bri len_loop"); |
| 326 | |
| 327 | -#endif |
| 328 | #endif /* ! HAVE_HW_PCMP */ |
| 329 | } |
| 330 | -- |
| 331 | 2.7.4 |
| 332 | |