From 02a138f0b247fb08b799f32c49b35912b2921321 Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Tue, 12 Feb 2019 11:38:46 -0800
Subject: [PATCH] math_vfp_asm.S: Convert fldmia/fstmia instructions to UAL
 syntax for clang

These pre-UAL mnemonics are flagged as errors by clang's integrated
assembler, since it does not accept non-UAL syntax.

Upstream-Status: Pending

Signed-off-by: Khem Raj <raj.khem@gmail.com>
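
As a quick reference, here is a minimal sketch of the pre-UAL to UAL
mapping the hunks below apply (register lists and writeback '!' are
unchanged; the fabss/fnegs lines show an assumed expansion of the finst
macro argument rather than literal text from the diff):

  fldmias r1!, {s0}  ->  vldmia.f32 r1!, {s0}   @ single-precision load
  fstmias r0!, {s2}  ->  vstmia.f32 r0!, {s2}   @ single-precision store
  fldmiad r1!, {d0}  ->  vldmia.f64 r1!, {d0}   @ double-precision load
  fstmiad r0!, {d2}  ->  vstmia.f64 r0!, {d2}   @ double-precision store
  fabss s2, s0       ->  vabs.f32 s2, s0        @ finst: fabs -> vabs
  fnegs s2, s0       ->  vneg.f32 s2, s0        @ finst: fneg -> vneg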
---
 liboil/arm/math_vfp_asm.S | 94 +++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/liboil/arm/math_vfp_asm.S b/liboil/arm/math_vfp_asm.S
index ae5c803..3dd14d9 100644
--- a/liboil/arm/math_vfp_asm.S
+++ b/liboil/arm/math_vfp_asm.S
@@ -25,7 +25,7 @@
 */
 
 #if defined(__VFP_FP__) && !defined(__SOFTFP__)
-/* 
+/*
 ** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
 **
 ** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
@@ -48,10 +48,10 @@
 ands ip, r3, #7; /* ip = n % 8 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmias r1!, {s0}; \
- fldmias r2!, {s1}; \
+ vldmia.f32 r1!, {s0}; \
+ vldmia.f32 r2!, {s1}; \
 ## finst ##s s2, s0, s1; \
- fstmias r0!, {s2}; \
+ vstmia.f32 r0!, {s2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 8 */ \
@@ -62,15 +62,15 @@
 orr fp, lr, fp, lsl #16; /* set vector lenght to 8 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
- fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
+ vldmia.f32 r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+ vldmia.f32 r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
 ## finst ##s s24, s8, s16; \
- fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+ vstmia.f32 r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
 #define UNROLL_F64_TEMPLATE(fname,finst) \
 .global vfp_ ## fname ## ; \
@@ -79,10 +79,10 @@
 ands ip, r3, #3; /* ip = n % 3 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmiad r1!, {d0}; \
- fldmiad r2!, {d1}; \
+ vldmia.f64 r1!, {d0}; \
+ vldmia.f64 r2!, {d1}; \
 ## finst ##d d2, d0, d1; \
- fstmiad r0!, {d2}; \
+ vstmia.f64 r0!, {d2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 4 */ \
@@ -93,15 +93,15 @@
 orr fp, lr, fp, lsl #16; /* set vector lenght to 8 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmiad r1!, {d4, d5, d6, d7}; \
- fldmiad r2!, {d8, d9, d10, d11}; \
+ vldmia.f64 r1!, {d4, d5, d6, d7}; \
+ vldmia.f64 r2!, {d8, d9, d10, d11}; \
 ## finst ##d d12, d4, d8; \
- fstmiad r0!, {d12, d13, d14, d15}; \
+ vstmia.f64 r0!, {d12, d13, d14, d15}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
 .align 2
 UNROLL_F32_TEMPLATE(add_f32,fadd);
@@ -119,7 +119,7 @@ UNROLL_F64_TEMPLATE(subtract_f64,fsub);
 #undef UNROLL_F32_TEMPLATE
 #undef UNROLL_F64_TEMPLATE
 
-/* 
+/*
 **
 ** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
 ** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
@@ -133,13 +133,13 @@ UNROLL_F64_TEMPLATE(subtract_f64,fsub);
 .global vfp_ ## fname ## ; \
 vfp_ ## fname ## : \
 stmdb sp!, {fp, lr}; /* save registers to stack */ \
- fldmias r2, {s1}; /* load scalar value */ \
+ vldmia.f32 r2, {s1}; /* load scalar value */ \
 ands ip, r3, #7; /* ip = n % 8 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmias r1!, {s0}; \
+ vldmia.f32 r1!, {s0}; \
 ## finst ##s s2, s0, s1; \
- fstmias r0!, {s2}; \
+ vstmia.f32 r0!, {s2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 8 */ \
@@ -150,26 +150,26 @@ UNROLL_F64_TEMPLATE(subtract_f64,fsub);
 orr fp, lr, fp, lsl #16; /* set vector lenght to 8 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+ vldmia.f32 r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
 ## finst ##s s24, s8, s1; \
- fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+ vstmia.f32 r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
 #define UNROLL_F64_TEMPLATE(fname,finst) \
 .global vfp_ ## fname ## ; \
 vfp_ ## fname ## : \
 stmdb sp!, {fp, lr}; /* save registers to stack */ \
- fldmiad r2, {d1}; /* load scalar value */ \
+ vldmia.f64 r2, {d1}; /* load scalar value */ \
 ands ip, r3, #3; /* ip = n % 3 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmiad r1!, {d0}; \
+ vldmia.f64 r1!, {d0}; \
 ## finst ##d d2, d0, d1; \
- fstmiad r0!, {d2}; \
+ vstmia.f64 r0!, {d2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 4 */ \
@@ -180,14 +180,14 @@ UNROLL_F64_TEMPLATE(subtract_f64,fsub);
 orr fp, lr, fp, lsl #16; /* set vector lenght to 4 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmiad r1!, {d4, d5, d6, d7}; \
+ vldmia.f64 r1!, {d4, d5, d6, d7}; \
 ## finst ##d d12, d4, d1; \
- fstmiad r0!, {d12, d13, d14, d15}; \
+ vstmia.f64 r0!, {d12, d13, d14, d15}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
 UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
 UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);
@@ -198,7 +198,7 @@ UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
 #undef UNROLL_F32_TEMPLATE
 #undef UNROLL_F64_TEMPLATE
 
-/* 
+/*
 **
 ** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
 ** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
@@ -215,9 +215,9 @@ UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
 ands ip, r2, #7; /* ip = n % 8 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmias r1!, {s0}; \
- ## finst ##s s2, s0; \
- fstmias r0!, {s2}; \
+ vldmia.f32 r1!, {s0}; \
+ ## finst ##.f32 s2, s0; \
+ vstmia.f32 r0!, {s2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 8 */ \
@@ -228,14 +228,14 @@ UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
 orr fp, lr, fp, lsl #16; /* set vector lenght to 8 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
- ## finst ##s s24, s8; \
- fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
+ vldmia.f32 r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
+ ## finst ##.f32 s24, s8; \
+ vstmia.f32 r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
 #define UNROLL_F64_TEMPLATE(fname,finst) \
 .global vfp_ ## fname ## ; \
@@ -244,9 +244,9 @@ UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
 ands ip, r2, #3; /* ip = n % 3 */ \
 beq vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */ \
 vfp_ ## fname ## _loop1: \
- fldmiad r1!, {d0}; \
- ## finst ##d d2, d0; \
- fstmiad r0!, {d2}; \
+ vldmia.f64 r1!, {d0}; \
+ ## finst ##.f64 d2, d0; \
+ vstmia.f64 r0!, {d2}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop1; \
 vfp_ ## fname ## _unroll: /* unroll by 4 */ \
@@ -257,20 +257,20 @@ UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);
 orr fp, lr, fp, lsl #16; /* set vector lenght to 4 */ \
 fmxr fpscr, fp; \
 vfp_ ## fname ## _loop2: \
- fldmiad r1!, {d4, d5, d6, d7}; \
- ## finst ##d d12, d4; \
- fstmiad r0!, {d12, d13, d14, d15}; \
+ vldmia.f64 r1!, {d4, d5, d6, d7}; \
+ ## finst ##.f64 d12, d4; \
+ vstmia.f64 r0!, {d12, d13, d14, d15}; \
 subs ip, ip, #1; \
 bne vfp_ ## fname ## _loop2; \
 fmxr fpscr, lr; /* restore original fpscr */ \
 vfp_ ## fname ## _end: \
- ldmia sp!, {fp, pc}; /* recovering from stack and return */ 
+ ldmia sp!, {fp, pc}; /* recovering from stack and return */
 
-UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
-UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);
+UNROLL_F32_TEMPLATE(abs_f32_f32_ns,vabs);
+UNROLL_F64_TEMPLATE(abs_f64_f64_ns,vabs);
 
-UNROLL_F32_TEMPLATE(negative_f32,fneg);
-UNROLL_F64_TEMPLATE(negative_f64,fneg);
+UNROLL_F32_TEMPLATE(negative_f32,vneg);
+UNROLL_F64_TEMPLATE(negative_f64,vneg);
 
 #undef UNROLL_F32_TEMPLATE
 #undef UNROLL_F64_TEMPLATE