/*
Part of the "copies and fills" library by Simon Hall.
The inner loop of the misaligned path is derived from the GNU libc ARM port;
the rest is my own work.
This code is licensed under the GNU Lesser General Public License version 2.1.

void *memcpy(void *dest,
	const void *source,
	size_t count);
*/
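/* For reference, the C-level contract this routine implements is the standard
   memcpy: copy count bytes from source to dest and return dest, which is why
   the code below preserves r0 across the copy. A minimal sketch, added here
   for clarity only (the memcpy_ref name is illustrative, not part of the
   original build):

	#include <stddef.h>

	void *memcpy_ref(void *dest, const void *source, size_t count)
	{
		unsigned char *d = dest;
		const unsigned char *s = source;
		while (count--)
			*d++ = *s++;
		return dest;	// the assembly keeps r0 for this return value
	}
*/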
.global memcpy
.func memcpy
memcpy:
	cmp r2, #0
	pld [r1]
	bxeq lr			/* get straight out on zero, NB: count is unsigned */

	cmp r2, #4		/* basic copy for four bytes */
	pld [r1, #32]
	ldreq r3, [r1]		/* we're relying on the cpu misalignment support here */
	streq r3, [r0]
	bxeq lr

	cmp r2, #8
	pld [r1, #64]		/* basic copy for eight bytes, with fall through for < 8 */
	ldreq r3, [r1]		/* can't use ldrd without checking alignment...can't trust os alignment handling */
	streq r3, [r0]		/* if we do trust the os then r2 is free for ldrd */
	ldreq r3, [r1, #4]
	streq r3, [r0, #4]
	bxeq lr

	cmp r2, #32
	pld [r1, #96]
	blt byte_at_a_time_no_pld	/* fast path for small sizes, no stack push */

	push {r0, r4-r11}	/* memcpy returns the original destination, hence push r0 */

	/* compute the dest pointer alignment */
	.if 0
	and r3, r0, #3		/* slightly slower compared to conditional version below */
	cmp r3, #3		/* three bytes misaligned, one to do */
	beq head_1
	cmp r3, #2
	beq head_2		/* two bytes misaligned, two to do */
	cmp r3, #1
	beq head_3		/* one byte misaligned, three to do */
	.else
	ands r3, r0, #3
	beq skip_byte_realignment

	rsb r4, r3, #4		/* how many bytes need to be read */
	cmp r4, #2

	ldrgtb r5, [r1], #1	/* three bytes */
	ldrgeb r6, [r1], #1	/* two+ bytes */
	ldrb r7, [r1], #1	/* one+ byte */

	strgtb r5, [r0], #1
	strgeb r6, [r0], #1
	strb r7, [r0], #1

	sub r2, r4
skip_byte_realignment:
	.endif

	.if 0
	eor r3, r0, r1		/* check the 4b alignment of the two pointers */
	tst r3, #3		/* ideally the bottom two bits should line up */
	.else
	ands r3, r1, #3
	.endif
	bne misaligned
	/* dest pointer now 4b aligned */
	/* let's try and 32b align the destination */
	tst r0, #31
	beq pre_fast_loop

align_up:
	.if 1
	ldr r3, [r1], #4
	add r0, #4
	sub r2, #4
	tst r0, #31		/* do it early for the next run */
	str r3, [r0, #-4]
	bne align_up
	.else
	and r3, r0, #31		/* jump based on the number of bytes to do - slower than loop above */
	add pc, pc, r3
	nop; nop
	ldr r4, [r1], #4
	ldr r5, [r1], #4
	ldr r6, [r1], #4
	ldr r7, [r1], #4
	ldr r8, [r1], #4
	ldr r9, [r1], #4
	ldr r10, [r1], #4
	add pc, pc, r3
	nop; nop
	str r4, [r0], #4
	str r5, [r0], #4
	str r6, [r0], #4
	str r7, [r0], #4
	str r8, [r0], #4
	str r9, [r0], #4
	str r10, [r0], #4
	rsb r3, #32
	sub r2, r3
	.endif
pre_fast_loop:
	/* round byte count down to nearest 32 */
	bics r3, r2, #31
	/* compute the spare */
	and r2, #31
	beq post_fast_loop	/* nothing to do in the main loop */

	/* work through 32b at a time */
fast_loop:
	.if 0
	ldmia r1!, {r4-r11}	/* original version */
	subs r3, #32
	stmia r0!, {r4-r11}
	pld [r1, #128]
	bne fast_loop
	.else
	ldmia r1!, {r4-r7}	/* slightly faster version suggested by tufty */
	ldmia r1!, {r8-r11}
	stmia r0!, {r4-r7}
	pld [r1, #128]
	subs r3, #32
	stmia r0!, {r8-r11}
	bne fast_loop
	.endif
	/* handle the spare bytes, up to 32 of them */
post_fast_loop:
	cmp r2, #0		/* there might be none */
	beq full_out

	bics r3, r2, #3
	and r2, #3
	beq tail_fast_loop_byte

tail_fast_loop:
	ldr r4, [r1], #4
	subs r3, #4
	str r4, [r0], #4
	bne tail_fast_loop

	cmp r2, #0
	beq full_out

tail_fast_loop_byte:
	subs r2, #1
	ldrb r3, [r1], #1
	strb r3, [r0], #1
	bne tail_fast_loop_byte

full_out:
	pop {r0, r4-r11}
	bx lr

byte_at_a_time_no_pld:
	subs r2, #1
	ldrb r3, [r1, r2]	/* one byte at a time, so we don't have to check for odd */
	strb r3, [r0, r2]	/* sizes and alignments etc; also no stack push necessary */
	bne byte_at_a_time_no_pld
	bx lr			/* leaving r0 intact */
/*head_3:
	ldrb r3, [r1], #1
	strb r3, [r0], #1
	sub r2, #1
head_2:
	ldrb r3, [r1], #1
	strb r3, [r0], #1
	sub r2, #1
head_1:
	ldrb r3, [r1], #1
	strb r3, [r0], #1
	sub r2, #1
	b pre_fast_loop
*/
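/* The misaligned loops below merge consecutive aligned source words with
   shift-and-or sequences. As a sketch (assuming little-endian byte order,
   misalignment k of 1, 2 or 3 bytes), each output word is built as

	out = (prev >> (8 * k)) | (next << (8 * (4 - k)));

   which is what the lsr/orr pairs implement, with r11 carrying "prev"
   across loop iterations. */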
misaligned:
	bic r1, #3		/* align down r1, with r3 containing the r1 misalignment */
	cmp r3, #2
	ldr r11, [r1], #4
	beq misaligned_2
	bgt misaligned_3

misaligned_1:
	cmp r2, #32
	blt post_misalignment_1

mis_1_loop:
	lsr r3, r11, #8		/* we want the high three bytes of this */
	ldmia r1!, {r4-r11}

	sub r2, #32
	cmp r2, #32

	orr r3, r4, lsl #24
	lsr r4, #8; orr r4, r5, lsl #24
	lsr r5, #8; orr r5, r6, lsl #24
	lsr r6, #8; orr r6, r7, lsl #24
	lsr r7, #8; orr r7, r8, lsl #24
	lsr r8, #8; orr r8, r9, lsl #24
	lsr r9, #8; orr r9, r10, lsl #24
	lsr r10, #8; orr r10, r11, lsl #24

	pld [r1, #128]
	stmia r0!, {r3-r10}
	bge mis_1_loop

post_misalignment_1:
	cmp r2, #0
	beq full_out

	lsr r11, #8
	mov r3, #3

post_misalignment_1_loop:
	cmp r3, #0
	ldreq r11, [r1], #4
	moveq r3, #4

	strb r11, [r0], #1

	sub r3, #1
	subs r2, #1
	lsr r11, #8
	bne post_misalignment_1_loop

	b full_out
misaligned_2:
	cmp r2, #32
	blt post_misalignment_2

mis_2_loop:
	lsr r3, r11, #16	/* we want the high two bytes of this */
	ldmia r1!, {r4-r11}

	sub r2, #32
	cmp r2, #32

	orr r3, r4, lsl #16
	lsr r4, #16; orr r4, r5, lsl #16
	lsr r5, #16; orr r5, r6, lsl #16
	lsr r6, #16; orr r6, r7, lsl #16
	lsr r7, #16; orr r7, r8, lsl #16
	lsr r8, #16; orr r8, r9, lsl #16
	lsr r9, #16; orr r9, r10, lsl #16
	lsr r10, #16; orr r10, r11, lsl #16

	pld [r1, #128]
	stmia r0!, {r3-r10}
	bge mis_2_loop

post_misalignment_2:
	cmp r2, #0
	beq full_out

	lsr r11, #16
	mov r3, #2

post_misalignment_2_loop:
	cmp r3, #0
	ldreq r11, [r1], #4
	moveq r3, #4

	strb r11, [r0], #1

	sub r3, #1
	subs r2, #1
	lsr r11, #8
	bne post_misalignment_2_loop

	b full_out
misaligned_3:
	cmp r2, #32
	blt post_misalignment_3

mis_3_loop:
	lsr r3, r11, #24	/* we want the high byte of this */
	ldmia r1!, {r4-r11}

	sub r2, #32
	cmp r2, #32

	orr r3, r4, lsl #8
	lsr r4, #24; orr r4, r5, lsl #8
	lsr r5, #24; orr r5, r6, lsl #8
	lsr r6, #24; orr r6, r7, lsl #8
	lsr r7, #24; orr r7, r8, lsl #8
	lsr r8, #24; orr r8, r9, lsl #8
	lsr r9, #24; orr r9, r10, lsl #8
	lsr r10, #24; orr r10, r11, lsl #8

	pld [r1, #128]
	stmia r0!, {r3-r10}
	bge mis_3_loop

post_misalignment_3:
	cmp r2, #0
	beq full_out

	lsr r11, #24
	mov r3, #1

post_misalignment_3_loop:
	cmp r3, #0
	ldreq r11, [r1], #4
	moveq r3, #4

	strb r11, [r0], #1

	sub r3, #1
	subs r2, #1
	lsr r11, #8
	bne post_misalignment_3_loop

	b full_out
.endfunc
/* Raj: added this to mark the stack as non-executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
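/* Build note (an assumption, not stated elsewhere in this file): the capital
   .S extension and the __linux__/__ELF__ guard above suggest the file is
   meant to be assembled via the C preprocessor, e.g. with something like
   "gcc -c memcpy.S" on an ARM toolchain, so that the guard is evaluated. */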