Add Neon versions of memset32 and memset16

Patch by pgalizia (of codeaurora.org)

(Note: I don't read ARM and I didn't manage to find a reviewer for the
ARM assembly code so this is landing somewhat unreviewed.)

http://codereview.appspot.com/1157045/show

git-svn-id: http://skia.googlecode.com/svn/trunk@573 2bbb7eff-a529-9590-31e7-b0007b416f81
This commit is contained in:
agl@chromium.org 2010-06-04 14:47:38 +00:00
parent 268013bfa6
commit aab4090b57
3 changed files with 319 additions and 0 deletions

152
src/opts/memset16_neon.S Normal file
View File

@ -0,0 +1,152 @@
/***************************************************************************
Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you
may not use this file except in compliance with the License. You may
obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
***************************************************************************/
/***************************************************************************
  memset16_neon: fill a buffer with a 16-bit value, using NEON stores for
  the bulk of the work.

  C prototype (as declared by the C++ caller):
      void memset16_neon(uint16_t dst[], uint16_t value, int count);

  Inputs (ARM / A32 instruction set):
      r0 = dst    destination buffer, expected to be 2-byte aligned
      r1 = value  16-bit pattern; the upper 16 bits of r1 are expected to
                  be zero on entry, because the value is replicated with
                  an ORR of r1 with itself shifted left by 16
      r2 = count  number of 16-bit elements to write (treated as signed;
                  count <= 0 writes nothing)
  Outputs:
      r0 = dst (saved on entry and restored before returning)
  Clobbers:
      r1, r2, r12, q0, q1, condition flags
 ***************************************************************************/
        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        .func memset16_neon
memset16_neon:
        /*
         * Reject empty and negative counts up front. A negative count would
         * otherwise reach the small loop below, whose exit test only fires
         * when the counter hits exactly zero.
         */
        cmp     r2, #0
        bxle    lr

        /* r2 arrives as a count of 16-bit items; work in bytes from here. */
        lsl     r2, r2, #1
        push    {r0}

        /* 8 bytes or fewer: a plain halfword loop is enough. */
        cmp     r2, #8
        bgt     .Lm16_large

.Lm16_small_loop:
        strh    r1, [r0], #2
        subs    r2, r2, #2
        bne     .Lm16_small_loop
        pop     {r0}
        bx      lr

.Lm16_large:
        /*
         * Replicate the low 16 bits of r1 into its upper 16 bits, so one
         * 32-bit register carries two copies of the value.
         */
        orr     r1, r1, r1, lsl #16

        /*
         * For 64 bytes or more, first advance dst to a 16-byte boundary
         * so the bulk NEON stores run on aligned addresses.
         */
        cmp     r2, #64
        blt     .Lm16_route
        ands    r12, r0, #0xf
        beq     .Lm16_route

        /*
         * r12 = bytes needed to reach the boundary. For a 2-byte aligned
         * dst this is an even number in 2..14. The pieces are stored
         * smallest first (2, then 4, then 8 bytes) so that each store lands
         * on an address aligned to its own size; dst is only guaranteed to
         * be halfword aligned at this point, not word aligned.
         */
        rsb     r12, r12, #16
        sub     r2, r2, r12
        tst     r12, #2
        strneh  r1, [r0], #2
        tst     r12, #4
        strne   r1, [r0], #4
        tst     r12, #8
        strne   r1, [r0], #4
        strne   r1, [r0], #4

.Lm16_route:
        /*
         * Pick the widest store loop the remaining size allows. d0 is
         * always built; d1 (completing q0) and q1 are only built on the
         * paths that use them.
         */
        vdup.u32 d0, r1
        cmp     r2, #16
        blt     .Lm16_tail8
        vmov    d1, d0
        cmp     r2, #64
        blt     .Lm16_by16
        vmov    q1, q0
        cmp     r2, #128
        blt     .Lm16_by32

        /* 128 bytes per iteration. */
        mov     r12, r2, lsr #7
.Lm16_by128_loop:
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        subs    r12, r12, #1
        bne     .Lm16_by128_loop
        ands    r2, r2, #0x7f
        beq     .Lm16_done

        /* 32 bytes per iteration. */
.Lm16_by32:
        movs    r12, r2, lsr #5
        beq     .Lm16_by16
.Lm16_by32_loop:
        subs    r12, r12, #1
        vst1.64 {q0, q1}, [r0]!
        bne     .Lm16_by32_loop
        ands    r2, r2, #0x1f
        beq     .Lm16_done

        /* 16 bytes per iteration. */
.Lm16_by16:
        movs    r12, r2, lsr #4
        beq     .Lm16_tail8
.Lm16_by16_loop:
        subs    r12, r12, #1
        vst1.32 {q0}, [r0]!
        bne     .Lm16_by16_loop
        ands    r2, r2, #0xf
        beq     .Lm16_done

        /*
         * Fewer than 16 bytes remain. These are straight-line steps, not
         * loops: at most one 8-byte, one 4-byte and one 2-byte store.
         * NOTE(review): when the total size was below 64 bytes no alignment
         * step ran, so these stores (and the NEON stores above) can land on
         * addresses that are only 2-byte aligned.
         */
.Lm16_tail8:
        cmp     r2, #8
        blt     .Lm16_tail4
        subs    r2, r2, #8
        vst1.32 {d0}, [r0]!
.Lm16_tail4:
        cmp     r2, #4
        blt     .Lm16_tail2
        subs    r2, r2, #4
        str     r1, [r0], #4
.Lm16_tail2:
        cmp     r2, #0
        ble     .Lm16_done
        strh    r1, [r0], #2
.Lm16_done:
        pop     {r0}
        bx      lr
        .endfunc
        .end

122
src/opts/memset32_neon.S Normal file
View File

@ -0,0 +1,122 @@
/***************************************************************************
Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you
may not use this file except in compliance with the License. You may
obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
***************************************************************************/
/***************************************************************************
  memset32_neon: fill a buffer with a 32-bit value, using NEON stores for
  the bulk of the work.

  C prototype (as declared by the C++ caller):
      void memset32_neon(uint32_t dst[], uint32_t value, int count);

  Inputs (ARM / A32 instruction set):
      r0 = dst    destination buffer
      r1 = value  32-bit pattern to store
      r2 = count  number of 32-bit words to write (treated as signed;
                  count <= 0 writes nothing)
  Outputs:
      none; r0 is advanced while writing and is not restored
  Clobbers:
      r0, r2, r12, q0, q1, condition flags
 ***************************************************************************/
        .code 32
        .fpu neon
        .align 4
        .globl memset32_neon
        .func memset32_neon
memset32_neon:
        /* Fast exit for the single-word case. */
        cmp     r2, #1
        streq   r1, [r0], #4
        bxeq    lr

        cmp     r2, #4
        bgt     .Lm32_start

        /*
         * Here count is 0, 2, 3, 4 or negative. Return for zero and for
         * negative counts; the loop below only stops when the counter
         * reaches exactly zero.
         */
        cmp     r2, #0
        bxle    lr
.Lm32_small:
        str     r1, [r0], #4
        subs    r2, r2, #1
        bne     .Lm32_small
        bx      lr

.Lm32_start:
        /* 5..15 words: go straight to the computed-jump tail. */
        cmp     r2, #16
        blt     .Lm32_tail

        /* q0:q1 = eight copies of the value (32 bytes per store). */
        vdup.32 q0, r1
        vmov    q1, q0
        cmp     r2, #32
        blt     .Lm32_by16
        cmp     r2, #64
        blt     .Lm32_by32
        cmp     r2, #128
        blt     .Lm32_by64

        /* 128 words (512 bytes) per iteration: 16 stores of 32 bytes. */
        movs    r12, r2, lsr #7
.Lm32_by128_loop:
        subs    r12, r12, #1            @ NEON stores leave the flags alone
        .rept   16
        vst1.64 {q0, q1}, [r0]!
        .endr
        bne     .Lm32_by128_loop
        ands    r2, r2, #0x7f
        bxeq    lr

        /* One block of 64 words (8 stores), if at least 64 remain. */
.Lm32_by64:
        movs    r12, r2, lsr #6
        beq     .Lm32_by32
        .rept   8
        vst1.64 {q0, q1}, [r0]!
        .endr
        ands    r2, r2, #0x3f
        bxeq    lr

        /* One block of 32 words (4 stores), if at least 32 remain. */
.Lm32_by32:
        movs    r12, r2, lsr #5
        beq     .Lm32_by16
        .rept   4
        vst1.64 {q0, q1}, [r0]!
        .endr
        ands    r2, r2, #0x1f
        bxeq    lr

        /* One block of 16 words (2 stores), if at least 16 remain. */
.Lm32_by16:
        movs    r12, r2, lsr #4
        beq     .Lm32_tail
        and     r2, r2, #0xf
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!

        /*
         * 0..15 words remain in r2. Jump into the store ladder below so
         * that exactly r2 stores execute. In ARM state pc reads as the
         * address of the current instruction plus 8, which is the first
         * str after the nop; each skipped str is 4 bytes, so the branch
         * offset is (15 - r2) * 4. With r2 == 0 the jump lands on bx lr.
         */
.Lm32_tail:
        rsb     r2, r2, #15
        add     pc, pc, r2, lsl #2
        nop
        str     r1, [r0, #56]
        str     r1, [r0, #52]
        str     r1, [r0, #48]
        str     r1, [r0, #44]
        str     r1, [r0, #40]
        str     r1, [r0, #36]
        str     r1, [r0, #32]
        str     r1, [r0, #28]
        str     r1, [r0, #24]
        str     r1, [r0, #20]
        str     r1, [r0, #16]
        str     r1, [r0, #12]
        str     r1, [r0, #8]
        str     r1, [r0, #4]
        str     r1, [r0, #0]
        bx      lr
        .endfunc
        .end

View File

@ -0,0 +1,45 @@
/***************************************************************************
Copyright (c) 2010, Code Aurora Forum. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you
may not use this file except in compliance with the License. You may
obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied. See the License for the specific language governing
permissions and limitations under the License.
***************************************************************************/
#include "SkUtils.h"
// NEON fill routines implemented in assembly (src/opts/memset16_neon.S and
// src/opts/memset32_neon.S). 'count' is a number of elements, not bytes.
extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
// Compile-time check: true only when the build defines both __ARM_HAVE_NEON
// and SK_CPU_LENDIAN. No runtime CPU detection is performed here.
static inline bool hasNeonRegisters() {
#if !defined(__ARM_HAVE_NEON) || !defined(SK_CPU_LENDIAN)
    return false;
#else
    return true;
#endif
}
// Returns the NEON 16-bit fill routine when hasNeonRegisters() is true,
// otherwise NULL.
SkMemset16Proc SkMemset16GetPlatformProc() {
    if (!hasNeonRegisters()) {
        return NULL;
    }
    return memset16_neon;
}
// Returns the NEON 32-bit fill routine when hasNeonRegisters() is true,
// otherwise NULL.
SkMemset32Proc SkMemset32GetPlatformProc() {
    if (!hasNeonRegisters()) {
        return NULL;
    }
    return memset32_neon;
}