SkJumper: Windows
- Compile stages with -DWIN to pick up MS-specific start_pipeline(). - Add SkJumper_generated_win.S with MS-specific assembly. - Add a minimal asm tool to our GN Windows toolchain. The SkRasterPipeline_f16 benchmark runs ~4x faster on my desktop. Change-Id: Ia45afb4ecb6a055e2c0e43f0f54f59e081c23b7f Reviewed-on: https://skia-review.googlesource.com/8778 Reviewed-by: Mike Klein <mtklein@chromium.org> Commit-Queue: Mike Klein <mtklein@chromium.org>
This commit is contained in:
parent
497890630b
commit
ed1b9022b3
8
BUILD.gn
8
BUILD.gn
@ -29,7 +29,7 @@ declare_args() {
|
||||
skia_enable_android_framework_defines = false
|
||||
skia_enable_discrete_gpu = true
|
||||
skia_enable_effects = true
|
||||
skia_enable_jumper = is_skia_standalone && sanitize != "MSAN" && !is_win
|
||||
skia_enable_jumper = is_skia_standalone && sanitize != "MSAN"
|
||||
skia_enable_gpu = true
|
||||
skia_enable_pdf = true
|
||||
skia_enable_tools = is_skia_standalone
|
||||
@ -498,9 +498,13 @@ optional("jumper") {
|
||||
public_defines = [ "SK_JUMPER" ]
|
||||
sources = [
|
||||
"src/jumper/SkJumper.cpp",
|
||||
"src/jumper/SkJumper_generated.S",
|
||||
"src/jumper/SkJumper_stages.cpp",
|
||||
]
|
||||
if (is_win && target_cpu == "x64") {
|
||||
sources += [ "src/jumper/SkJumper_generated_win.S" ]
|
||||
} else if (!is_win) {
|
||||
sources += [ "src/jumper/SkJumper_generated.S" ]
|
||||
}
|
||||
}
|
||||
|
||||
optional("typeface_freetype") {
|
||||
|
@ -441,6 +441,14 @@ toolchain("msvc") {
|
||||
env_setup = "cmd /c $windk/win_sdk/bin/SetEnv.cmd /x86 && "
|
||||
}
|
||||
|
||||
# Assembler tool for the MSVC toolchain: runs ml64.exe on each assembly source
# and produces one .obj per source file.
tool("asm") {
|
||||
# $env_setup is prepended so the assembler runs inside the SDK environment when
# one is configured. /c assembles without linking; /Fo names the output object.
command = "$env_setup$bin/ml64.exe /nologo /c /Fo {{output}} {{source}}"
|
||||
outputs = [
|
||||
# The object name embeds both the target name and the source name part.
"{{source_out_dir}}/{{target_output_name}}.{{source_name_part}}.obj",
|
||||
]
|
||||
description = "assemble {{source}}"
|
||||
}
|
||||
|
||||
tool("cc") {
|
||||
rspfile = "{{output}}.rsp"
|
||||
precompiled_header_type = "msvc"
|
||||
|
@ -56,7 +56,7 @@ static K kConstants = {
|
||||
using StageFn = void(void);
|
||||
|
||||
// Some platforms expect C "name" maps to asm "_name", others to "name".
|
||||
#if defined(_MSC_VER) || defined(__APPLE__)
|
||||
#if defined(__APPLE__)
|
||||
#define ASM(name, suffix) sk_##name##_##suffix
|
||||
#else
|
||||
#define ASM(name, suffix) _sk_##name##_##suffix
|
||||
|
@ -1163,50 +1163,6 @@ _sk_start_pipeline_hsw:
|
||||
.byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
|
||||
// _sk_start_pipeline_ms_hsw
// Entry point that receives its three arguments in the Microsoft x64 argument
// registers (rcx, rdx, r8) and hands them to the first stage in rdi, rsi, rdx.
//   1. Saves rsi, rdi and xmm6-xmm15, which are callee-saved in the Microsoft
//      x64 convention, before the stages are allowed to overwrite them.
//   2. Copies rdx into rsi and uses lods to fetch the first 8-byte entry into rax
//      (lods also advances rsi by 8, assuming the direction flag is clear).
//   3. Zeroes ymm0-ymm7, moves rcx -> rdi and r8 -> rdx, then calls *rax.
//   4. Restores the saved registers, executes vzeroupper, and returns.
// All instructions are emitted as raw bytes; the trailing comments are the
// AT&T-syntax disassembly of those bytes.
.globl _sk_start_pipeline_ms_hsw
|
||||
_sk_start_pipeline_ms_hsw:
|
||||
.byte 0x56 // push %rsi
|
||||
.byte 0x57 // push %rdi
|
||||
// 0xa8 = 10 x 16 bytes of xmm save slots plus 8 bytes of padding. Assuming the
// usual rsp % 16 == 8 at function entry, the two pushes plus this adjustment
// leave rsp 16-byte aligned, which the aligned vmovaps stores below require.
.byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 // sub $0xa8,%rsp
|
||||
.byte 0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00 // vmovaps %xmm15,0x90(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00 // vmovaps %xmm14,0x80(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x6c,0x24,0x70 // vmovaps %xmm13,0x70(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x64,0x24,0x60 // vmovaps %xmm12,0x60(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x5c,0x24,0x50 // vmovaps %xmm11,0x50(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x54,0x24,0x40 // vmovaps %xmm10,0x40(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x4c,0x24,0x30 // vmovaps %xmm9,0x30(%rsp)
|
||||
.byte 0xc5,0x78,0x29,0x44,0x24,0x20 // vmovaps %xmm8,0x20(%rsp)
|
||||
.byte 0xc5,0xf8,0x29,0x7c,0x24,0x10 // vmovaps %xmm7,0x10(%rsp)
|
||||
.byte 0xc5,0xf8,0x29,0x34,0x24 // vmovaps %xmm6,(%rsp)
|
||||
// rdx (second incoming argument) must be copied out here, before rdx is
// overwritten with r8 further down.
.byte 0x48,0x89,0xd6 // mov %rdx,%rsi
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0xc5,0xfc,0x57,0xc0 // vxorps %ymm0,%ymm0,%ymm0
|
||||
.byte 0xc5,0xf4,0x57,0xc9 // vxorps %ymm1,%ymm1,%ymm1
|
||||
.byte 0xc5,0xec,0x57,0xd2 // vxorps %ymm2,%ymm2,%ymm2
|
||||
.byte 0xc5,0xe4,0x57,0xdb // vxorps %ymm3,%ymm3,%ymm3
|
||||
.byte 0xc5,0xdc,0x57,0xe4 // vxorps %ymm4,%ymm4,%ymm4
|
||||
.byte 0xc5,0xd4,0x57,0xed // vxorps %ymm5,%ymm5,%ymm5
|
||||
.byte 0xc5,0xcc,0x57,0xf6 // vxorps %ymm6,%ymm6,%ymm6
|
||||
.byte 0xc5,0xc4,0x57,0xff // vxorps %ymm7,%ymm7,%ymm7
|
||||
.byte 0x48,0x89,0xcf // mov %rcx,%rdi
|
||||
.byte 0x4c,0x89,0xc2 // mov %r8,%rdx
|
||||
// A call (not a jump) so control comes back here to restore the saved registers.
.byte 0xff,0xd0 // callq *%rax
|
||||
.byte 0xc5,0xf8,0x28,0x34,0x24 // vmovaps (%rsp),%xmm6
|
||||
.byte 0xc5,0xf8,0x28,0x7c,0x24,0x10 // vmovaps 0x10(%rsp),%xmm7
|
||||
.byte 0xc5,0x78,0x28,0x44,0x24,0x20 // vmovaps 0x20(%rsp),%xmm8
|
||||
.byte 0xc5,0x78,0x28,0x4c,0x24,0x30 // vmovaps 0x30(%rsp),%xmm9
|
||||
.byte 0xc5,0x78,0x28,0x54,0x24,0x40 // vmovaps 0x40(%rsp),%xmm10
|
||||
.byte 0xc5,0x78,0x28,0x5c,0x24,0x50 // vmovaps 0x50(%rsp),%xmm11
|
||||
.byte 0xc5,0x78,0x28,0x64,0x24,0x60 // vmovaps 0x60(%rsp),%xmm12
|
||||
.byte 0xc5,0x78,0x28,0x6c,0x24,0x70 // vmovaps 0x70(%rsp),%xmm13
|
||||
.byte 0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00 // vmovaps 0x80(%rsp),%xmm14
|
||||
.byte 0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00 // vmovaps 0x90(%rsp),%xmm15
|
||||
.byte 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00 // add $0xa8,%rsp
|
||||
// Pops mirror the pushes at the top in reverse order.
.byte 0x5f // pop %rdi
|
||||
.byte 0x5e // pop %rsi
|
||||
.byte 0xc5,0xf8,0x77 // vzeroupper
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_hsw
|
||||
_sk_just_return_hsw:
|
||||
.byte 0xc5,0xf8,0x77 // vzeroupper
|
||||
@ -1695,49 +1651,6 @@ _sk_start_pipeline_sse41:
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
|
||||
// _sk_start_pipeline_ms_sse41
// Entry point that receives its three arguments in the Microsoft x64 argument
// registers (rcx, rdx, r8) and hands them to the first stage in rdi, rsi, rdx.
//   1. Saves rsi, rdi and xmm6-xmm15, which are callee-saved in the Microsoft
//      x64 convention, before the stages are allowed to overwrite them.
//   2. Copies rdx into rsi and uses lods to fetch the first 8-byte entry into rax
//      (lods also advances rsi by 8, assuming the direction flag is clear).
//   3. Zeroes xmm0-xmm7, moves rcx -> rdi and r8 -> rdx, then calls *rax.
//   4. Restores the saved registers and returns.
// All instructions are emitted as raw bytes; the trailing comments are the
// AT&T-syntax disassembly of those bytes.
.globl _sk_start_pipeline_ms_sse41
|
||||
_sk_start_pipeline_ms_sse41:
|
||||
.byte 0x56 // push %rsi
|
||||
.byte 0x57 // push %rdi
|
||||
// 0xa8 = 10 x 16 bytes of xmm save slots plus 8 bytes of padding. Assuming the
// usual rsp % 16 == 8 at function entry, the two pushes plus this adjustment
// leave rsp 16-byte aligned, which the aligned movaps stores below require.
.byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 // sub $0xa8,%rsp
|
||||
.byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00 // movaps %xmm15,0x90(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00 // movaps %xmm14,0x80(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x6c,0x24,0x70 // movaps %xmm13,0x70(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x64,0x24,0x60 // movaps %xmm12,0x60(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x5c,0x24,0x50 // movaps %xmm11,0x50(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 // movaps %xmm10,0x40(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 // movaps %xmm9,0x30(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 // movaps %xmm8,0x20(%rsp)
|
||||
.byte 0x0f,0x29,0x7c,0x24,0x10 // movaps %xmm7,0x10(%rsp)
|
||||
.byte 0x0f,0x29,0x34,0x24 // movaps %xmm6,(%rsp)
|
||||
// rdx (second incoming argument) must be copied out here, before rdx is
// overwritten with r8 further down.
.byte 0x48,0x89,0xd6 // mov %rdx,%rsi
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
|
||||
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
|
||||
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
|
||||
.byte 0x0f,0x57,0xdb // xorps %xmm3,%xmm3
|
||||
.byte 0x0f,0x57,0xe4 // xorps %xmm4,%xmm4
|
||||
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
|
||||
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0x48,0x89,0xcf // mov %rcx,%rdi
|
||||
.byte 0x4c,0x89,0xc2 // mov %r8,%rdx
|
||||
// A call (not a jump) so control comes back here to restore the saved registers.
.byte 0xff,0xd0 // callq *%rax
|
||||
.byte 0x0f,0x28,0x34,0x24 // movaps (%rsp),%xmm6
|
||||
.byte 0x0f,0x28,0x7c,0x24,0x10 // movaps 0x10(%rsp),%xmm7
|
||||
.byte 0x44,0x0f,0x28,0x44,0x24,0x20 // movaps 0x20(%rsp),%xmm8
|
||||
.byte 0x44,0x0f,0x28,0x4c,0x24,0x30 // movaps 0x30(%rsp),%xmm9
|
||||
.byte 0x44,0x0f,0x28,0x54,0x24,0x40 // movaps 0x40(%rsp),%xmm10
|
||||
.byte 0x44,0x0f,0x28,0x5c,0x24,0x50 // movaps 0x50(%rsp),%xmm11
|
||||
.byte 0x44,0x0f,0x28,0x64,0x24,0x60 // movaps 0x60(%rsp),%xmm12
|
||||
.byte 0x44,0x0f,0x28,0x6c,0x24,0x70 // movaps 0x70(%rsp),%xmm13
|
||||
.byte 0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00 // movaps 0x80(%rsp),%xmm14
|
||||
.byte 0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00 // movaps 0x90(%rsp),%xmm15
|
||||
.byte 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00 // add $0xa8,%rsp
|
||||
// Pops mirror the pushes at the top in reverse order.
.byte 0x5f // pop %rdi
|
||||
.byte 0x5e // pop %rsi
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_sse41
|
||||
_sk_just_return_sse41:
|
||||
.byte 0xc3 // retq
|
||||
@ -2410,49 +2323,6 @@ _sk_start_pipeline_sse2:
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0xff,0xe0 // jmpq *%rax
|
||||
|
||||
// _sk_start_pipeline_ms_sse2
// Entry point that receives its three arguments in the Microsoft x64 argument
// registers (rcx, rdx, r8) and hands them to the first stage in rdi, rsi, rdx.
//   1. Saves rsi, rdi and xmm6-xmm15, which are callee-saved in the Microsoft
//      x64 convention, before the stages are allowed to overwrite them.
//   2. Copies rdx into rsi and uses lods to fetch the first 8-byte entry into rax
//      (lods also advances rsi by 8, assuming the direction flag is clear).
//   3. Zeroes xmm0-xmm7, moves rcx -> rdi and r8 -> rdx, then calls *rax.
//   4. Restores the saved registers and returns.
// All instructions are emitted as raw bytes; the trailing comments are the
// AT&T-syntax disassembly of those bytes.
.globl _sk_start_pipeline_ms_sse2
|
||||
_sk_start_pipeline_ms_sse2:
|
||||
.byte 0x56 // push %rsi
|
||||
.byte 0x57 // push %rdi
|
||||
// 0xa8 = 10 x 16 bytes of xmm save slots plus 8 bytes of padding. Assuming the
// usual rsp % 16 == 8 at function entry, the two pushes plus this adjustment
// leave rsp 16-byte aligned, which the aligned movaps stores below require.
.byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 // sub $0xa8,%rsp
|
||||
.byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00 // movaps %xmm15,0x90(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00 // movaps %xmm14,0x80(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x6c,0x24,0x70 // movaps %xmm13,0x70(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x64,0x24,0x60 // movaps %xmm12,0x60(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x5c,0x24,0x50 // movaps %xmm11,0x50(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 // movaps %xmm10,0x40(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 // movaps %xmm9,0x30(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 // movaps %xmm8,0x20(%rsp)
|
||||
.byte 0x0f,0x29,0x7c,0x24,0x10 // movaps %xmm7,0x10(%rsp)
|
||||
.byte 0x0f,0x29,0x34,0x24 // movaps %xmm6,(%rsp)
|
||||
// rdx (second incoming argument) must be copied out here, before rdx is
// overwritten with r8 further down.
.byte 0x48,0x89,0xd6 // mov %rdx,%rsi
|
||||
.byte 0x48,0xad // lods %ds:(%rsi),%rax
|
||||
.byte 0x0f,0x57,0xc0 // xorps %xmm0,%xmm0
|
||||
.byte 0x0f,0x57,0xc9 // xorps %xmm1,%xmm1
|
||||
.byte 0x0f,0x57,0xd2 // xorps %xmm2,%xmm2
|
||||
.byte 0x0f,0x57,0xdb // xorps %xmm3,%xmm3
|
||||
.byte 0x0f,0x57,0xe4 // xorps %xmm4,%xmm4
|
||||
.byte 0x0f,0x57,0xed // xorps %xmm5,%xmm5
|
||||
.byte 0x0f,0x57,0xf6 // xorps %xmm6,%xmm6
|
||||
.byte 0x0f,0x57,0xff // xorps %xmm7,%xmm7
|
||||
.byte 0x48,0x89,0xcf // mov %rcx,%rdi
|
||||
.byte 0x4c,0x89,0xc2 // mov %r8,%rdx
|
||||
// A call (not a jump) so control comes back here to restore the saved registers.
.byte 0xff,0xd0 // callq *%rax
|
||||
.byte 0x0f,0x28,0x34,0x24 // movaps (%rsp),%xmm6
|
||||
.byte 0x0f,0x28,0x7c,0x24,0x10 // movaps 0x10(%rsp),%xmm7
|
||||
.byte 0x44,0x0f,0x28,0x44,0x24,0x20 // movaps 0x20(%rsp),%xmm8
|
||||
.byte 0x44,0x0f,0x28,0x4c,0x24,0x30 // movaps 0x30(%rsp),%xmm9
|
||||
.byte 0x44,0x0f,0x28,0x54,0x24,0x40 // movaps 0x40(%rsp),%xmm10
|
||||
.byte 0x44,0x0f,0x28,0x5c,0x24,0x50 // movaps 0x50(%rsp),%xmm11
|
||||
.byte 0x44,0x0f,0x28,0x64,0x24,0x60 // movaps 0x60(%rsp),%xmm12
|
||||
.byte 0x44,0x0f,0x28,0x6c,0x24,0x70 // movaps 0x70(%rsp),%xmm13
|
||||
.byte 0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00 // movaps 0x80(%rsp),%xmm14
|
||||
.byte 0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00 // movaps 0x90(%rsp),%xmm15
|
||||
.byte 0x48,0x81,0xc4,0xa8,0x00,0x00,0x00 // add $0xa8,%rsp
|
||||
// Pops mirror the pushes at the top in reverse order.
.byte 0x5f // pop %rdi
|
||||
.byte 0x5e // pop %rsi
|
||||
.byte 0xc3 // retq
|
||||
|
||||
.globl _sk_just_return_sse2
|
||||
_sk_just_return_sse2:
|
||||
.byte 0xc3 // retq
|
||||
|
1944
src/jumper/SkJumper_generated_win.S
Normal file
1944
src/jumper/SkJumper_generated_win.S
Normal file
File diff suppressed because it is too large
Load Diff
@ -216,19 +216,15 @@ static void* load_and_inc(void**& program) {
|
||||
|
||||
// Some glue stages that don't fit the normal pattern of stages.
|
||||
|
||||
// start_pipeline: entry point that kicks off a chain of stages.
//   x       - passed through unchanged to the first stage.
//   program - array of pointers; the first entry is taken as the first stage to run,
//             and the (advanced) pointer is handed on to that stage.
//   k       - constants pointer, passed through unchanged.
// When both JUMPER and WIN are defined, the function is declared with the ms_abi
// attribute, so it takes its arguments using the Microsoft x64 calling convention.
#if defined(JUMPER) && defined(WIN)
|
||||
__attribute__((ms_abi))
|
||||
#endif
|
||||
extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) {
|
||||
// Fetch the first stage; load_and_inc takes program by reference and is expected
// to advance it past the entry it returns.
auto next = (Stage*)load_and_inc(program);
|
||||
F v{}; // TODO: faster uninitialized?
|
||||
// All eight vector arguments start out as the same value-initialized F.
next(x,program,k, v,v,v,v, v,v,v,v);
|
||||
}
|
||||
|
||||
// start_pipeline_ms: ms_abi-attributed wrapper, compiled only when JUMPER is defined
// on x86-64. It takes the same three arguments as start_pipeline and simply forwards
// them, giving callers that use the Microsoft x64 calling convention a way in.
#if defined(JUMPER) && defined(__x86_64__)
|
||||
__attribute__((ms_abi))
|
||||
extern "C" void WRAP(start_pipeline_ms)(size_t x, void** program, K* k) {
|
||||
WRAP(start_pipeline)(x,program,k);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
|
||||
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {
|
||||
#if defined(JUMPER) && defined(__AVX2__)
|
||||
|
@ -21,16 +21,25 @@ sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
|
||||
# Compile SkJumper_stages.cpp once per x86-64 instruction-set variant, and a second
# time per variant with -DWIN added. Each -DWIN build writes a separate win_*.o file.

# SSE2 build.
subprocess.check_call(['clang++'] + cflags + sse2 +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'sse2.o'])
|
||||
# SSE2 build with WIN defined.
subprocess.check_call(['clang++'] + cflags + sse2 + ['-DWIN'] +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'win_sse2.o'])
|
||||
|
||||
# SSE4.1 flags, followed by the plain and WIN builds.
sse41 = '-mno-red-zone -msse4.1'.split()
|
||||
subprocess.check_call(['clang++'] + cflags + sse41 +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'sse41.o'])
|
||||
subprocess.check_call(['clang++'] + cflags + sse41 + ['-DWIN'] +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'win_sse41.o'])
|
||||
|
||||
# AVX2 + FMA + F16C flags (named hsw), followed by the plain and WIN builds.
hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
|
||||
subprocess.check_call(['clang++'] + cflags + hsw +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'hsw.o'])
|
||||
subprocess.check_call(['clang++'] + cflags + hsw + ['-DWIN'] +
|
||||
['-c', 'src/jumper/SkJumper_stages.cpp'] +
|
||||
['-o', 'win_hsw.o'])
|
||||
|
||||
aarch64 = [
|
||||
'--target=aarch64-linux-android',
|
||||
@ -51,6 +60,11 @@ subprocess.check_call(['clang++'] + cflags + vfp4 +
|
||||
['-o', 'vfp4.o'])
|
||||
|
||||
def parse_object_file(dot_o, directive, target=None):
|
||||
globl, label, comment, dehex = '.globl', ':', '// ', lambda h: '0x'+h
|
||||
if 'win' in dot_o:
|
||||
globl, label, comment = 'PUBLIC', ' LABEL PROC', '; '
|
||||
dehex = lambda h: str(int(h, 16))
|
||||
|
||||
cmd = [ objdump, '-d', '--insn-width=9', dot_o]
|
||||
if target:
|
||||
cmd += ['--target', target]
|
||||
@ -65,8 +79,8 @@ def parse_object_file(dot_o, directive, target=None):
|
||||
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
|
||||
if m:
|
||||
print
|
||||
print '.globl _' + m.group(1)
|
||||
print '_' + m.group(1) + ':'
|
||||
print globl + ' _' + m.group(1)
|
||||
print '_' + m.group(1) + label
|
||||
continue
|
||||
|
||||
columns = line.split('\t')
|
||||
@ -84,10 +98,10 @@ def parse_object_file(dot_o, directive, target=None):
|
||||
for arg in args:
|
||||
assert 'rip' not in arg # TODO: detect on aarch64 too
|
||||
|
||||
hexed = ','.join('0x'+x for x in code.split(' '))
|
||||
hexed = ','.join(dehex(x) for x in code.split(' '))
|
||||
|
||||
print ' ' + directive + ' ' + hexed + ' '*(48-len(hexed)) + \
|
||||
'// ' + inst + (' '*(14-len(inst)) + args if args else '')
|
||||
comment + inst + (' '*(14-len(inst)) + args if args else '')
|
||||
|
||||
sys.stdout = open('src/jumper/SkJumper_generated.S', 'w')
|
||||
|
||||
@ -99,7 +113,6 @@ print '''# Copyright 2017 Google Inc.
|
||||
# This file is generated semi-automatically with this command:
|
||||
# $ src/jumper/build_stages.py
|
||||
'''
|
||||
|
||||
print '.text'
|
||||
|
||||
print '#if defined(__aarch64__)'
|
||||
@ -114,5 +127,20 @@ print '#elif defined(__x86_64__)'
|
||||
parse_object_file('hsw.o', '.byte')
|
||||
parse_object_file('sse41.o', '.byte')
|
||||
parse_object_file('sse2.o', '.byte')
|
||||
|
||||
print '#endif'
|
||||
|
||||
# Redirect every following print into the Windows assembly output file.
# NOTE(review): the file object previously bound to sys.stdout is replaced here
# without an explicit close() -- confirm its contents are flushed as intended.
sys.stdout = open('src/jumper/SkJumper_generated_win.S', 'w')
|
||||
|
||||
# File header; it uses ';' as the comment character in this output file.
print '''; Copyright 2017 Google Inc.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license that can be
|
||||
; found in the LICENSE file.
|
||||
|
||||
; This file is generated semi-automatically with this command:
|
||||
; $ src/jumper/build_stages.py
|
||||
'''
|
||||
# Everything between the segment opener and the closing END is emitted by
# parse_object_file, one call per win_*.o object, using the DB directive.
print '_text SEGMENT'
|
||||
parse_object_file('win_hsw.o', 'DB')
|
||||
parse_object_file('win_sse41.o', 'DB')
|
||||
parse_object_file('win_sse2.o', 'DB')
|
||||
print 'END'
|
||||
|
Loading…
Reference in New Issue
Block a user