Revert "Generate sse2/sse4.1 splices, use them."

This reverts commit 1fe55dc9fa.

Reason for revert: breaking Win GPU bots?

Original change's description:
> Generate sse2/sse4.1 splices, use them.
> 
> While we're at it, tidy up build_stages.py a bit.
> Redirecting stdout seems a lot easier than print >>f all over the place.
> 
> TODO: non-VEX-encoded before_loop() and after_loop()
> 
> CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug
> 
> Change-Id: I3f38e55f081670dd598c6050435466d9f394e5be
> Reviewed-on: https://skia-review.googlesource.com/8230
> Commit-Queue: Mike Klein <mtklein@chromium.org>
> Reviewed-by: Herb Derby <herb@google.com>
> 

TBR=mtklein@chromium.org,herb@google.com
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug

Change-Id: Iba1905c54cb2dc42a880b9e6a8093835ffd530a2
Reviewed-on: https://skia-review.googlesource.com/8347
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
This commit is contained in:
Mike Klein 2017-02-11 14:01:52 +00:00 committed by Skia Commit-Bot
parent 3c727d2386
commit f8866c5b3a
3 changed files with 219 additions and 1184 deletions

View File

@ -112,6 +112,12 @@ namespace {
splice(buf, jb_near); // jb <next 4 bytes> (b == "before", unsigned less than)
splice(buf, loop_start - (int)(buf->bytesWritten() + 4));
}
// Finish the spliced function: append the vzeroupper bytes, then the ret byte.
static void ret(SkWStream* buf) {
    // c5 f8 77 is the encoding of vzeroupper.
    static const uint8_t kVZeroUpper[] = { 0xc5, 0xf8, 0x77 };
    splice(buf, kVZeroUpper);

    // c3 is the one-byte ret.
    static const uint8_t kReturn[] = { 0xc3 };
    splice(buf, kReturn);
}
#endif
#if defined(_MSC_VER)
@ -121,16 +127,16 @@ namespace {
0x56, // push %rsi
0x57, // push %rdi
0x48,0x81,0xec,0xa8,0x00,0x00,0x00, // sub $0xa8,%rsp
0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps %xmm15,0x90(%rsp)
0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps %xmm14,0x80(%rsp)
0x44,0x0f,0x29,0x6c,0x24,0x70, // movaps %xmm13,0x70(%rsp)
0x44,0x0f,0x29,0x64,0x24,0x60, // movaps %xmm12,0x60(%rsp)
0x44,0x0f,0x29,0x5c,0x24,0x50, // movaps %xmm11,0x50(%rsp)
0x44,0x0f,0x29,0x54,0x24,0x40, // movaps %xmm10,0x40(%rsp)
0x44,0x0f,0x29,0x4c,0x24,0x30, // movaps %xmm9,0x30(%rsp)
0x44,0x0f,0x29,0x44,0x24,0x20, // movaps %xmm8,0x20(%rsp)
0x0f,0x29,0x7c,0x24,0x10, // movaps %xmm7,0x10(%rsp)
0x0f,0x29,0x34,0x24, // movaps %xmm6,(%rsp)
0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps %xmm15,0x90(%rsp)
0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps %xmm14,0x80(%rsp)
0xc5,0x78,0x29,0x6c,0x24,0x70, // vmovaps %xmm13,0x70(%rsp)
0xc5,0x78,0x29,0x64,0x24,0x60, // vmovaps %xmm12,0x60(%rsp)
0xc5,0x78,0x29,0x5c,0x24,0x50, // vmovaps %xmm11,0x50(%rsp)
0xc5,0x78,0x29,0x54,0x24,0x40, // vmovaps %xmm10,0x40(%rsp)
0xc5,0x78,0x29,0x4c,0x24,0x30, // vmovaps %xmm9,0x30(%rsp)
0xc5,0x78,0x29,0x44,0x24,0x20, // vmovaps %xmm8,0x20(%rsp)
0xc5,0xf8,0x29,0x7c,0x24,0x10, // vmovaps %xmm7,0x10(%rsp)
0xc5,0xf8,0x29,0x34,0x24, // vmovaps %xmm6,(%rsp)
0x48,0x89,0xcf, // mov %rcx,%rdi
0x48,0x89,0xd6, // mov %rdx,%rsi
0x4c,0x89,0xc2, // mov %r8,%rdx
@ -140,17 +146,16 @@ namespace {
}
static void after_loop(SkWStream* buf) {
static const uint8_t system_v_to_ms[] = {
// TODO: vzeroupper here?
0x0f,0x28,0x34,0x24, // movaps (%rsp),%xmm6
0x0f,0x28,0x7c,0x24,0x10, // movaps 0x10(%rsp),%xmm7
0x44,0x0f,0x28,0x44,0x24,0x20, // movaps 0x20(%rsp),%xmm8
0x44,0x0f,0x28,0x4c,0x24,0x30, // movaps 0x30(%rsp),%xmm9
0x44,0x0f,0x28,0x54,0x24,0x40, // movaps 0x40(%rsp),%xmm10
0x44,0x0f,0x28,0x5c,0x24,0x50, // movaps 0x50(%rsp),%xmm11
0x44,0x0f,0x28,0x64,0x24,0x60, // movaps 0x60(%rsp),%xmm12
0x44,0x0f,0x28,0x6c,0x24,0x70, // movaps 0x70(%rsp),%xmm13
0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // movaps 0x80(%rsp),%xmm14
0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // movaps 0x90(%rsp),%xmm15
0xc5,0xf8,0x28,0x34,0x24, // vmovaps (%rsp),%xmm6
0xc5,0xf8,0x28,0x7c,0x24,0x10, // vmovaps 0x10(%rsp),%xmm7
0xc5,0x78,0x28,0x44,0x24,0x20, // vmovaps 0x20(%rsp),%xmm8
0xc5,0x78,0x28,0x4c,0x24,0x30, // vmovaps 0x30(%rsp),%xmm9
0xc5,0x78,0x28,0x54,0x24,0x40, // vmovaps 0x40(%rsp),%xmm10
0xc5,0x78,0x28,0x5c,0x24,0x50, // vmovaps 0x50(%rsp),%xmm11
0xc5,0x78,0x28,0x64,0x24,0x60, // vmovaps 0x60(%rsp),%xmm12
0xc5,0x78,0x28,0x6c,0x24,0x70, // vmovaps 0x70(%rsp),%xmm13
0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00, // vmovaps 0x80(%rsp),%xmm14
0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00, // vmovaps 0x90(%rsp),%xmm15
0x48,0x81,0xc4,0xa8,0x00,0x00,0x00, // add $0xa8,%rsp
0x5f, // pop %rdi
0x5e, // pop %rsi
@ -236,46 +241,35 @@ namespace {
}
#endif
#define CASE(prefix, st) case SkRasterPipeline::st: splice_until_ret(buf, prefix##_##st); break
#define DEFINE_SPLICE(prefix) \
static bool prefix##_##splice(SkWStream* buf, SkRasterPipeline::StockStage st) { \
switch (st) { \
default: return false; \
CASE(prefix, clear); \
CASE(prefix, plus_); \
CASE(prefix, srcover); \
CASE(prefix, dstover); \
CASE(prefix, clamp_0); \
CASE(prefix, clamp_1); \
CASE(prefix, clamp_a); \
CASE(prefix, swap); \
CASE(prefix, move_src_dst); \
CASE(prefix, move_dst_src); \
CASE(prefix, premul); \
CASE(prefix, unpremul); \
CASE(prefix, from_srgb); \
CASE(prefix, to_srgb); \
CASE(prefix, scale_u8); \
CASE(prefix, load_tables); \
CASE(prefix, load_8888); \
CASE(prefix, store_8888); \
CASE(prefix, load_f16); \
CASE(prefix, store_f16); \
CASE(prefix, matrix_3x4); \
} \
return true; \
static bool splice(SkWStream* buf, SkRasterPipeline::StockStage st) {
// Splice the offline-generated code for stage `st` into `buf`.
// Returns true when `st` is one of the stages listed below; returns false,
// without writing anything to `buf`, for every other stage.
switch (st) {
// Stages with no CASE entry are reported as unsupported.
default: return false;
// CASE(st) hands the matching kSplice_<st> array to splice_until_ret().
#define CASE(st) case SkRasterPipeline::st: splice_until_ret(buf, kSplice_##st); break
CASE(clear);
CASE(plus_);
CASE(srcover);
CASE(dstover);
CASE(clamp_0);
CASE(clamp_1);
CASE(clamp_a);
CASE(swap);
CASE(move_src_dst);
CASE(move_dst_src);
CASE(premul);
CASE(unpremul);
CASE(from_srgb);
CASE(to_srgb);
CASE(scale_u8);
CASE(load_tables);
CASE(load_8888);
CASE(store_8888);
CASE(load_f16);
CASE(store_f16);
CASE(matrix_3x4);
// CASE is only meaningful inside this switch, so drop it immediately.
#undef CASE
}
// Reached only via a CASE's `break`, i.e. the stage was spliced.
return true;
}
#if defined(__aarch64__)
DEFINE_SPLICE(aarch64)
#elif defined(__ARM_NEON__)
DEFINE_SPLICE(armv7)
#else
DEFINE_SPLICE(sse2)
DEFINE_SPLICE(sse41)
DEFINE_SPLICE(hsw)
#endif
#undef DEFINE_SPLICE
#undef CASE
struct Spliced {
@ -288,46 +282,17 @@ namespace {
fSpliced = nullptr;
// If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
#if defined(__aarch64__)
auto splice_stage = [](SkWStream* buf, SkRasterPipeline::StockStage st) {
return aarch64_splice(buf, st);
};
auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, aarch64_inc_x); };
#elif defined(__ARM_NEON__)
// Late generation ARMv7, e.g. Cortex A15 or Krait.
if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
return;
}
auto splice_stage = [](SkWStream* buf, SkRasterPipeline::StockStage st) {
return armv7_splice(buf, st);
};
auto inc_x = [](SkWStream* buf) { splice_until_ret(buf, armv7_inc_x); };
#else
// To keep things simple, only x86-64 for now.
if (sizeof(void*) != 8) {
// To keep things simple, only one x86 target supported: Haswell+ x86-64.
if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
return;
}
bool hsw = true && SkCpu::Supports(SkCpu::HSW),
sse41 = true && SkCpu::Supports(SkCpu::SSE41);
auto splice_stage = [&](SkWStream* buf, SkRasterPipeline::StockStage st) {
if ( hsw) { return hsw_splice(buf, st); }
if (sse41) { return sse41_splice(buf, st); }
return sse2_splice(buf, st);
};
auto inc_x = [&](SkWStream* buf) {
if ( hsw) { splice_until_ret(buf, hsw_inc_x); return; }
if (sse41) { splice_until_ret(buf, sse41_inc_x); return; }
splice_until_ret(buf, sse2_inc_x);
};
auto ret = [&](SkWStream* buf) {
static const uint8_t vzeroupper[] = { 0xc5, 0xf8, 0x77 };
static const uint8_t ret[] = { 0xc3 };
if (hsw) {
splice(buf, vzeroupper);
}
splice(buf, ret);
};
#endif
SkDynamicMemoryWStream buf;
@ -347,13 +312,13 @@ namespace {
}
// Splice in the code for the Stages, generated offline into SkSplicer_generated.h.
if (!splice_stage(&buf, stages[i].stage)) {
if (!splice(&buf, stages[i].stage)) {
//SkDebugf("SkSplicer can't yet handle stage %d.\n", stages[i].stage);
return;
}
}
inc_x(&buf);
splice_until_ret(&buf, kSplice_inc_x);
loop(&buf, loop_start); // Loop back to handle more pixels if not done.
after_loop(&buf);
ret(&buf); // We're done.

File diff suppressed because it is too large Load Diff

View File

@ -9,8 +9,6 @@ import re
import subprocess
import sys
sys.stdout = open('src/splicer/SkSplicer_generated.h', 'w')
ndk = '/Users/mtklein/brew/opt/android-ndk/'
objdump = 'gobjdump'
@ -53,9 +51,8 @@ subprocess.check_call(['clang++'] + cflags + armv7 +
['-c', 'src/splicer/SkSplicer_stages.cpp'] +
['-o', 'armv7.o'])
def parse_object_file(dot_o, array_type, jump, ret, target=None):
prefix = dot_o.replace('.o', '_')
cmd = [ objdump, '-d', '--insn-width=8', dot_o]
def parse_object_file(dst, dot_o, array_type, jump, ret, target=None):
cmd = [ objdump, '-d', dot_o]
if target:
cmd += ['--target', target]
for line in subprocess.check_output(cmd).split('\n'):
@ -66,7 +63,7 @@ def parse_object_file(dot_o, array_type, jump, ret, target=None):
# E.g. 00000000000003a4 <_load_f16>:
m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
if m:
print 'static const', array_type, prefix + m.group(1) + '[] = {'
print >>dst,'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
continue
columns = line.split('\t')
@ -87,15 +84,16 @@ def parse_object_file(dot_o, array_type, jump, ret, target=None):
if code == jump:
code = ret
inst = 'return'
args = ''
args = '(synthetic)'
hexed = ''.join('0x'+x+',' for x in code.split(' '))
print ' ' + hexed + ' '*(44-len(hexed)) + \
'// ' + inst + (' '*(14-len(inst)) + args if args else '')
print >>dst,' ' + hexed + ' '*(44-len(hexed)) + \
'// ' + inst + ' '*(14-len(inst)) + args
if code == ret:
print '};'
print >>dst,'};'
print '''/*
with open('src/splicer/SkSplicer_generated.h', 'w') as f:
print >>f,'''/*
* Copyright 2017 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
@ -107,11 +105,14 @@ print '''/*
// This file is generated semi-automatically with this command:
// $ src/splicer/build_stages.py
#if defined(__aarch64__)
'''
parse_object_file('aarch64.o', 'unsigned int', '14000000', 'd65f03c0')
parse_object_file( 'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e',
parse_object_file(f, 'aarch64.o', 'unsigned int', '14000000', 'd65f03c0')
print >>f,'\n#elif defined(__ARM_NEON__)\n'
parse_object_file(f, 'armv7.o', 'unsigned int', 'eafffffe', 'e12fff1e',
target='elf32-littlearm')
parse_object_file( 'sse2.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
parse_object_file('sse41.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
parse_object_file( 'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
print '#endif//SkSplicer_generated_DEFINED'
print >>f,'\n#else\n'
parse_object_file(f, 'hsw.o', 'unsigned char', 'e9 00 00 00 00', 'c3')
print >>f,'\n#endif\n'
print >>f,'#endif//SkSplicer_generated_DEFINED'