Revert of Move CPU feature detection to its own file. (patchset #7 id:120001 of https://codereview.chromium.org/1890483002/ )
Reason for revert: many unexpected GM diffs across GPU+CPU configs on Windows (hopefully just text masks on GPU?). seems like we pick a different srcover variant in some places. Original issue's description: > Move CPU feature detection to its own file. > > - Moves CPU feature detection to its own file. > - Cleans up some redundant feature detection scattered around core/ and opts/. > - Can now detect a few new CPU features: > * F16C -> Intel f16<->f32 instructions, added between AVX and AVX2 > * FMA -> Intel FMA instructions, added at the same time as AVX2 > * VFP_FP16 -> ARM f16<->f32 instructions, quite common > * NEON_FMA -> ARM FMA instructions, also quite common > * SSE and SSE3... why not? > > This new internal API makes it very cheap to do fine-grained runtime CPU > feature detection. Redundant calls to SkCpu::Supports() should be eliminated > and it's hoistable out of loops. It compiles away entirely when we have the > appropriate instructions available at compile time. > > This means we can call it to guard even a little snippet of 1 or 2 instructions > right where needed and let inlining hoist the check (if any at all) up to > somewhere that doesn't hurt performance. I've explained how I made this work > in the private section of the new header. > > Once this lands and bakes a bit, I'll start following up with CLs to use it more > and to add a bunch of those little 1-2 instruction snippets we've been wanting, > e.g. cvtps2ph, cvtph2ps, ptest, pmulld, pmovzxbd, blendvps, pshufb, roundps > (for floor) on x86, and vcvt.f32.f16, vcvt.f16.f32 on ARM. 
> > BUG=skia: > GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1890483002 > CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot > > Committed: https://skia.googlesource.com/skia/+/872ea29357439f05b1f6995dd300fc054733e607 TBR=fmalita@chromium.org,herb@google.com,reed@google.com,mtklein@chromium.org # Skipping CQ checks because original CL landed less than 1 days ago. NOPRESUBMIT=true NOTREECHECKS=true NOTRY=true BUG=skia: Review URL: https://codereview.chromium.org/1892643003
This commit is contained in:
parent
2c7f24093a
commit
86498fbfcb
@ -87,6 +87,13 @@
|
||||
'android_deps.gyp:cpu_features',
|
||||
],
|
||||
}],
|
||||
[ 'skia_arch_type == "arm"', {
|
||||
# The code in SkUtilsArm.cpp can be used on an ARM-based Linux system, not only Android.
|
||||
'sources': [
|
||||
'../src/core/SkUtilsArm.cpp',
|
||||
'../src/core/SkUtilsArm.h',
|
||||
],
|
||||
}],
|
||||
['skia_gpu == 1', {
|
||||
'include_dirs': [
|
||||
'../include/gpu',
|
||||
|
@ -85,8 +85,6 @@
|
||||
'<(skia_src_path)/core/SkConvolver.cpp',
|
||||
'<(skia_src_path)/core/SkConvolver.h',
|
||||
'<(skia_src_path)/core/SkCoreBlitters.h',
|
||||
'<(skia_src_path)/core/SkCpu.cpp',
|
||||
'<(skia_src_path)/core/SkCpu.h',
|
||||
'<(skia_src_path)/core/SkCubicClipper.cpp',
|
||||
'<(skia_src_path)/core/SkCubicClipper.h',
|
||||
'<(skia_src_path)/core/SkData.cpp',
|
||||
|
@ -1,90 +0,0 @@
|
||||
/*
|
||||
* Copyright 2016 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkCpu.h"
|
||||
#include "SkOncePtr.h"
|
||||
|
||||
#if defined(SK_CPU_X86)
|
||||
#if defined(SK_BUILD_FOR_WIN32)
|
||||
#include <intrin.h>
|
||||
static void cpuid (uint32_t abcd[4]) { __cpuid ((int*)abcd, 1); }
|
||||
static void cpuid7(uint32_t abcd[4]) { __cpuidex((int*)abcd, 7, 0); }
|
||||
static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); }
|
||||
#else
|
||||
#include <cpuid.h>
|
||||
#if !defined(__cpuid_count) // Old Mac Clang doesn't have this defined.
|
||||
#define __cpuid_count(eax, ecx, a, b, c, d) \
|
||||
__asm__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eax), "2"(ecx))
|
||||
#endif
|
||||
// CPUID leaf 1: stores the eax, ebx, ecx, edx results in abcd[0..3].
static void cpuid(uint32_t abcd[4]) {
    uint32_t* const out_a = &abcd[0];
    uint32_t* const out_b = &abcd[1];
    uint32_t* const out_c = &abcd[2];
    uint32_t* const out_d = &abcd[3];
    __get_cpuid(1, out_a, out_b, out_c, out_d);
}
|
||||
// CPUID leaf 7, sub-leaf 0: stores the eax, ebx, ecx, edx results in abcd[0..3].
static void cpuid7(uint32_t abcd[4]) {
    uint32_t out_a = 0, out_b = 0, out_c = 0, out_d = 0;
    __cpuid_count(7, 0, out_a, out_b, out_c, out_d);
    abcd[0] = out_a;
    abcd[1] = out_b;
    abcd[2] = out_c;
    abcd[3] = out_d;
}
|
||||
// Executes XGETBV with ecx = xcr and returns the edx:eax pair as one 64-bit value.
static uint64_t xgetbv(uint32_t xcr) {
    uint32_t lo, hi;
    __asm__ __volatile__ ( "xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
    const uint64_t upper = (uint64_t)hi << 32;
    return upper | lo;
}
|
||||
#endif
|
||||
|
||||
static uint32_t read_cpu_features() {
|
||||
uint32_t features = 0;
|
||||
uint32_t abcd[4] = {0,0,0,0};
|
||||
|
||||
// You might want to refer to http://www.sandpile.org/x86/cpuid.htm
|
||||
|
||||
cpuid(abcd);
|
||||
if (abcd[3] & (1<<25)) { features |= SkCpu:: SSE1; }
|
||||
if (abcd[3] & (1<<26)) { features |= SkCpu:: SSE2; }
|
||||
if (abcd[2] & (1<< 0)) { features |= SkCpu:: SSE3; }
|
||||
if (abcd[2] & (1<< 9)) { features |= SkCpu::SSSE3; }
|
||||
if (abcd[2] & (1<<19)) { features |= SkCpu::SSE41; }
|
||||
if (abcd[2] & (1<<20)) { features |= SkCpu::SSE42; }
|
||||
|
||||
if ((abcd[2] & (3<<26)) == (3<<26) && (xgetbv(0) & 6) == 6) { // XSAVE + OSXSAVE
|
||||
if (abcd[2] & (1<<28)) { features |= SkCpu:: AVX; }
|
||||
if (abcd[2] & (1<<29)) { features |= SkCpu::F16C; }
|
||||
if (abcd[2] & (1<<12)) { features |= SkCpu:: FMA; }
|
||||
|
||||
cpuid7(abcd);
|
||||
if (abcd[1] & (1<<5)) { features |= SkCpu::AVX2; }
|
||||
}
|
||||
return features;
|
||||
}
|
||||
|
||||
#elif defined(SK_CPU_ARM32) && \
|
||||
defined(SK_BUILD_FOR_ANDROID) && \
|
||||
!defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
|
||||
#include <cpu-features.h>
|
||||
|
||||
static uint32_t read_cpu_features() {
|
||||
uint32_t features = 0;
|
||||
|
||||
uint64_t android_features = android_getCpuFeatures();
|
||||
if (android_features & ANDROID_CPU_ARM_FEATURE_NEON ) { features |= SkCpu::NEON ; }
|
||||
if (android_features & ANDROID_CPU_ARM_FEATURE_NEON_FMA) { features |= SkCpu::NEON_FMA; }
|
||||
if (android_features & ANDROID_CPU_ARM_FEATURE_VFP_FP16) { features |= SkCpu::VFP_FP16; }
|
||||
return features;
|
||||
}
|
||||
|
||||
#else
|
||||
// Fallback for platforms without a runtime probe in this file: reports no features.
static uint32_t read_cpu_features() {
    const uint32_t kNoFeatures = 0;
    return kNoFeatures;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
SK_DECLARE_STATIC_ONCE_PTR(uint32_t, gCachedCpuFeatures);
|
||||
uint32_t SkCpu::RuntimeCpuFeatures() {
|
||||
return *gCachedCpuFeatures.get([]{ return new uint32_t{read_cpu_features()}; });
|
||||
}
|
||||
|
||||
#else
|
||||
const uint32_t SkCpu::gCachedCpuFeatures = read_cpu_features();
|
||||
|
||||
#endif
|
123
src/core/SkCpu.h
123
src/core/SkCpu.h
@ -1,123 +0,0 @@
|
||||
/*
|
||||
* Copyright 2016 Google Inc.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SkCpu_DEFINED
|
||||
#define SkCpu_DEFINED
|
||||
|
||||
#include "SkTypes.h"
|
||||
|
||||
struct SkCpu {
    // x86 feature bits.
    enum {
        SSE1  = 0x001,
        SSE2  = 0x002,
        SSE3  = 0x004,
        SSSE3 = 0x008,
        SSE41 = 0x010,
        SSE42 = 0x020,
        AVX   = 0x040,
        F16C  = 0x080,
        FMA   = 0x100,
        AVX2  = 0x200,
    };
    // ARM feature bits. These occupy the same low bit positions as the x86 bits above.
    enum {
        NEON     = 0x1,
        NEON_FMA = 0x2,
        VFP_FP16 = 0x4,
    };

    // True when every feature bit in the mask is available.
    static bool Supports(uint32_t);

private:
    // Why RuntimeCpuFeatures() is declared the way it is.
    //
    // Picture a loop that widens 16-bit floats to 32-bit, does math, and narrows again:
    //
    //   for (int i = 0; i < N; i++) {
    //       if (SkCpu::Supports(SkCpu::F16C)) {
    //           f32s = SkCpu::F16C_cvtph_ps(f16s);
    //       } else {
    //           f32s = some_slower_f16_to_f32_routine(f16s);
    //       }
    //
    //       ... do some math with f32s ...
    //
    //       if (SkCpu::Supports(SkCpu::F16C)) {
    //           f16s = SkCpu::F16C_cvtps_ph(f32s);
    //       } else {
    //           f16s = some_slower_f32_to_f16_routine(f32s);
    //       }
    //   }
    //
    // SkCpu::Supports() should take part in common sub-expression elimination, so
    // it is called exactly once rather than N or 2N times.  That matters most when
    // the if-else blocks above are really inline functions.
    //
    // To get that, RuntimeCpuFeatures() must itself be implemented with the same
    // capacity for common sub-expression elimination.
    //
    // __attribute__((const)) works perfectly when available.
    //
    // When it's not (MSVC), we fall back to a static initializer.
    // (Static initializers would work fine everywhere, but Chrome really dislikes them.)

#if defined(__GNUC__) || defined(__clang__)  // i.e. GCC, Clang, or clang-cl
    __attribute__((const))
    static uint32_t RuntimeCpuFeatures();
#else
    static const uint32_t gCachedCpuFeatures;
    static uint32_t RuntimeCpuFeatures() { return gCachedCpuFeatures; }
#endif
};
|
||||
|
||||
inline bool SkCpu::Supports(uint32_t mask) {
|
||||
uint32_t features = RuntimeCpuFeatures();
|
||||
|
||||
// If we mask in compile-time known lower limits, the compiler can completely
|
||||
// drop many calls to RuntimeCpuFeatures().
|
||||
#if SK_CPU_X86
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
|
||||
features |= SSE1;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
|
||||
features |= SSE2;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE3
|
||||
features |= SSE3;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
|
||||
features |= SSSE3;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
|
||||
features |= SSE41;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
|
||||
features |= SSE42;
|
||||
#endif
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
|
||||
features |= AVX;
|
||||
#endif
|
||||
// F16C goes here if we add SK_CPU_SSE_LEVEL_F16C
|
||||
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
|
||||
features |= AVX2;
|
||||
#endif
|
||||
// FMA doesn't fit neatly into this total ordering.
|
||||
// It's available on Haswell+ just like AVX2, but it's technically a different bit.
|
||||
// TODO: circle back on this if we find ourselves limited by lack of compile-time FMA
|
||||
|
||||
#else
|
||||
#if defined(SK_ARM_HAS_NEON)
|
||||
features |= NEON;
|
||||
#endif
|
||||
|
||||
#if defined(SK_CPU_ARM64)
|
||||
features |= NEON|NEON_FMA|VFP_FP16;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
return (features & mask) == mask;
|
||||
}
|
||||
|
||||
#endif//SkCpu_DEFINED
|
@ -5,7 +5,6 @@
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
#include "SkCpu.h"
|
||||
#include "SkHalf.h"
|
||||
#include "SkOnce.h"
|
||||
#include "SkOpts.h"
|
||||
@ -33,6 +32,35 @@ namespace SK_OPTS_NS {
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS)
|
||||
#if defined(SK_BUILD_FOR_WIN32)
|
||||
#include <intrin.h>
|
||||
static void cpuid (uint32_t abcd[4]) { __cpuid ((int*)abcd, 1); }
|
||||
static void cpuid7(uint32_t abcd[4]) { __cpuidex((int*)abcd, 7, 0); }
|
||||
static uint64_t xgetbv(uint32_t xcr) { return _xgetbv(xcr); }
|
||||
#else
|
||||
#include <cpuid.h>
|
||||
#if !defined(__cpuid_count) // Old Mac Clang doesn't have this defined.
|
||||
#define __cpuid_count(eax, ecx, a, b, c, d) \
|
||||
__asm__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eax), "2"(ecx))
|
||||
#endif
|
||||
// CPUID leaf 1: stores the eax, ebx, ecx, edx results in abcd[0..3].
static void cpuid(uint32_t abcd[4]) {
    uint32_t* const out_a = &abcd[0];
    uint32_t* const out_b = &abcd[1];
    uint32_t* const out_c = &abcd[2];
    uint32_t* const out_d = &abcd[3];
    __get_cpuid(1, out_a, out_b, out_c, out_d);
}
|
||||
// CPUID leaf 7, sub-leaf 0: stores the eax, ebx, ecx, edx results in abcd[0..3].
static void cpuid7(uint32_t abcd[4]) {
    uint32_t out_a = 0, out_b = 0, out_c = 0, out_d = 0;
    __cpuid_count(7, 0, out_a, out_b, out_c, out_d);
    abcd[0] = out_a;
    abcd[1] = out_b;
    abcd[2] = out_c;
    abcd[3] = out_d;
}
|
||||
// Executes XGETBV with ecx = xcr and returns the edx:eax pair as one 64-bit value.
static uint64_t xgetbv(uint32_t xcr) {
    uint32_t lo, hi;
    __asm__ __volatile__ ( "xgetbv" : "=a"(lo), "=d"(hi) : "c"(xcr));
    const uint64_t upper = (uint64_t)hi << 32;
    return upper | lo;
}
|
||||
#endif
|
||||
#elif !defined(SK_ARM_HAS_NEON) && \
|
||||
defined(SK_CPU_ARM32) && \
|
||||
defined(SK_BUILD_FOR_ANDROID) && \
|
||||
!defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
|
||||
#include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
namespace SkOpts {
|
||||
|
||||
// Define default function pointer values here...
|
||||
@ -84,16 +112,28 @@ namespace SkOpts {
|
||||
static void init() {
|
||||
// TODO: Chrome's not linking _sse* opts on iOS simulator builds. Bug or feature?
|
||||
#if defined(SK_CPU_X86) && !defined(SK_BUILD_FOR_IOS)
|
||||
if (SkCpu::Supports(SkCpu::SSSE3)) { Init_ssse3(); }
|
||||
if (SkCpu::Supports(SkCpu::SSE41)) { Init_sse41(); }
|
||||
if (SkCpu::Supports(SkCpu::SSE42)) { Init_sse42(); }
|
||||
if (SkCpu::Supports(SkCpu::AVX )) { Init_avx(); }
|
||||
if (SkCpu::Supports(SkCpu::AVX2 )) { Init_avx2(); }
|
||||
uint32_t abcd[] = {0,0,0,0};
|
||||
cpuid(abcd);
|
||||
if (abcd[2] & (1<< 9)) { Init_ssse3(); }
|
||||
if (abcd[2] & (1<<19)) { Init_sse41(); }
|
||||
if (abcd[2] & (1<<20)) { Init_sse42(); }
|
||||
|
||||
#elif defined(SK_CPU_ARM32) && \
|
||||
defined(SK_BUILD_FOR_ANDROID) && \
|
||||
!defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
|
||||
if (SkCpu::Supports(SkCpu::NEON)) { Init_neon(); }
|
||||
// AVX detection's kind of a pain. This is cribbed from Chromium.
|
||||
if ( ( abcd[2] & (7<<26)) == (7<<26) && // Check bits 26-28 of ecx are all set,
|
||||
(xgetbv(0) & 6 ) == 6 ){ // and check the OS supports XSAVE.
|
||||
Init_avx();
|
||||
|
||||
// AVX2 additionally needs bit 5 set on ebx after calling cpuid(7).
|
||||
uint32_t abcd7[] = {0,0,0,0};
|
||||
cpuid7(abcd7);
|
||||
if (abcd7[1] & (1<<5)) { Init_avx2(); }
|
||||
}
|
||||
|
||||
#elif !defined(SK_ARM_HAS_NEON) && \
|
||||
defined(SK_CPU_ARM32) && \
|
||||
defined(SK_BUILD_FOR_ANDROID) && \
|
||||
!defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
|
||||
if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) { Init_neon(); }
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -5,4 +5,141 @@
|
||||
* found in the LICENSE file.
|
||||
*/
|
||||
|
||||
// This file no longer needs to exist, but it's still referenced by Chrome's GYP / GN builds.
|
||||
#include "SkUtilsArm.h"
|
||||
|
||||
#if SK_ARM_NEON_IS_DYNAMIC
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#if defined(SK_BUILD_FOR_ANDROID)
|
||||
# include <cpu-features.h>
|
||||
#endif
|
||||
|
||||
// A function used to determine at runtime if the target CPU supports
|
||||
// the ARM NEON instruction set. This implementation is Linux-specific.
|
||||
static bool sk_cpu_arm_check_neon(void) {
|
||||
// If we fail any of the following, assume we don't have NEON instructions
|
||||
// This allows us to return immediately in case of error.
|
||||
bool result = false;
|
||||
|
||||
// Use the Android NDK's cpu-features helper library to detect NEON at runtime.
|
||||
// See http://crbug.com/164154 to see why this is needed in Chromium for Android.
|
||||
#ifdef SK_BUILD_FOR_ANDROID
|
||||
|
||||
result = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
|
||||
|
||||
#else // SK_BUILD_FOR_ANDROID
|
||||
|
||||
// There is no user-accessible CPUID instruction on ARM that we can use.
|
||||
// Instead, we must parse /proc/cpuinfo and look for the 'neon' feature.
|
||||
// For example, here's a typical output (Nexus S running ICS 4.0.3):
|
||||
/*
|
||||
Processor : ARMv7 Processor rev 2 (v7l)
|
||||
BogoMIPS : 994.65
|
||||
Features : swp half thumb fastmult vfp edsp thumbee neon vfpv3
|
||||
CPU implementer : 0x41
|
||||
CPU architecture: 7
|
||||
CPU variant : 0x2
|
||||
CPU part : 0xc08
|
||||
CPU revision : 2
|
||||
|
||||
Hardware : herring
|
||||
Revision : 000b
|
||||
Serial : 3833c77d6dc000ec
|
||||
*/
|
||||
char buffer[4096];
|
||||
|
||||
do {
|
||||
// open /proc/cpuinfo
|
||||
int fd = TEMP_FAILURE_RETRY(open("/proc/cpuinfo", O_RDONLY));
|
||||
if (fd < 0) {
|
||||
SkDebugf("Could not open /proc/cpuinfo: %s\n", strerror(errno));
|
||||
break;
|
||||
}
|
||||
|
||||
// Read the file. To simplify our search, we're going to place two
|
||||
// sentinel '\n' characters: one at the start of the buffer, and one at
|
||||
// the end. This means we reserve the first and last buffer bytes.
|
||||
buffer[0] = '\n';
|
||||
int size = TEMP_FAILURE_RETRY(read(fd, buffer+1, sizeof(buffer)-2));
|
||||
close(fd);
|
||||
|
||||
if (size < 0) { // should not happen
|
||||
SkDebugf("Could not read /proc/cpuinfo: %s\n", strerror(errno));
|
||||
break;
|
||||
}
|
||||
|
||||
SkDebugf("START /proc/cpuinfo:\n%.*s\nEND /proc/cpuinfo\n",
|
||||
size, buffer+1);
|
||||
|
||||
// Compute buffer limit, and place final sentinel
|
||||
char* buffer_end = buffer + 1 + size;
|
||||
buffer_end[0] = '\n';
|
||||
|
||||
// Now, find a line that starts with "Features", i.e. look for
|
||||
// '\nFeatures ' in our buffer.
|
||||
const char features[] = "\nFeatures\t";
|
||||
const size_t features_len = sizeof(features)-1;
|
||||
|
||||
char* line = (char*) memmem(buffer, buffer_end - buffer,
|
||||
features, features_len);
|
||||
if (line == nullptr) { // Weird, no Features line, bad kernel?
|
||||
SkDebugf("Could not find a line starting with 'Features'"
|
||||
"in /proc/cpuinfo ?\n");
|
||||
break;
|
||||
}
|
||||
|
||||
line += features_len; // Skip the "\nFeatures\t" prefix
|
||||
|
||||
// Find the end of the current line
|
||||
char* line_end = (char*) memchr(line, '\n', buffer_end - line);
|
||||
if (line_end == nullptr)
|
||||
line_end = buffer_end;
|
||||
|
||||
// Now find an instance of 'neon' in the flags list. We want to
|
||||
// ensure it's only 'neon' and not something fancy like 'noneon'
|
||||
// so check that it follows a space.
|
||||
const char neon[] = " neon";
|
||||
const size_t neon_len = sizeof(neon)-1;
|
||||
const char* flag = (const char*) memmem(line, line_end - line,
|
||||
neon, neon_len);
|
||||
if (flag == nullptr)
|
||||
break;
|
||||
|
||||
// Ensure it is followed by a space or a newline.
|
||||
if (flag[neon_len] != ' ' && flag[neon_len] != '\n')
|
||||
break;
|
||||
|
||||
// Fine, we support Arm NEON !
|
||||
result = true;
|
||||
|
||||
} while (0);
|
||||
|
||||
#endif // SK_BUILD_FOR_ANDROID
|
||||
|
||||
if (result) {
|
||||
SkDEBUGF(("Device supports ARM NEON instructions!\n"));
|
||||
} else {
|
||||
SkDEBUGF(("Device does NOT support ARM NEON instructions!\n"));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static pthread_once_t sOnce;
|
||||
static bool sHasArmNeon;
|
||||
|
||||
// called through pthread_once()
|
||||
void sk_cpu_arm_probe_features(void) {
|
||||
sHasArmNeon = sk_cpu_arm_check_neon();
|
||||
}
|
||||
|
||||
bool sk_cpu_arm_has_neon(void) {
|
||||
pthread_once(&sOnce, sk_cpu_arm_probe_features);
|
||||
return sHasArmNeon;
|
||||
}
|
||||
|
||||
#endif // SK_ARM_NEON_IS_DYNAMIC
|
||||
|
@ -8,7 +8,6 @@
|
||||
#ifndef SkUtilsArm_DEFINED
|
||||
#define SkUtilsArm_DEFINED
|
||||
|
||||
#include "SkCpu.h"
|
||||
#include "SkUtils.h"
|
||||
|
||||
// Define SK_ARM_NEON_MODE to one of the following values
|
||||
@ -38,13 +37,18 @@
|
||||
// is ARMv7-A and supports Neon instructions. In DYNAMIC mode, this actually
|
||||
// probes the CPU at runtime (and caches the result).
|
||||
|
||||
static inline bool sk_cpu_arm_has_neon(void) {
|
||||
#if SK_ARM_NEON_IS_NONE
|
||||
static inline bool sk_cpu_arm_has_neon(void) {
|
||||
return false;
|
||||
#else
|
||||
return SkCpu::Supports(SkCpu::NEON);
|
||||
#endif
|
||||
}
|
||||
#elif SK_ARM_NEON_IS_ALWAYS
|
||||
static inline bool sk_cpu_arm_has_neon(void) {
|
||||
return true;
|
||||
}
|
||||
#else // SK_ARM_NEON_IS_DYNAMIC
|
||||
|
||||
extern bool sk_cpu_arm_has_neon(void) SK_PURE_FUNC;
|
||||
#endif
|
||||
|
||||
// Use SK_ARM_NEON_WRAP(symbol) to map 'symbol' to a NEON-specific symbol
|
||||
// when applicable. This will transform 'symbol' differently depending on
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include "SkBlitMask.h"
|
||||
#include "SkBlitRow.h"
|
||||
#include "SkBlitRow_opts_SSE2.h"
|
||||
#include "SkCpu.h"
|
||||
#include "SkOncePtr.h"
|
||||
#include "SkRTConf.h"
|
||||
|
||||
@ -29,16 +28,111 @@
|
||||
*/
|
||||
|
||||
|
||||
#if defined(_MSC_VER) && defined(_WIN64)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
/* This file must *not* be compiled with -msse or any other optional SIMD
|
||||
extension, otherwise gcc may generate SIMD instructions even for scalar ops
|
||||
(and thus give an invalid instruction on Pentium3 on the code below).
|
||||
For example, only files named *_SSE2.cpp in this directory should be
|
||||
compiled with -msse2 or higher. */
|
||||
|
||||
|
||||
/* Function to get the CPU SSE-level in runtime, for different compilers. */
|
||||
#ifdef _MSC_VER
|
||||
static inline void getcpuid(int info_type, int info[4]) {
|
||||
#if defined(_WIN64)
|
||||
__cpuid(info, info_type);
|
||||
#else
|
||||
__asm {
|
||||
mov eax, [info_type]
|
||||
cpuid
|
||||
mov edi, [info]
|
||||
mov [edi], eax
|
||||
mov [edi+4], ebx
|
||||
mov [edi+8], ecx
|
||||
mov [edi+12], edx
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#elif defined(__x86_64__)
|
||||
// x86-64 GCC/Clang: runs CPUID for leaf info_type and stores eax, ebx, ecx, edx
// in info[0..3].
static inline void getcpuid(int info_type, int info[4]) {
    __asm__ __volatile__ (
        "cpuid \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        // ECX is the sub-leaf selector for some leaves; pass 0 explicitly so it
        // never holds whatever the register happened to contain.
        : "a"(info_type), "c"(0)
    );
}
|
||||
#else
|
||||
// 32-bit GCC/Clang: runs CPUID for leaf info_type and stores eax, ebx, ecx, edx
// in info[0..3].
static inline void getcpuid(int info_type, int info[4]) {
    // We save and restore ebx, so this code can be compatible with -fPIC
    __asm__ __volatile__ (
        "pushl %%ebx \n\t"
        "cpuid \n\t"
        "movl %%ebx, %1 \n\t"
        "popl %%ebx \n\t"
        : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
        // ECX is the sub-leaf selector for some leaves; pass 0 explicitly so it
        // never holds whatever the register happened to contain.
        : "a"(info_type), "c"(0)
    );
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/* Fetch the SIMD level directly from the CPU, at run-time.
|
||||
* Only checks the levels needed by the optimizations in this file.
|
||||
*/
|
||||
static int* get_SIMD_level() {
|
||||
int cpu_info[4] = { 0, 0, 0, 0 };
|
||||
getcpuid(1, cpu_info);
|
||||
|
||||
int* level = new int;
|
||||
|
||||
if ((cpu_info[2] & (1<<20)) != 0) {
|
||||
*level = SK_CPU_SSE_LEVEL_SSE42;
|
||||
} else if ((cpu_info[2] & (1<<19)) != 0) {
|
||||
*level = SK_CPU_SSE_LEVEL_SSE41;
|
||||
} else if ((cpu_info[2] & (1<<9)) != 0) {
|
||||
*level = SK_CPU_SSE_LEVEL_SSSE3;
|
||||
} else if ((cpu_info[3] & (1<<26)) != 0) {
|
||||
*level = SK_CPU_SSE_LEVEL_SSE2;
|
||||
} else {
|
||||
*level = 0;
|
||||
}
|
||||
return level;
|
||||
}
|
||||
|
||||
SK_DECLARE_STATIC_ONCE_PTR(int, gSIMDLevel);
|
||||
|
||||
/* Verify that the requested SIMD level is supported in the build.
|
||||
* If not, check if the platform supports it.
|
||||
*/
|
||||
static inline bool supports_simd(int minLevel) {
|
||||
#if defined(SK_CPU_SSE_LEVEL)
|
||||
if (minLevel <= SK_CPU_SSE_LEVEL) {
|
||||
return true;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
#if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
|
||||
/* For the Android framework we should always know at compile time if the device
|
||||
* we are building for supports SSSE3. The one exception to this rule is on the
|
||||
* emulator where we are compiled without the -mssse3 option (so we have no
|
||||
* SSSE3 procs) but can be run on a host machine that supports SSSE3
|
||||
* instructions. So for that particular case we disable our SSSE3 options.
|
||||
*/
|
||||
return false;
|
||||
#else
|
||||
return minLevel <= *gSIMDLevel.get(get_SIMD_level);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
procs->fExtraHorizontalReads = 3;
|
||||
procs->fConvolveVertically = &convolveVertically_SSE2;
|
||||
procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
|
||||
@ -51,10 +145,10 @@ void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
|
||||
|
||||
void SkBitmapProcState::platformProcs() {
|
||||
/* Every optimization in the function requires at least SSE2 */
|
||||
if (!SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
return;
|
||||
}
|
||||
const bool ssse3 = SkCpu::Supports(SkCpu::SSSE3);
|
||||
const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3);
|
||||
|
||||
/* Check fSampleProc32 */
|
||||
if (fSampleProc32 == S32_opaque_D32_filter_DX) {
|
||||
@ -105,7 +199,7 @@ static const SkBlitRow::Proc16 platform_16_procs[] = {
|
||||
};
|
||||
|
||||
SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
return platform_16_procs[flags];
|
||||
} else {
|
||||
return nullptr;
|
||||
@ -123,7 +217,7 @@ SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
|
||||
* SSE2 version on Silvermont, and only marginally faster on a Core i7,
|
||||
* mainly due to the MULLD timings.
|
||||
*/
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
return platform_565_colorprocs_SSE2[flags];
|
||||
} else {
|
||||
return nullptr;
|
||||
@ -138,7 +232,7 @@ static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
|
||||
};
|
||||
|
||||
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
return platform_32_procs_SSE2[flags];
|
||||
} else {
|
||||
return nullptr;
|
||||
@ -148,7 +242,7 @@ SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
|
||||
if (SkCpu::Supports(SkCpu::SSE2)) {
|
||||
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
|
||||
if (isOpaque) {
|
||||
return SkBlitLCD16OpaqueRow_SSE2;
|
||||
} else {
|
||||
|
Loading…
Reference in New Issue
Block a user