2008-12-17 15:59:43 +00:00
|
|
|
/*
|
2011-07-28 14:26:00 +00:00
|
|
|
* Copyright 2006 The Android Open Source Project
|
2008-12-17 15:59:43 +00:00
|
|
|
*
|
2011-07-28 14:26:00 +00:00
|
|
|
* Use of this source code is governed by a BSD-style license that can be
|
|
|
|
* found in the LICENSE file.
|
2008-12-17 15:59:43 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef SkUtils_DEFINED
|
|
|
|
#define SkUtils_DEFINED
|
|
|
|
|
|
|
|
#include "SkTypes.h"
|
2015-07-31 21:02:36 +00:00
|
|
|
|
|
|
|
// Function pointers to out-of-line fill routines. They are only declared here
// (extern); the definitions are not in this header. sk_memset16()/sk_memset32()
// call through them, passing (buffer, value, count), whenever they do not use
// their inline loop.
namespace SkOpts {
|
|
|
|
extern void (*memset16)(uint16_t[], uint16_t, int);
|
|
|
|
extern void (*memset32)(uint32_t[], uint32_t, int);
|
|
|
|
}
|
2008-12-17 15:59:43 +00:00
|
|
|
|
2011-04-10 01:04:37 +00:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
2008-12-17 15:59:43 +00:00
|
|
|
|
Revert of update memset16/32 inlining heuristics (patchset #1 id:1 of https://codereview.chromium.org/1357193002/ )
Reason for revert:
Who wants to land forever?
Original issue's description:
> update memset16/32 inlining heuristics
>
> I spent some time looking at perf.skia.org and it looks like we can do better.
>
> It is weird, weird, weird that on x86, we see three completely different behaviors:
> - x86 Android: inlining better for small N, custom better for large N;
> - Windows: inlining better for large N, custom better for small N;
> - other x86: inlining generally better
>
> BUG=skia:4316,chromium:516426
>
> (Temporary, plan to revert.)
> TBR=reed@google.com
>
> Committed: https://skia.googlesource.com/skia/+/b68fa409fc00ce2f38e2a0fd6f9dc2379b372481
TBR=reed@google.com,jcgregorio@google.com,mtklein@chromium.org
NOPRESUBMIT=true
NOTREECHECKS=true
NOTRY=true
BUG=skia:4316,chromium:516426
Review URL: https://codereview.chromium.org/1358793002
2015-09-20 22:02:54 +00:00
|
|
|
// The inlining heuristics below were determined using bench/MemsetBench.cpp
|
|
|
|
// on a x86 desktop, a Nexus 7 with and without NEON, and a Nexus 9:
|
|
|
|
// - on x86, inlining was never faster,
|
|
|
|
// - on ARMv7, inlining was faster for N<=10. Putting this check inside the NEON
|
|
|
|
// code was not helpful; it's got to be here outside.
|
|
|
|
// - NEON code generation for ARMv8 with GCC 4.9 is terrible,
|
|
|
|
// making the NEON code ~8x slower than just a serial loop.
|
2015-04-09 21:05:17 +00:00
|
|
|
|
2008-12-17 15:59:43 +00:00
|
|
|
/** Similar to memset(), but it assigns a 16bit value into the buffer.
|
|
|
|
@param buffer The memory to have value copied into it
|
|
|
|
@param value The 16bit value to be copied into buffer
|
|
|
|
@param count The number of times value should be copied into the buffer.
|
|
|
|
*/
|
Port SkUtils opts to SkOpts.
With this new arrangement, the benefits of inlining sk_memset16/32 have changed.
On x86, they're not significantly different, except for small N<=10 where the inlined code is significantly slower.
On ARMv7 with NEON, our custom code is still significantly faster for N>10 (up to 2x faster). For small N<=10 inlining is still significantly faster.
On ARMv7 without NEON, our custom code is still ridiculously faster (up to 10x) than inlining for N>10, though for small N<=10 inlining is still a little faster.
We were not using the NEON memset16 and memset32 procs on ARMv8. At first blush, that seems to be an oversight, but if so it's an extremely lucky one. The ARMv8 code generation for our memset16/32 procs is total garbage, leaving those methods ~8x slower than just inlining the memset, using the compiler's autovectorization.
So, no need to inline any more on x86, and still inline for N<=10 on ARMv7. Always inline for ARMv8.
BUG=skia:4117
Review URL: https://codereview.chromium.org/1270573002
2015-07-31 17:46:50 +00:00
|
|
|
static inline void sk_memset16(uint16_t buffer[], uint16_t value, int count) {
#if defined(SK_ARM_HAS_NEON) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // Plain store loop that the compiler can inline at the call site.
    // A zero or negative count writes nothing.
    for (int i = 0; i < count; ++i) {
        buffer[i] = value;
    }
#else
    // Hand the fill off to whatever routine SkOpts::memset16 points at.
    SkOpts::memset16(buffer, value, count);
#endif
}
|
2008-12-17 15:59:43 +00:00
|
|
|
|
|
|
|
/** Similar to memset(), but it assigns a 32bit value into the buffer.
|
|
|
|
@param buffer The memory to have value copied into it
|
|
|
|
@param value The 32bit value to be copied into buffer
|
|
|
|
@param count The number of times value should be copied into the buffer.
|
|
|
|
*/
|
Port SkUtils opts to SkOpts.
With this new arrangement, the benefits of inlining sk_memset16/32 have changed.
On x86, they're not significantly different, except for small N<=10 where the inlined code is significantly slower.
On ARMv7 with NEON, our custom code is still significantly faster for N>10 (up to 2x faster). For small N<=10 inlining is still significantly faster.
On ARMv7 without NEON, our custom code is still ridiculously faster (up to 10x) than inlining for N>10, though for small N<=10 inlining is still a little faster.
We were not using the NEON memset16 and memset32 procs on ARMv8. At first blush, that seems to be an oversight, but if so it's an extremely lucky one. The ARMv8 code generation for our memset16/32 procs is total garbage, leaving those methods ~8x slower than just inlining the memset, using the compiler's autovectorization.
So, no need to inline any more on x86, and still inline for N<=10 on ARMv7. Always inline for ARMv8.
BUG=skia:4117
Review URL: https://codereview.chromium.org/1270573002
2015-07-31 17:46:50 +00:00
|
|
|
static inline void sk_memset32(uint32_t buffer[], uint32_t value, int count) {
#if defined(SK_ARM_HAS_NEON) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // Plain store loop that the compiler can inline at the call site.
    // A zero or negative count writes nothing.
    for (int i = 0; i < count; ++i) {
        buffer[i] = value;
    }
#else
    // Hand the fill off to whatever routine SkOpts::memset32 points at.
    SkOpts::memset32(buffer, value, count);
#endif
}
|
|
|
|
|
|
|
|
|
2011-04-10 01:04:37 +00:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
2008-12-17 15:59:43 +00:00
|
|
|
|
|
|
|
#define kMaxBytesInUTF8Sequence 4
|
|
|
|
|
|
|
|
#ifdef SK_DEBUG
    // Debug builds call an out-of-line function; only its declaration is here.
    int SkUTF8_LeadByteToCount(unsigned c);
#else
    // Maps a UTF-8 lead byte to the length in bytes of its sequence.
    //
    // (0xE5u << 24) is a packed table of 2-bit entries indexed by the high
    // nibble of the byte; each entry holds (count - 1):
    //     nibble 0x0..0xB -> 1,  0xC..0xD -> 2,  0xE -> 3,  0xF -> 4
    //
    // The table constant is unsigned so that no bit is shifted into the sign
    // bit of a signed int and no negative value is right-shifted. The argument
    // is parenthesized so that expressions such as (x & 0xFF) are converted as
    // a whole. The result is cast back to int to keep the expression's type.
    #define SkUTF8_LeadByteToCount(c) ((int)(((0xE5u << 24) >> ((unsigned)(c) >> 4 << 1)) & 3) + 1)
#endif
|
|
|
|
|
2011-04-10 01:04:37 +00:00
|
|
|
inline int SkUTF8_CountUTF8Bytes(const char utf8[]) {
|
2008-12-17 15:59:43 +00:00
|
|
|
SkASSERT(utf8);
|
|
|
|
return SkUTF8_LeadByteToCount(*(const uint8_t*)utf8);
|
|
|
|
}
|
|
|
|
|
|
|
|
int SkUTF8_CountUnichars(const char utf8[]);
|
|
|
|
int SkUTF8_CountUnichars(const char utf8[], size_t byteLength);
|
|
|
|
SkUnichar SkUTF8_ToUnichar(const char utf8[]);
|
|
|
|
SkUnichar SkUTF8_NextUnichar(const char**);
|
|
|
|
SkUnichar SkUTF8_PrevUnichar(const char**);
|
|
|
|
|
|
|
|
/** Return the number of bytes needed to convert a unichar
|
|
|
|
into a utf8 sequence. Will be 1..kMaxBytesInUTF8Sequence,
|
|
|
|
or 0 if uni is illegal.
|
|
|
|
*/
|
|
|
|
size_t SkUTF8_FromUnichar(SkUnichar uni, char utf8[] = NULL);
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#define SkUTF16_IsHighSurrogate(c) (((c) & 0xFC00) == 0xD800)
|
|
|
|
#define SkUTF16_IsLowSurrogate(c) (((c) & 0xFC00) == 0xDC00)
|
|
|
|
|
|
|
|
int SkUTF16_CountUnichars(const uint16_t utf16[]);
|
2013-11-21 14:24:16 +00:00
|
|
|
int SkUTF16_CountUnichars(const uint16_t utf16[], int numberOf16BitValues);
|
2008-12-17 15:59:43 +00:00
|
|
|
// returns the current unichar and then moves past it (*p++)
|
|
|
|
SkUnichar SkUTF16_NextUnichar(const uint16_t**);
|
|
|
|
// this guy backs up to the previous unichar value, and returns it (*--p)
|
|
|
|
SkUnichar SkUTF16_PrevUnichar(const uint16_t**);
|
|
|
|
size_t SkUTF16_FromUnichar(SkUnichar uni, uint16_t utf16[] = NULL);
|
|
|
|
|
|
|
|
size_t SkUTF16_ToUTF8(const uint16_t utf16[], int numberOf16BitValues,
|
2013-11-21 14:24:16 +00:00
|
|
|
char utf8[] = NULL);
|
2008-12-17 15:59:43 +00:00
|
|
|
|
2011-12-21 16:31:23 +00:00
|
|
|
/** Returns true when uni lies in one of these inclusive ranges:
 *      0x180B  .. 0x180D
 *      0xFE00  .. 0xFE0F
 *      0xE0100 .. 0xE01EF
 *  and false for every other value.
 */
inline bool SkUnichar_IsVariationSelector(SkUnichar uni) {
    const bool inFirstRange  = (0x180B  <= uni) && (uni <= 0x180D);
    const bool inSecondRange = (0xFE00  <= uni) && (uni <= 0xFE0F);
    const bool inThirdRange  = (0xE0100 <= uni) && (uni <= 0xE01EF);
    return inFirstRange || inSecondRange || inThirdRange;
}
|
|
|
|
|
2008-12-17 15:59:43 +00:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
class SkAutoTrace {
|
|
|
|
public:
|
|
|
|
/** NOTE: label contents are not copied, just the ptr is
|
|
|
|
retained, so DON'T DELETE IT.
|
|
|
|
*/
|
|
|
|
SkAutoTrace(const char label[]) : fLabel(label) {
|
|
|
|
SkDebugf("--- trace: %s Enter\n", fLabel);
|
|
|
|
}
|
|
|
|
~SkAutoTrace() {
|
|
|
|
SkDebugf("--- trace: %s Leave\n", fLabel);
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
const char* fLabel;
|
|
|
|
};
|
2013-11-18 16:03:59 +00:00
|
|
|
#define SkAutoTrace(...) SK_REQUIRE_LOCAL_VAR(SkAutoTrace)
|
2008-12-17 15:59:43 +00:00
|
|
|
|
|
|
|
#endif
|