Optimize FT_MulFix for x86_64 GCC builds.
This patch provides an optimized `FT_MulFix' implementation for x86_64 machines when FreeType is built with GCC, or compatible compilers like Clang. Example: bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf Before: Load 4.863 us/op Load_Advances (Normal) 4.816 us/op Load_Advances (Fast) 0.028 us/op Render 2.753 us/op Get_Glyph 0.463 us/op Get_CBox 0.077 us/op Get_Char_Index 0.023 us/op Iterate CMap 13.898 us/op New_Face 12.368 us/op Embolden 0.028 us/op Get_BBox 0.302 us/op After: Load 4.617 us/op Load_Advances (Normal) 4.645 us/op Load_Advances (Fast) 0.027 us/op Render 2.789 us/op Get_Glyph 0.460 us/op Get_CBox 0.077 us/op Get_Char_Index 0.024 us/op Iterate CMap 13.403 us/op New_Face 12.278 us/op Embolden 0.028 us/op Get_BBox 0.301 us/op * builds/unix/ftconfig.in, include/freetype/config/ftconfig.h (FT_MulFix_x86_64): New function.
This commit is contained in:
parent
a5f33eeb8a
commit
b28908860d
42
ChangeLog
42
ChangeLog
@ -1,3 +1,45 @@
|
||||
2013-07-16 David Turner <digit@google.com>
|
||||
|
||||
Optimize FT_MulFix for x86_64 GCC builds.
|
||||
|
||||
This patch provides an optimized `FT_MulFix' implementation for
|
||||
x86_64 machines when FreeType is built with GCC, or compatible
|
||||
compilers like Clang.
|
||||
|
||||
Example:
|
||||
bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
|
||||
|
||||
Before:
|
||||
|
||||
Load 4.863 us/op
|
||||
Load_Advances (Normal) 4.816 us/op
|
||||
Load_Advances (Fast) 0.028 us/op
|
||||
Render 2.753 us/op
|
||||
Get_Glyph 0.463 us/op
|
||||
Get_CBox 0.077 us/op
|
||||
Get_Char_Index 0.023 us/op
|
||||
Iterate CMap 13.898 us/op
|
||||
New_Face 12.368 us/op
|
||||
Embolden 0.028 us/op
|
||||
Get_BBox 0.302 us/op
|
||||
|
||||
After:
|
||||
|
||||
Load 4.617 us/op
|
||||
Load_Advances (Normal) 4.645 us/op
|
||||
Load_Advances (Fast) 0.027 us/op
|
||||
Render 2.789 us/op
|
||||
Get_Glyph 0.460 us/op
|
||||
Get_CBox 0.077 us/op
|
||||
Get_Char_Index 0.024 us/op
|
||||
Iterate CMap 13.403 us/op
|
||||
New_Face 12.278 us/op
|
||||
Embolden 0.028 us/op
|
||||
Get_BBox 0.301 us/op
|
||||
|
||||
* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
|
||||
(FT_MulFix_x86_64): New function.
|
||||
|
||||
2013-07-16 David Turner <digit@google.com>
|
||||
|
||||
Speed up ARMv7 support.
|
||||
|
@ -366,6 +366,7 @@ FT_BEGIN_HEADER
|
||||
/* These must be defined `static __inline__' with GCC. */
|
||||
|
||||
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
|
||||
|
||||
/* documentation is in freetype.h */
|
||||
@ -428,7 +429,9 @@ FT_BEGIN_HEADER
|
||||
/* ( __thumb2__ || !__thumb__ ) && */
|
||||
/* !( __CC_ARM || __ARMCC__ ) */
|
||||
|
||||
|
||||
#if defined( __i386__ )
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
|
||||
|
||||
/* documentation is in freetype.h */
|
||||
@ -497,6 +500,62 @@ FT_BEGIN_HEADER
|
||||
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
|
||||
#if defined( __GNUC__ ) && defined( __x86_64__ )
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
|
||||
|
||||
static __inline__ FT_Int32
|
||||
FT_MulFix_x86_64( FT_Int32 a,
|
||||
FT_Int32 b )
|
||||
{
|
||||
/* Temporarily disable the warning that C90 doesn't support */
|
||||
/* `long long'. */
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wlong-long"
|
||||
|
||||
#if 1
|
||||
/* Technically not an assembly fragment, but GCC does a really good */
|
||||
/* job at inlining it and generating good machine code for it. */
|
||||
long long ret, tmp;
|
||||
|
||||
|
||||
ret = (long long)a * b;
|
||||
tmp = ret >> 63;
|
||||
ret += 0x8000 + tmp;
|
||||
|
||||
return (FT_Int32)( ret >> 16 );
|
||||
#else
|
||||
|
||||
/* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
|
||||
/* code from the lines below. The main issue is that `wide_a' is not */
|
||||
/* properly initialized by sign-extending `a'. Instead, the generated */
|
||||
/* machine code assumes that the register that contains `a' on input */
|
||||
/* can be used directly as a 64-bit value, which is wrong most of the */
|
||||
/* time. */
|
||||
long long wide_a = (long long)a;
|
||||
long long wide_b = (long long)b;
|
||||
long long result;
|
||||
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"imul %2, %1\n"
|
||||
"mov %1, %0\n"
|
||||
"sar $63, %0\n"
|
||||
"lea 0x8000(%1, %0), %0\n"
|
||||
"sar $16, %0\n"
|
||||
: "=&r"(result), "=&r"(wide_a)
|
||||
: "r"(wide_b)
|
||||
: "cc" );
|
||||
|
||||
return (FT_Int32)result;
|
||||
#endif
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
}
|
||||
|
||||
#endif /* __GNUC__ && __x86_64__ */
|
||||
|
||||
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
|
||||
|
||||
|
||||
|
@ -338,6 +338,7 @@ FT_BEGIN_HEADER
|
||||
/* These must be defined `static __inline__' with GCC. */
|
||||
|
||||
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
|
||||
|
||||
/* documentation is in freetype.h */
|
||||
@ -370,6 +371,7 @@ FT_BEGIN_HEADER
|
||||
#if defined( __arm__ ) && \
|
||||
( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
|
||||
!( defined( __CC_ARM ) || defined( __ARMCC__ ) )
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
|
||||
|
||||
/* documentation is in freetype.h */
|
||||
@ -399,7 +401,9 @@ FT_BEGIN_HEADER
|
||||
/* ( __thumb2__ || !__thumb__ ) && */
|
||||
/* !( __CC_ARM || __ARMCC__ ) */
|
||||
|
||||
|
||||
#if defined( __i386__ )
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
|
||||
|
||||
/* documentation is in freetype.h */
|
||||
@ -468,6 +472,62 @@ FT_BEGIN_HEADER
|
||||
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
|
||||
#if defined( __GNUC__ ) && defined( __x86_64__ )
|
||||
|
||||
#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
|
||||
|
||||
static __inline__ FT_Int32
|
||||
FT_MulFix_x86_64( FT_Int32 a,
|
||||
FT_Int32 b )
|
||||
{
|
||||
/* Temporarily disable the warning that C90 doesn't support */
|
||||
/* `long long'. */
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wlong-long"
|
||||
|
||||
#if 1
|
||||
/* Technically not an assembly fragment, but GCC does a really good */
|
||||
/* job at inlining it and generating good machine code for it. */
|
||||
long long ret, tmp;
|
||||
|
||||
|
||||
ret = (long long)a * b;
|
||||
tmp = ret >> 63;
|
||||
ret += 0x8000 + tmp;
|
||||
|
||||
return (FT_Int32)( ret >> 16 );
|
||||
#else
|
||||
|
||||
/* For some reason, GCC 4.6 on Ubuntu 12.04 generates invalid machine */
|
||||
/* code from the lines below. The main issue is that `wide_a' is not */
|
||||
/* properly initialized by sign-extending `a'. Instead, the generated */
|
||||
/* machine code assumes that the register that contains `a' on input */
|
||||
/* can be used directly as a 64-bit value, which is wrong most of the */
|
||||
/* time. */
|
||||
long long wide_a = (long long)a;
|
||||
long long wide_b = (long long)b;
|
||||
long long result;
|
||||
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"imul %2, %1\n"
|
||||
"mov %1, %0\n"
|
||||
"sar $63, %0\n"
|
||||
"lea 0x8000(%1, %0), %0\n"
|
||||
"sar $16, %0\n"
|
||||
: "=&r"(result), "=&r"(wide_a)
|
||||
: "r"(wide_b)
|
||||
: "cc" );
|
||||
|
||||
return (FT_Int32)result;
|
||||
#endif
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
}
|
||||
|
||||
#endif /* __GNUC__ && __x86_64__ */
|
||||
|
||||
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user