Optimize FT_MulFix for x86_64 GCC builds.

This patch provides an optimized `FT_MulFix' implementation for
x86_64 machines when FreeType is built with GCC, or compatible
compilers like Clang.

Example:
  bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf

Before:

  Load                       4.863 us/op
  Load_Advances (Normal)     4.816 us/op
  Load_Advances (Fast)       0.028 us/op
  Render                     2.753 us/op
  Get_Glyph                  0.463 us/op
  Get_CBox                   0.077 us/op
  Get_Char_Index             0.023 us/op
  Iterate CMap              13.898 us/op
  New_Face                  12.368 us/op
  Embolden                   0.028 us/op
  Get_BBox                   0.302 us/op

After:

  Load                       4.617 us/op
  Load_Advances (Normal)     4.645 us/op
  Load_Advances (Fast)       0.027 us/op
  Render                     2.789 us/op
  Get_Glyph                  0.460 us/op
  Get_CBox                   0.077 us/op
  Get_Char_Index             0.024 us/op
  Iterate CMap              13.403 us/op
  New_Face                  12.278 us/op
  Embolden                   0.028 us/op
  Get_BBox                   0.301 us/op

* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
(FT_MulFix_x86_64): New function.
This commit is contained in:
David Turner 2013-07-16 12:52:18 +02:00 committed by Werner Lemberg
parent a5f33eeb8a
commit b28908860d
3 changed files with 161 additions and 0 deletions

View File

@ -1,3 +1,45 @@
2013-07-16 David Turner <digit@google.com>
Optimize FT_MulFix for x86_64 GCC builds.
This patch provides an optimized `FT_MulFix' implementation for
x86_64 machines when FreeType is built with GCC, or compatible
compilers like Clang.
Example:
bin/ftbench -p -t 5 -s 14 -f 0008 Arial.ttf
Before:
Load 4.863 us/op
Load_Advances (Normal) 4.816 us/op
Load_Advances (Fast) 0.028 us/op
Render 2.753 us/op
Get_Glyph 0.463 us/op
Get_CBox 0.077 us/op
Get_Char_Index 0.023 us/op
Iterate CMap 13.898 us/op
New_Face 12.368 us/op
Embolden 0.028 us/op
Get_BBox 0.302 us/op
After:
Load 4.617 us/op
Load_Advances (Normal) 4.645 us/op
Load_Advances (Fast) 0.027 us/op
Render 2.789 us/op
Get_Glyph 0.460 us/op
Get_CBox 0.077 us/op
Get_Char_Index 0.024 us/op
Iterate CMap 13.403 us/op
New_Face 12.278 us/op
Embolden 0.028 us/op
Get_BBox 0.301 us/op
* builds/unix/ftconfig.in, include/freetype/config/ftconfig.h
(FT_MulFix_x86_64): New function.
2013-07-16 David Turner <digit@google.com>
Speed up ARMv7 support.

View File

@ -366,6 +366,7 @@ FT_BEGIN_HEADER
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@ -428,7 +429,9 @@ FT_BEGIN_HEADER
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
#if defined( __i386__ )
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@ -497,6 +500,62 @@ FT_BEGIN_HEADER
#endif /* _MSC_VER */
#if defined( __GNUC__ ) && defined( __x86_64__ )
#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
/* Multiply `a' by `b' and divide the 64-bit product by 0x10000,       */
/* rounding to the nearest integer with ties rounded away from zero.   */
/* The result is then converted to `FT_Int32'.                         */
/*                                                                     */
/* The code relies on `>>' being an arithmetic (sign-propagating)      */
/* shift for negative `long long' values, which GCC-compatible         */
/* compilers provide.                                                  */
static __inline__ FT_Int32
FT_MulFix_x86_64( FT_Int32  a,
                  FT_Int32  b )
{
  /* Temporarily disable the warning that C90 doesn't support        */
  /* `long long'.  `#pragma GCC diagnostic push/pop' (and the use of */
  /* diagnostic pragmas inside a function) needs GCC >= 4.6; Clang   */
  /* accepts the same pragmas but reports itself as GCC 4.2, so it   */
  /* is tested for explicitly.                                       */
#if defined( __clang__ )                                          || \
    ( __GNUC__ > 4 )                                              || \
    ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wlong-long"
#endif

#if 1
  /* Technically not an assembly fragment, but GCC does a really good */
  /* job at inlining it and generating good machine code for it.      */
  long long  ret, tmp;


  ret  = (long long)a * b;

  /* `tmp' is -1 for a negative product and 0 otherwise; adding it    */
  /* to the rounding bias makes exact halves round away from zero.    */
  tmp  = ret >> 63;
  ret += 0x8000 + tmp;

  return (FT_Int32)( ret >> 16 );
#else
  /* Hand-written variant, kept for reference but disabled.           */
  /*                                                                  */
  /* `wide_a' is read by `imul' before being overwritten, so it must  */
  /* be declared as a read-write operand (`+r').  Declaring it as a   */
  /* write-only output (`=&r') allows the compiler to discard the     */
  /* sign-extending initialization of `wide_a'; GCC 4.6 on Ubuntu     */
  /* 12.04 was seen to do exactly that, using the register holding    */
  /* `a' directly as a 64-bit value.                                  */
  long long  wide_a = (long long)a;
  long long  wide_b = (long long)b;
  long long  result;


  __asm__ __volatile__ (
    "imul %2, %1\n"
    "mov %1, %0\n"
    "sar $63, %0\n"
    "lea 0x8000(%1, %0), %0\n"
    "sar $16, %0\n"
    : "=&r"(result), "+r"(wide_a)
    : "r"(wide_b)
    : "cc" );

  return (FT_Int32)result;
#endif

#if defined( __clang__ )                                          || \
    ( __GNUC__ > 4 )                                              || \
    ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) )
#pragma GCC diagnostic pop
#endif
}
#endif /* __GNUC__ && __x86_64__ */
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */

View File

@ -338,6 +338,7 @@ FT_BEGIN_HEADER
/* These must be defined `static __inline__' with GCC. */
#if defined( __CC_ARM ) || defined( __ARMCC__ ) /* RVCT */
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@ -370,6 +371,7 @@ FT_BEGIN_HEADER
#if defined( __arm__ ) && \
( !defined( __thumb__ ) || defined( __thumb2__ ) ) && \
!( defined( __CC_ARM ) || defined( __ARMCC__ ) )
#define FT_MULFIX_ASSEMBLER FT_MulFix_arm
/* documentation is in freetype.h */
@ -399,7 +401,9 @@ FT_BEGIN_HEADER
/* ( __thumb2__ || !__thumb__ ) && */
/* !( __CC_ARM || __ARMCC__ ) */
#if defined( __i386__ )
#define FT_MULFIX_ASSEMBLER FT_MulFix_i386
/* documentation is in freetype.h */
@ -468,6 +472,62 @@ FT_BEGIN_HEADER
#endif /* _MSC_VER */
#if defined( __GNUC__ ) && defined( __x86_64__ )
#define FT_MULFIX_ASSEMBLER FT_MulFix_x86_64
/* Multiply `a' by `b' and divide the 64-bit product by 0x10000,       */
/* rounding to the nearest integer with ties rounded away from zero.   */
/* The result is then converted to `FT_Int32'.                         */
/*                                                                     */
/* The code relies on `>>' being an arithmetic (sign-propagating)      */
/* shift for negative `long long' values, which GCC-compatible         */
/* compilers provide.                                                  */
static __inline__ FT_Int32
FT_MulFix_x86_64( FT_Int32  a,
                  FT_Int32  b )
{
  /* Temporarily disable the warning that C90 doesn't support        */
  /* `long long'.  `#pragma GCC diagnostic push/pop' (and the use of */
  /* diagnostic pragmas inside a function) needs GCC >= 4.6; Clang   */
  /* accepts the same pragmas but reports itself as GCC 4.2, so it   */
  /* is tested for explicitly.                                       */
#if defined( __clang__ )                                          || \
    ( __GNUC__ > 4 )                                              || \
    ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wlong-long"
#endif

#if 1
  /* Technically not an assembly fragment, but GCC does a really good */
  /* job at inlining it and generating good machine code for it.      */
  long long  ret, tmp;


  ret  = (long long)a * b;

  /* `tmp' is -1 for a negative product and 0 otherwise; adding it    */
  /* to the rounding bias makes exact halves round away from zero.    */
  tmp  = ret >> 63;
  ret += 0x8000 + tmp;

  return (FT_Int32)( ret >> 16 );
#else
  /* Hand-written variant, kept for reference but disabled.           */
  /*                                                                  */
  /* `wide_a' is read by `imul' before being overwritten, so it must  */
  /* be declared as a read-write operand (`+r').  Declaring it as a   */
  /* write-only output (`=&r') allows the compiler to discard the     */
  /* sign-extending initialization of `wide_a'; GCC 4.6 on Ubuntu     */
  /* 12.04 was seen to do exactly that, using the register holding    */
  /* `a' directly as a 64-bit value.                                  */
  long long  wide_a = (long long)a;
  long long  wide_b = (long long)b;
  long long  result;


  __asm__ __volatile__ (
    "imul %2, %1\n"
    "mov %1, %0\n"
    "sar $63, %0\n"
    "lea 0x8000(%1, %0), %0\n"
    "sar $16, %0\n"
    : "=&r"(result), "+r"(wide_a)
    : "r"(wide_b)
    : "cc" );

  return (FT_Int32)result;
#endif

#if defined( __clang__ )                                          || \
    ( __GNUC__ > 4 )                                              || \
    ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 6 ) )
#pragma GCC diagnostic pop
#endif
}
#endif /* __GNUC__ && __x86_64__ */
#endif /* !FT_CONFIG_OPTION_NO_ASSEMBLER */