Add FLAC__SSE_SUPPORTED and FLAC__SSE2_SUPPORTED flags.

* Allow compiling using GCC GCC w/o SSE support.
* Allow SSE4.1 intrinsic functions to be enabled.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
This commit is contained in:
Erik de Castro Lopo 2014-01-30 21:49:51 +11:00
parent c2747bec1c
commit d40e986a1e
11 changed files with 87 additions and 26 deletions

View File

@ -199,12 +199,4 @@ int flac_snprintf(char *str, size_t size, const char *fmt, ...);
};
#endif
/* SSSE3, SSE4 support: MSVS 2008, GCC 4.3 -- currently disabled, Intel Compiler 10.0 */
#if ( defined _MSC_VER && _MSC_VER >= 1500 ) \
|| ( 0 && defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) ) \
|| ( defined __INTEL_COMPILER && __INTEL_COMPILER >= 1000 )
#define FLAC__SSSE3_SUPPORTED 1
#define FLAC__SSE4_SUPPORTED 1
#endif
#endif /* FLAC__SHARE__COMPAT_H */

View File

@ -39,6 +39,47 @@
#include <config.h>
#endif
/* SSE intrinsics support by ICC/MSVC/GCC */
#if defined __INTEL_COMPILER
#define FLAC__SSE_TARGET(x)
#define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1
#if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
#define FLAC__SSSE3_SUPPORTED 1
#define FLAC__SSE4_1_SUPPORTED 1
#endif
#elif defined _MSC_VER
#define FLAC__SSE_TARGET(x)
#define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1
#if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
#define FLAC__SSSE3_SUPPORTED 1
#define FLAC__SSE4_1_SUPPORTED 1
#endif
#elif defined __GNUC__
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* since GCC 4.9 -msse.. compiler options aren't necessary */
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
#define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1
#define FLAC__SSSE3_SUPPORTED 1
#define FLAC__SSE4_1_SUPPORTED 1
#else /* for GCC older than 4.9 */
#define FLAC__SSE_TARGET(x)
#ifdef __SSE__
#define FLAC__SSE_SUPPORTED 1
#endif
#ifdef __SSE2__
#define FLAC__SSE2_SUPPORTED 1
#endif
#ifdef __SSSE3__
#define FLAC__SSSE3_SUPPORTED 1
#endif
#ifdef __SSE4_1__
#define FLAC__SSE4_1_SUPPORTED 1
#endif
#endif /* GCC version */
#endif /* compiler version */
typedef enum {
FLAC__CPUINFO_TYPE_IA32,
FLAC__CPUINFO_TYPE_X86_64,

View File

@ -37,6 +37,7 @@
#include <config.h>
#endif
#include "private/cpu.h"
#include "private/float.h"
#include "FLAC/format.h"
@ -80,10 +81,12 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], u
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
# endif
# endif
#endif
@ -156,9 +159,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# ifdef FLAC__SSE4_SUPPORTED
# endif
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# endif
# endif
@ -195,9 +200,9 @@ void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], u
void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif /* FLAC__CPU_IA32 || FLAC__CPU_PPC */
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE4_SUPPORTED
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif
# endif
# endif
#endif /* FLAC__NO_ASM */

View File

@ -38,11 +38,13 @@
#endif
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "share/compat.h"
#include "private/cpu.h"
#include "FLAC/format.h"
#ifdef FLAC__SSE2_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
#endif
#ifdef FLAC__SSSE3_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],

View File

@ -37,13 +37,15 @@
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <xmmintrin.h> /* SSE */
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm2, xmm5;
@ -80,6 +82,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[],
_mm_storeu_ps(autoc, xmm5);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
@ -125,6 +128,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[],
_mm_storeu_ps(autoc+4, xmm6);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
@ -178,6 +182,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[]
_mm_storeu_ps(autoc+8, xmm7);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
@ -241,6 +246,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[]
_mm_storeu_ps(autoc+12,xmm9);
}
#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */

View File

@ -37,13 +37,15 @@
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <emmintrin.h> /* SSE2 */
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
@ -787,6 +789,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
@ -1313,6 +1316,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
}
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */

View File

@ -34,16 +34,14 @@
# include <config.h>
#endif
#include "share/compat.h"
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#ifdef FLAC__SSE4_SUPPORTED
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <smmintrin.h> /* SSE4.1 */
@ -68,6 +66,7 @@
#define DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#endif
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
@ -594,6 +593,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
}
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
int i;
@ -1120,7 +1120,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], un
}
}
#endif /* FLAC__SSE4_SUPPORTED */
#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */

View File

@ -417,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
}
#endif
#ifdef FLAC__HAS_X86INTRIN
# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
# if defined FLAC__SSE4_1_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
if(decoder->private_->cpuinfo.ia32.sse41)
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
# endif

View File

@ -920,11 +920,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov;
# endif /* FLAC__HAS_NASM */
# ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse2) {
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
}
# ifdef FLAC__SSE4_SUPPORTED
# endif
# ifdef FLAC__SSE4_1_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse41)
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41;
# endif
@ -932,6 +934,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
# elif defined FLAC__CPU_X86_64
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
# ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
else if(encoder->protected_->max_lpc_order < 8)
@ -940,9 +943,11 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
# endif
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
# endif
# endif /* FLAC__HAS_X86INTRIN */
# endif /* FLAC__CPU_... */
}
@ -956,15 +961,19 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
else
# endif
# ifdef FLAC__SSE2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse2)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
# endif
# elif defined FLAC__CPU_X86_64
# ifdef FLAC__SSSE3_SUPPORTED
if(encoder->private_->cpuinfo.x86_64.ssse3)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
else
# endif
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
# endif
# endif /* FLAC__CPU_... */
}
#endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */

View File

@ -36,12 +36,14 @@
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/stream_encoder.h"
#ifdef FLAC__SSE2_SUPPORTED
#include <stdlib.h> /* for abs() */
#include <emmintrin.h> /* SSE2 */
#include "FLAC/assert.h"
#include "private/stream_encoder.h"
FLAC__SSE_TARGET("sse2")
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
@ -157,5 +159,6 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
}
}
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */

View File

@ -34,17 +34,16 @@
# include <config.h>
#endif
#include "share/compat.h"
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/stream_encoder.h"
#ifdef FLAC__SSSE3_SUPPORTED
#include <stdlib.h> /* for abs() */
#include <tmmintrin.h> /* SSSE3 */
#include "FLAC/assert.h"
#include "private/stream_encoder.h"
FLAC__SSE_TARGET("ssse3")
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{