Optimize compression by using NEON intrinsics on AArch64.

This commit is contained in:
caoyzh 2020-03-16 11:07:31 +08:00 committed by Nick Terrell
parent 45c66dd298
commit b2e56f7f7f
2 changed files with 36 additions and 3 deletions

View File

@ -21,6 +21,9 @@
***************************************/
#include "compiler.h"
#include "mem.h"
#ifdef __aarch64__
#include "arm_neon.h"
#endif
#include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
#include "error_private.h"
#define ZSTD_STATIC_LINKING_ONLY
@ -224,10 +227,22 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
/*-*******************************************
* Shared functions to include for inlining
*********************************************/
/* Copies exactly 8 bytes from src to dst using memcpy(). */
static void ZSTD_copy8(void* dst, const void* src)
{
    size_t const chunkSize = 8;
    memcpy(dst, src, chunkSize);
}
/* Copies exactly 8 bytes from src to dst.
 * When __aarch64__ is defined the copy is performed with one NEON
 * 8-lane byte load (vld1_u8) followed by one store (vst1_u8);
 * every other target uses memcpy(). */
static void ZSTD_copy8(void* dst, const void* src)
{
#ifndef __aarch64__
    memcpy(dst, src, 8);
#else
    uint8x8_t const chunk = vld1_u8((const uint8_t*)src);
    vst1_u8((uint8_t*)dst, chunk);
#endif
}
#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
/* Copies exactly 16 bytes from src to dst using memcpy(). */
static void ZSTD_copy16(void* dst, const void* src)
{
    size_t const chunkSize = 16;
    memcpy(dst, src, chunkSize);
}
/* Copies exactly 16 bytes from src to dst.
 * When __aarch64__ is defined the copy is performed with one NEON
 * 16-lane byte load (vld1q_u8) followed by one store (vst1q_u8);
 * every other target uses memcpy(). */
static void ZSTD_copy16(void* dst, const void* src)
{
#ifndef __aarch64__
    memcpy(dst, src, 16);
#else
    uint8x16_t const chunk = vld1q_u8((const uint8_t*)src);
    vst1q_u8((uint8_t*)dst, chunk);
#endif
}
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
#define WILDCOPY_OVERLENGTH 32
@ -269,8 +284,10 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
* one COPY16() in the first call. Then, do two calls per loop since
* at that point it is more likely to have a high trip count.
*/
#ifndef __aarch64__
COPY16(op, ip);
if (op >= oend) return;
#endif
do {
COPY16(op, ip);
COPY16(op, ip);

View File

@ -1865,6 +1865,20 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
int column;
#ifdef __aarch64__
for (column=0; column<ZSTD_ROWSIZE; column+=4) {
uint32x4_t const zero = {0, 0, 0, 0};
uint32x4_t const reducer = vdupq_n_u32(reducerValue);
uint32x4_t data = vld1q_u32(table + cellNb);
if (preserveMark) {
uint32x4_t const mark = {ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK};
data = vbslq_u32(vceqq_u32(data, mark), vaddq_u32(data, reducer), data);
}
data = vbslq_u32(vcltq_u32(data, reducer), zero, vsubq_u32(data, reducer));
vst1q_u32(table + cellNb, data);
cellNb+=4;
}
#else
for (column=0; column<ZSTD_ROWSIZE; column++) {
if (preserveMark) {
U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
@ -1873,7 +1887,9 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
if (table[cellNb] < reducerValue) table[cellNb] = 0;
else table[cellNb] -= reducerValue;
cellNb++;
} }
}
#endif
}
}
static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)