Optimize compression by using NEON intrinsics.
This commit is contained in:
parent
45c66dd298
commit
b2e56f7f7f
@ -21,6 +21,9 @@
|
||||
***************************************/
|
||||
#include "compiler.h"
|
||||
#include "mem.h"
|
||||
#ifdef __aarch64__
|
||||
#include "arm_neon.h"
|
||||
#endif
|
||||
#include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
|
||||
#include "error_private.h"
|
||||
#define ZSTD_STATIC_LINKING_ONLY
|
||||
@ -224,10 +227,22 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
|
||||
/*-*******************************************
|
||||
* Shared functions to include for inlining
|
||||
*********************************************/
|
||||
static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
|
||||
static void ZSTD_copy8(void* dst, const void* src) {
|
||||
#ifdef __aarch64__
|
||||
vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
|
||||
#else
|
||||
memcpy(dst, src, 8);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
|
||||
static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
|
||||
static void ZSTD_copy16(void* dst, const void* src) {
|
||||
#ifdef __aarch64__
|
||||
vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
|
||||
#else
|
||||
memcpy(dst, src, 16);
|
||||
#endif
|
||||
}
|
||||
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
|
||||
|
||||
#define WILDCOPY_OVERLENGTH 32
|
||||
@ -269,8 +284,10 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
|
||||
* one COPY16() in the first call. Then, do two calls per loop since
|
||||
* at that point it is more likely to have a high trip count.
|
||||
*/
|
||||
#ifndef __aarch64__
|
||||
COPY16(op, ip);
|
||||
if (op >= oend) return;
|
||||
#endif
|
||||
do {
|
||||
COPY16(op, ip);
|
||||
COPY16(op, ip);
|
||||
|
@ -1865,6 +1865,20 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
|
||||
|
||||
for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
|
||||
int column;
|
||||
#ifdef __aarch64__
|
||||
for (column=0; column<ZSTD_ROWSIZE; column+=4) {
|
||||
uint32x4_t const zero = {0, 0, 0, 0};
|
||||
uint32x4_t const reducer = vdupq_n_u32(reducerValue);
|
||||
uint32x4_t data = vld1q_u32(table + cellNb);
|
||||
if (preserveMark) {
|
||||
uint32x4_t const mark = {ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK};
|
||||
data = vbslq_u32(vceqq_u32(data, mark), vaddq_u32(data, reducer), data);
|
||||
}
|
||||
data = vbslq_u32(vcltq_u32(data, reducer), zero, vsubq_u32(data, reducer));
|
||||
vst1q_u32(table + cellNb, data);
|
||||
cellNb+=4;
|
||||
}
|
||||
#else
|
||||
for (column=0; column<ZSTD_ROWSIZE; column++) {
|
||||
if (preserveMark) {
|
||||
U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
|
||||
@ -1873,7 +1887,9 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
|
||||
if (table[cellNb] < reducerValue) table[cellNb] = 0;
|
||||
else table[cellNb] -= reducerValue;
|
||||
cellNb++;
|
||||
} }
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
|
||||
|
Loading…
Reference in New Issue
Block a user