Integrate dictBuilder into libzstd
This commit is contained in:
parent
82260ddd8a
commit
7d36028078
@ -1,6 +1,6 @@
|
|||||||
# ##########################################################################
|
# ##########################################################################
|
||||||
# Dict Builder - Makefile
|
# Dict Builder - Makefile
|
||||||
# Copyright (C) Yann Collet 2015
|
# Copyright (C) Yann Collet 2016
|
||||||
#
|
#
|
||||||
# GPL v2 License
|
# GPL v2 License
|
||||||
#
|
#
|
||||||
@ -19,8 +19,7 @@
|
|||||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
#
|
#
|
||||||
# You can contact the author at :
|
# You can contact the author at :
|
||||||
# - ZSTD source repository : http://code.google.com/p/zstd/
|
# - ZSTD homepage : http://www.zstd.net/
|
||||||
# - Public forum : https://groups.google.com/forum/#!forum/lz4c
|
|
||||||
# ##########################################################################
|
# ##########################################################################
|
||||||
|
|
||||||
CPPFLAGS= -I../lib
|
CPPFLAGS= -I../lib
|
||||||
@ -47,7 +46,7 @@ default: dictBuilder
|
|||||||
|
|
||||||
all: dictBuilder
|
all: dictBuilder
|
||||||
|
|
||||||
dictBuilder: dictBuilder.c dibcli.c divsufsort.c sssort.c trsort.c $(ZSTDDIR)/huff0.c $(ZSTDDIR)/fse.c $(ZSTDDIR)/zstd_decompress.c
|
dictBuilder: dibio.c dibcli.c $(ZSTDDIR)/dictBuilder.c $(ZSTDDIR)/huff0.c $(ZSTDDIR)/fse.c $(ZSTDDIR)/zstd_decompress.c $(ZSTDDIR)/zstd_compress.c $(ZSTDDIR)/divsufsort.c
|
||||||
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
$(CC) $(FLAGS) $^ -o $@$(EXT)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
@ -37,8 +37,8 @@
|
|||||||
#include <stdlib.h> /* exit, calloc, free */
|
#include <stdlib.h> /* exit, calloc, free */
|
||||||
#include <string.h> /* strcmp, strlen */
|
#include <string.h> /* strcmp, strlen */
|
||||||
#include <stdio.h> /* fprintf, getchar */
|
#include <stdio.h> /* fprintf, getchar */
|
||||||
|
#include "dibio.h"
|
||||||
#include "dictBuilder.h"
|
#include "zstd.h" /* version numbers */
|
||||||
|
|
||||||
|
|
||||||
/*-************************************
|
/*-************************************
|
||||||
@ -48,7 +48,7 @@
|
|||||||
#ifndef PROGRAM_VERSION
|
#ifndef PROGRAM_VERSION
|
||||||
# define QUOTE(str) #str
|
# define QUOTE(str) #str
|
||||||
# define EXP_Q(str) QUOTE(str)
|
# define EXP_Q(str) QUOTE(str)
|
||||||
# define PROGRAM_VERSION "v" EXP_Q(DiB_VERSION_MAJOR) "." EXP_Q(DiB_VERSION_MINOR) "." EXP_Q(DiB_VERSION_RELEASE)
|
# define PROGRAM_VERSION "v" EXP_Q(ZSTD_VERSION_MAJOR) "." EXP_Q(ZSTD_VERSION_MINOR) "." EXP_Q(ZSTD_VERSION_RELEASE)
|
||||||
#endif
|
#endif
|
||||||
#define AUTHOR "Yann Collet"
|
#define AUTHOR "Yann Collet"
|
||||||
#define WELCOME_MESSAGE "*** %s %s %i-bits, by %s ***\n", PROGRAM_DESCRIPTION, PROGRAM_VERSION, (int)(sizeof(void*)*8), AUTHOR
|
#define WELCOME_MESSAGE "*** %s %s %i-bits, by %s ***\n", PROGRAM_DESCRIPTION, PROGRAM_VERSION, (int)(sizeof(void*)*8), AUTHOR
|
||||||
@ -248,7 +248,7 @@ int main(int argCount, const char** argv)
|
|||||||
|
|
||||||
/* building ... */
|
/* building ... */
|
||||||
{
|
{
|
||||||
DiB_params_t param;
|
ZDICT_params_t param;
|
||||||
param.selectivityLevel = selectionLevel;
|
param.selectivityLevel = selectionLevel;
|
||||||
param.compressionLevel = cLevel;
|
param.compressionLevel = cLevel;
|
||||||
DiB_setNotificationLevel(g_displayLevel);
|
DiB_setNotificationLevel(g_displayLevel);
|
||||||
|
281
dictBuilder/dibio.c
Normal file
281
dictBuilder/dibio.c
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
/*
|
||||||
|
dibio - I/O API for dictionary builder
|
||||||
|
Copyright (C) Yann Collet 2016
|
||||||
|
|
||||||
|
GPL v2 License
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
You can contact the author at :
|
||||||
|
- zstd homepage : http://www.zstd.net/
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*-**************************************
|
||||||
|
* Compiler Options
|
||||||
|
****************************************/
|
||||||
|
/* Disable some Visual warning messages */
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
# define _CRT_SECURE_NO_WARNINGS /* fopen */
|
||||||
|
# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Unix Large Files support (>4GB) */
|
||||||
|
#define _FILE_OFFSET_BITS 64
|
||||||
|
#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
|
||||||
|
# define _LARGEFILE_SOURCE
|
||||||
|
#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
|
||||||
|
# define _LARGEFILE64_SOURCE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Includes
|
||||||
|
***************************************/
|
||||||
|
#include <stdlib.h> /* malloc, free */
|
||||||
|
#include <string.h> /* memset */
|
||||||
|
#include <stdio.h> /* fprintf, fopen, ftello64 */
|
||||||
|
#include <sys/types.h> /* stat64 */
|
||||||
|
#include <sys/stat.h> /* stat64 */
|
||||||
|
#include <time.h> /* clock */
|
||||||
|
|
||||||
|
#include "mem.h" /* read */
|
||||||
|
#include "error_private.h"
|
||||||
|
#include "dictBuilder_static.h"
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Compiler specifics
|
||||||
|
***************************************/
|
||||||
|
#if !defined(S_ISREG)
|
||||||
|
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Constants
|
||||||
|
***************************************/
|
||||||
|
#define KB *(1 <<10)
|
||||||
|
#define MB *(1 <<20)
|
||||||
|
#define GB *(1U<<30)
|
||||||
|
|
||||||
|
#define DICTLISTSIZE 10000
|
||||||
|
#define MEMMULT 11
|
||||||
|
static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
|
||||||
|
|
||||||
|
#define NOISELENGTH 32
|
||||||
|
#define PRIME1 2654435761U
|
||||||
|
#define PRIME2 2246822519U
|
||||||
|
|
||||||
|
#define MINRATIO 4
|
||||||
|
static const U32 g_compressionLevel_default = 5;
|
||||||
|
static const U32 g_selectivity_default = 9;
|
||||||
|
static const size_t g_provision_entropySize = 200;
|
||||||
|
static const size_t g_min_fast_dictContent = 192;
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Console display
|
||||||
|
***************************************/
|
||||||
|
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
||||||
|
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
||||||
|
/* DiB_setNotificationLevel() :
 * Store the requested verbosity in this module's display level
 * and pass the same value on to ZDICT_setNotificationLevel(). */
void DiB_setNotificationLevel(unsigned l)
{
    g_displayLevel = l;
    ZDICT_setNotificationLevel(l);
}
|
||||||
|
|
||||||
|
/* DiB_printHex() :
 * Emit `length` bytes starting at `ptr` through DISPLAYLEVEL(dlevel, ...),
 * one character per byte. Bytes outside the printable ASCII range [32, 126]
 * are shown as '.'. */
void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
{
    const BYTE* cursor = (const BYTE*)ptr;
    size_t remaining;

    for (remaining = length; remaining > 0; remaining--) {
        const BYTE raw = *cursor++;
        const BYTE shown = (raw >= 32 && raw <= 126) ? raw : (BYTE)'.';   /* mask non-printable char */
        DISPLAYLEVEL(dlevel, "%c", shown);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Exceptions
|
||||||
|
***************************************/
|
||||||
|
#ifndef DEBUG
|
||||||
|
# define DEBUG 0
|
||||||
|
#endif
|
||||||
|
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
|
||||||
|
#define EXM_THROW(error, ...) \
|
||||||
|
{ \
|
||||||
|
DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
|
||||||
|
DISPLAYLEVEL(1, "Error %i : ", error); \
|
||||||
|
DISPLAYLEVEL(1, __VA_ARGS__); \
|
||||||
|
DISPLAYLEVEL(1, "\n"); \
|
||||||
|
exit(error); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ********************************************************
|
||||||
|
* Helper functions
|
||||||
|
**********************************************************/
|
||||||
|
/* DiB_isError() :
 * Thin wrapper : returns the result of ERR_isError() for `errorCode`. */
unsigned DiB_isError(size_t errorCode)
{
    return ERR_isError(errorCode);
}
|
||||||
|
|
||||||
|
/* DiB_getErrorName() :
 * Thin wrapper : returns the string provided by ERR_getErrorName() for `errorCode`. */
const char* DiB_getErrorName(size_t errorCode)
{
    return ERR_getErrorName(errorCode);
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ********************************************************
|
||||||
|
* File related operations
|
||||||
|
**********************************************************/
|
||||||
|
/* DiB_getFileSize() :
 * Query the size of `infilename` through stat() (_stat64() under MSVC).
 * @return : size in bytes, or 0 when the query fails
 *           or the path is not a regular file. */
static unsigned long long DiB_getFileSize(const char* infilename)
{
#if defined(_MSC_VER)
    struct _stat64 info;
    const int failed = _stat64(infilename, &info);
#else
    struct stat info;
    const int failed = stat(infilename, &info);
#endif
    if (failed != 0) return 0;               /* cannot be queried */
    if (!S_ISREG(info.st_mode)) return 0;    /* directory, device, ... */
    return (unsigned long long)info.st_size;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* DiB_getTotalFileSize() :
 * Add up DiB_getFileSize() over the first `nbFiles` entries of `fileNamesTable`.
 * Entries for which DiB_getFileSize() returns 0 contribute nothing.
 * @return : cumulated size, in bytes. */
static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
{
    unsigned long long sum = 0;
    unsigned idx = 0;

    while (idx < nbFiles) {
        sum += DiB_getFileSize(fileNamesTable[idx]);
        idx++;
    }
    return sum;
}
|
||||||
|
|
||||||
|
|
||||||
|
/* DiB_loadFiles() :
 * Read the files listed in `fileNamesTable` back-to-back into `buffer`.
 * `fileSizes[n]` receives the number of bytes stored for file n.
 * A file that no longer fits in the remaining space is recorded with size 0
 * (nothing is read from it); the loop then proceeds with the next file.
 * Terminates the program through EXM_THROW() when a file cannot be opened or read. */
static void DiB_loadFiles(void* buffer, size_t bufferSize,
                          size_t* fileSizes,
                          const char** fileNamesTable, unsigned nbFiles)
{
    char* const dst = (char*)buffer;
    size_t filled = 0;   /* bytes already stored into dst */
    unsigned idx;

    for (idx = 0; idx < nbFiles; idx++) {
        const char* const name = fileNamesTable[idx];
        unsigned long long toLoad = DiB_getFileSize(name);
        size_t got;
        FILE* const in = fopen(name, "rb");

        if (in == NULL) EXM_THROW(10, "impossible to open file %s", name);
        DISPLAYLEVEL(2, "Loading %s... \r", name);

        if (toLoad > bufferSize - filled) toLoad = 0;   /* not enough room left : load nothing from this file */
        got = fread(dst + filled, 1, (size_t)toLoad, in);
        if (got != (size_t)toLoad) EXM_THROW(11, "could not read %s", name);

        filled += got;
        fileSizes[idx] = (size_t)toLoad;
        fclose(in);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/*-********************************************************
|
||||||
|
* Dictionary training functions
|
||||||
|
**********************************************************/
|
||||||
|
static size_t DiB_findMaxMem(unsigned long long requiredMem)
|
||||||
|
{
|
||||||
|
size_t step = 8 MB;
|
||||||
|
void* testmem = NULL;
|
||||||
|
|
||||||
|
requiredMem = (((requiredMem >> 23) + 1) << 23);
|
||||||
|
requiredMem += 2 * step;
|
||||||
|
if (requiredMem > maxMemory) requiredMem = maxMemory;
|
||||||
|
|
||||||
|
while (!testmem) {
|
||||||
|
requiredMem -= step;
|
||||||
|
testmem = malloc((size_t)requiredMem);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(testmem);
|
||||||
|
return (size_t)(requiredMem - step);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void DiB_fillNoise(void* buffer, size_t length)
|
||||||
|
{
|
||||||
|
unsigned acc = PRIME1;
|
||||||
|
size_t p=0;;
|
||||||
|
|
||||||
|
for (p=0; p<length; p++) {
|
||||||
|
acc *= PRIME2;
|
||||||
|
((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void DiB_saveDict(const char* dictFileName,
|
||||||
|
const void* buff, size_t buffSize)
|
||||||
|
{
|
||||||
|
FILE* f;
|
||||||
|
size_t n;
|
||||||
|
|
||||||
|
f = fopen(dictFileName, "wb");
|
||||||
|
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
|
||||||
|
|
||||||
|
n = fwrite(buff, 1, buffSize, f);
|
||||||
|
if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName)
|
||||||
|
|
||||||
|
n = (size_t)fclose(f);
|
||||||
|
if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
const char** fileNamesTable, unsigned nbFiles,
|
||||||
|
ZDICT_params_t params)
|
||||||
|
{
|
||||||
|
void* srcBuffer;
|
||||||
|
size_t benchedSize;
|
||||||
|
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||||
|
unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
|
||||||
|
void* dictBuffer = malloc(maxDictSize);
|
||||||
|
size_t dictSize;
|
||||||
|
int result = 0;
|
||||||
|
|
||||||
|
/* init */
|
||||||
|
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
||||||
|
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
|
||||||
|
if (benchedSize < totalSizeToLoad)
|
||||||
|
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
||||||
|
|
||||||
|
/* Memory allocation & restrictions */
|
||||||
|
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
|
||||||
|
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||||
|
|
||||||
|
/* Load input buffer */
|
||||||
|
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||||
|
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
|
|
||||||
|
/* call buffer version */
|
||||||
|
dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
|
||||||
|
srcBuffer, fileSizes, nbFiles,
|
||||||
|
params);
|
||||||
|
if (ZDICT_isError(dictSize)) {
|
||||||
|
DISPLAYLEVEL(1, "dictionary training failed : %s", ZDICT_getErrorName(dictSize)); /* should not happen */
|
||||||
|
result = 1;
|
||||||
|
goto _cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* save dict */
|
||||||
|
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||||
|
DiB_saveDict(dictFileName, dictBuffer, dictSize);
|
||||||
|
|
||||||
|
/* clean up */
|
||||||
|
_cleanup:
|
||||||
|
free(srcBuffer);
|
||||||
|
free(dictBuffer);
|
||||||
|
free(fileSizes);
|
||||||
|
return result;
|
||||||
|
}
|
64
dictBuilder/dibio.h
Normal file
64
dictBuilder/dibio.h
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
/*
|
||||||
|
dibio.h - I/O API for dictionary builder
|
||||||
|
Copyright (C) Yann Collet 2016
|
||||||
|
|
||||||
|
GPL v2 License
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
You can contact the author at :
|
||||||
|
- zstd source repository : https://github.com/Cyan4973/zstd
|
||||||
|
- zstd public forum : https://groups.google.com/forum/#!forum/lz4c
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This library is designed for a single-threaded console application.
|
||||||
|
* It calls exit() and prints to stderr when it encounters an error condition. */
|
||||||
|
|
||||||
|
#ifndef DIBIO_H_003
|
||||||
|
#define DIBIO_H_003
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Dependencies
|
||||||
|
***************************************/
|
||||||
|
#include "dictBuilder_static.h" /* ZDICT_params_t */
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Public functions
|
||||||
|
***************************************/
|
||||||
|
/*! DiB_trainFromFiles() :
|
||||||
|
Train a dictionary from a set of files provided by `fileNamesTable`.
|
||||||
|
Resulting dictionary is written into file `dictFileName`.
|
||||||
|
`parameters` is optional and can be provided with values set to 0, meaning "default".
|
||||||
|
@return : 0 == ok. Any other : error.
|
||||||
|
*/
|
||||||
|
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
const char** fileNamesTable, unsigned nbFiles,
|
||||||
|
ZDICT_params_t parameters);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Helper functions
|
||||||
|
***************************************/
|
||||||
|
/*! DiB_setNotificationLevel
|
||||||
|
Set amount of notification to be displayed on the console.
|
||||||
|
default initial value : 0 = no console notification.
|
||||||
|
Note : not thread-safe (uses a global variable)
|
||||||
|
*/
|
||||||
|
void DiB_setNotificationLevel(unsigned l);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
@ -1,94 +0,0 @@
|
|||||||
/*
|
|
||||||
dictBuilder.h
|
|
||||||
Copyright (C) Yann Collet 2016
|
|
||||||
|
|
||||||
GPL v2 License
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
You can contact the author at :
|
|
||||||
- zstd source repository : https://github.com/Cyan4973/zstd
|
|
||||||
- ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* This library is designed for a single-threaded console application.
|
|
||||||
* It exit() and printf() into stderr when it encounters an error condition. */
|
|
||||||
|
|
||||||
#ifndef DICTBUILDER_H_001
|
|
||||||
#define DICTBUILDER_H_001
|
|
||||||
|
|
||||||
/*-*************************************
|
|
||||||
* Version
|
|
||||||
***************************************/
|
|
||||||
#define DiB_VERSION_MAJOR 0 /* for breaking interface changes */
|
|
||||||
#define DiB_VERSION_MINOR 0 /* for new (non-breaking) interface capabilities */
|
|
||||||
#define DiB_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */
|
|
||||||
#define DiB_VERSION_NUMBER (DiB_VERSION_MAJOR *100*100 + DiB_VERSION_MINOR *100 + DiB_VERSION_RELEASE)
|
|
||||||
unsigned DiB_versionNumber (void);
|
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
|
||||||
* Public type
|
|
||||||
***************************************/
|
|
||||||
typedef struct {
|
|
||||||
unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */
|
|
||||||
unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */
|
|
||||||
} DiB_params_t;
|
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
|
||||||
* Public functions
|
|
||||||
***************************************/
|
|
||||||
/*! DiB_trainFromBuffer
|
|
||||||
Train a dictionary from a memory buffer @samplesBuffer
|
|
||||||
where @nbSamples samples have been stored concatenated.
|
|
||||||
Each sample size is provided into an orderly table @sampleSizes.
|
|
||||||
Resulting dictionary will be saved into @dictBuffer.
|
|
||||||
@parameters is optional and can be provided with 0 values to mean "default".
|
|
||||||
@result : size of dictionary stored into @dictBuffer (<= @dictBufferSize)
|
|
||||||
or an error code, which can be tested by DiB_isError().
|
|
||||||
note : DiB_trainFromBuffer() will send notifications into stderr if instructed to, using DiB_setNotificationLevel()
|
|
||||||
*/
|
|
||||||
size_t DiB_trainFromBuffer(void* dictBuffer, size_t dictBufferSize,
|
|
||||||
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
|
||||||
DiB_params_t parameters);
|
|
||||||
|
|
||||||
|
|
||||||
/*! DiB_trainFromFiles
|
|
||||||
Train a dictionary from a set of files provided by @fileNamesTable
|
|
||||||
Resulting dictionary is written into file @dictFileName.
|
|
||||||
@parameters is optional and can be provided with 0 values.
|
|
||||||
@result : 0 == ok. Any other : error.
|
|
||||||
*/
|
|
||||||
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
|
||||||
const char** fileNamesTable, unsigned nbFiles,
|
|
||||||
DiB_params_t parameters);
|
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
|
||||||
* Helper functions
|
|
||||||
***************************************/
|
|
||||||
unsigned DiB_isError(size_t errorCode);
|
|
||||||
const char* DiB_getErrorName(size_t errorCode);
|
|
||||||
|
|
||||||
/*! DiB_setNotificationLevel
|
|
||||||
Set amount of notification to be displayed on the console.
|
|
||||||
default initial value : 0 = no console notification.
|
|
||||||
Note : not thread-safe (use a global constant)
|
|
||||||
*/
|
|
||||||
void DiB_setNotificationLevel(unsigned l);
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
BIN
dictBuilder/dictionary
Normal file
BIN
dictBuilder/dictionary
Normal file
Binary file not shown.
@ -1,404 +0,0 @@
|
|||||||
/*
|
|
||||||
* divsufsort.c for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*- Compiler specifics -*/
|
|
||||||
#ifdef __clang__
|
|
||||||
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*- Dependencies -*/
|
|
||||||
#include "divsufsort_private.h"
|
|
||||||
#ifdef _OPENMP
|
|
||||||
# include <omp.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/*- Private Functions -*/
|
|
||||||
|
|
||||||
/* Sorts suffixes of type B*. */
|
|
||||||
static
|
|
||||||
saidx_t
|
|
||||||
sort_typeBstar(const sauchar_t *T, saidx_t *SA,
|
|
||||||
saidx_t *bucket_A, saidx_t *bucket_B,
|
|
||||||
saidx_t n) {
|
|
||||||
saidx_t *PAb, *ISAb, *buf;
|
|
||||||
#ifdef _OPENMP
|
|
||||||
saidx_t *curbuf;
|
|
||||||
saidx_t l;
|
|
||||||
#endif
|
|
||||||
saidx_t i, j, k, t, m, bufsize;
|
|
||||||
saint_t c0, c1;
|
|
||||||
#ifdef _OPENMP
|
|
||||||
saint_t d0, d1;
|
|
||||||
int tmp;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Initialize bucket arrays. */
|
|
||||||
for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
|
|
||||||
for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
|
|
||||||
|
|
||||||
/* Count the number of occurrences of the first one or two characters of each
|
|
||||||
type A, B and B* suffix. Moreover, store the beginning position of all
|
|
||||||
type B* suffixes into the array SA. */
|
|
||||||
for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
|
|
||||||
/* type A suffix. */
|
|
||||||
do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
|
|
||||||
if(0 <= i) {
|
|
||||||
/* type B* suffix. */
|
|
||||||
++BUCKET_BSTAR(c0, c1);
|
|
||||||
SA[--m] = i;
|
|
||||||
/* type B suffix. */
|
|
||||||
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
|
|
||||||
++BUCKET_B(c0, c1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
m = n - m;
|
|
||||||
/*
|
|
||||||
note:
|
|
||||||
A type B* suffix is lexicographically smaller than a type B suffix that
|
|
||||||
begins with the same first two characters.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Calculate the index of start/end point of each bucket. */
|
|
||||||
for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
|
|
||||||
t = i + BUCKET_A(c0);
|
|
||||||
BUCKET_A(c0) = i + j; /* start point */
|
|
||||||
i = t + BUCKET_B(c0, c0);
|
|
||||||
for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
|
|
||||||
j += BUCKET_BSTAR(c0, c1);
|
|
||||||
BUCKET_BSTAR(c0, c1) = j; /* end point */
|
|
||||||
i += BUCKET_B(c0, c1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(0 < m) {
|
|
||||||
/* Sort the type B* suffixes by their first two characters. */
|
|
||||||
PAb = SA + n - m; ISAb = SA + m;
|
|
||||||
for(i = m - 2; 0 <= i; --i) {
|
|
||||||
t = PAb[i], c0 = T[t], c1 = T[t + 1];
|
|
||||||
SA[--BUCKET_BSTAR(c0, c1)] = i;
|
|
||||||
}
|
|
||||||
t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
|
|
||||||
SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
|
|
||||||
|
|
||||||
/* Sort the type B* substrings using sssort. */
|
|
||||||
#ifdef _OPENMP
|
|
||||||
tmp = omp_get_max_threads();
|
|
||||||
buf = SA + m, bufsize = (n - (2 * m)) / tmp;
|
|
||||||
c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
|
|
||||||
#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
|
|
||||||
{
|
|
||||||
tmp = omp_get_thread_num();
|
|
||||||
curbuf = buf + tmp * bufsize;
|
|
||||||
k = 0;
|
|
||||||
for(;;) {
|
|
||||||
#pragma omp critical(sssort_lock)
|
|
||||||
{
|
|
||||||
if(0 < (l = j)) {
|
|
||||||
d0 = c0, d1 = c1;
|
|
||||||
do {
|
|
||||||
k = BUCKET_BSTAR(d0, d1);
|
|
||||||
if(--d1 <= d0) {
|
|
||||||
d1 = ALPHABET_SIZE - 1;
|
|
||||||
if(--d0 < 0) { break; }
|
|
||||||
}
|
|
||||||
} while(((l - k) <= 1) && (0 < (l = k)));
|
|
||||||
c0 = d0, c1 = d1, j = k;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(l == 0) { break; }
|
|
||||||
sssort(T, PAb, SA + k, SA + l,
|
|
||||||
curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
buf = SA + m, bufsize = n - (2 * m);
|
|
||||||
for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
|
|
||||||
for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
|
|
||||||
i = BUCKET_BSTAR(c0, c1);
|
|
||||||
if(1 < (j - i)) {
|
|
||||||
sssort(T, PAb, SA + i, SA + j,
|
|
||||||
buf, bufsize, 2, n, *(SA + i) == (m - 1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Compute ranks of type B* substrings. */
|
|
||||||
for(i = m - 1; 0 <= i; --i) {
|
|
||||||
if(0 <= SA[i]) {
|
|
||||||
j = i;
|
|
||||||
do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
|
|
||||||
SA[i + 1] = i - j;
|
|
||||||
if(i <= 0) { break; }
|
|
||||||
}
|
|
||||||
j = i;
|
|
||||||
do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
|
|
||||||
ISAb[SA[i]] = j;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Construct the inverse suffix array of type B* suffixes using trsort. */
|
|
||||||
trsort(ISAb, SA, m, 1);
|
|
||||||
|
|
||||||
/* Set the sorted order of tyoe B* suffixes. */
|
|
||||||
for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
|
|
||||||
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
|
|
||||||
if(0 <= i) {
|
|
||||||
t = i;
|
|
||||||
for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
|
|
||||||
SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Calculate the index of start/end point of each bucket. */
|
|
||||||
BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
|
|
||||||
for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
|
|
||||||
i = BUCKET_A(c0 + 1) - 1;
|
|
||||||
for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
|
|
||||||
t = i - BUCKET_B(c0, c1);
|
|
||||||
BUCKET_B(c0, c1) = i; /* end point */
|
|
||||||
|
|
||||||
/* Move all type B* suffixes to the correct position. */
|
|
||||||
for(i = t, j = BUCKET_BSTAR(c0, c1);
|
|
||||||
j <= k;
|
|
||||||
--i, --k) { SA[i] = SA[k]; }
|
|
||||||
}
|
|
||||||
BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
|
|
||||||
BUCKET_B(c0, c0) = i; /* end point */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Constructs the suffix array by using the sorted order of type B* suffixes. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
construct_SA(const sauchar_t *T, saidx_t *SA,
|
|
||||||
saidx_t *bucket_A, saidx_t *bucket_B,
|
|
||||||
saidx_t n, saidx_t m) {
|
|
||||||
saidx_t *i, *j, *k;
|
|
||||||
saidx_t s;
|
|
||||||
saint_t c0, c1, c2;
|
|
||||||
|
|
||||||
if(0 < m) {
|
|
||||||
/* Construct the sorted order of type B suffixes by using
|
|
||||||
the sorted order of type B* suffixes. */
|
|
||||||
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
|
|
||||||
/* Scan the suffix array from right to left. */
|
|
||||||
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
|
|
||||||
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
|
|
||||||
i <= j;
|
|
||||||
--j) {
|
|
||||||
if(0 < (s = *j)) {
|
|
||||||
assert(T[s] == c1);
|
|
||||||
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
|
|
||||||
assert(T[s - 1] <= T[s]);
|
|
||||||
*j = ~s;
|
|
||||||
c0 = T[--s];
|
|
||||||
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
|
|
||||||
if(c0 != c2) {
|
|
||||||
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
|
|
||||||
k = SA + BUCKET_B(c2 = c0, c1);
|
|
||||||
}
|
|
||||||
assert(k < j);
|
|
||||||
*k-- = s;
|
|
||||||
} else {
|
|
||||||
assert(((s == 0) && (T[s] == c1)) || (s < 0));
|
|
||||||
*j = ~s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Construct the suffix array by using
|
|
||||||
the sorted order of type B suffixes. */
|
|
||||||
k = SA + BUCKET_A(c2 = T[n - 1]);
|
|
||||||
*k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
|
|
||||||
/* Scan the suffix array from left to right. */
|
|
||||||
for(i = SA, j = SA + n; i < j; ++i) {
|
|
||||||
if(0 < (s = *i)) {
|
|
||||||
assert(T[s - 1] >= T[s]);
|
|
||||||
c0 = T[--s];
|
|
||||||
if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
|
|
||||||
if(c0 != c2) {
|
|
||||||
BUCKET_A(c2) = k - SA;
|
|
||||||
k = SA + BUCKET_A(c2 = c0);
|
|
||||||
}
|
|
||||||
assert(i < k);
|
|
||||||
*k++ = s;
|
|
||||||
} else {
|
|
||||||
assert(s < 0);
|
|
||||||
*i = ~s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Constructs the burrows-wheeler transformed string directly
|
|
||||||
by using the sorted order of type B* suffixes. */
|
|
||||||
static
|
|
||||||
saidx_t
|
|
||||||
construct_BWT(const sauchar_t *T, saidx_t *SA,
|
|
||||||
saidx_t *bucket_A, saidx_t *bucket_B,
|
|
||||||
saidx_t n, saidx_t m) {
|
|
||||||
saidx_t *i, *j, *k, *orig;
|
|
||||||
saidx_t s;
|
|
||||||
saint_t c0, c1, c2;
|
|
||||||
|
|
||||||
if(0 < m) {
|
|
||||||
/* Construct the sorted order of type B suffixes by using
|
|
||||||
the sorted order of type B* suffixes. */
|
|
||||||
for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
|
|
||||||
/* Scan the suffix array from right to left. */
|
|
||||||
for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
|
|
||||||
j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
|
|
||||||
i <= j;
|
|
||||||
--j) {
|
|
||||||
if(0 < (s = *j)) {
|
|
||||||
assert(T[s] == c1);
|
|
||||||
assert(((s + 1) < n) && (T[s] <= T[s + 1]));
|
|
||||||
assert(T[s - 1] <= T[s]);
|
|
||||||
c0 = T[--s];
|
|
||||||
*j = ~((saidx_t)c0);
|
|
||||||
if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
|
|
||||||
if(c0 != c2) {
|
|
||||||
if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
|
|
||||||
k = SA + BUCKET_B(c2 = c0, c1);
|
|
||||||
}
|
|
||||||
assert(k < j);
|
|
||||||
*k-- = s;
|
|
||||||
} else if(s != 0) {
|
|
||||||
*j = ~s;
|
|
||||||
#ifndef NDEBUG
|
|
||||||
} else {
|
|
||||||
assert(T[s] == c1);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Construct the BWTed string by using
|
|
||||||
the sorted order of type B suffixes. */
|
|
||||||
k = SA + BUCKET_A(c2 = T[n - 1]);
|
|
||||||
*k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
|
|
||||||
/* Scan the suffix array from left to right. */
|
|
||||||
for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
|
|
||||||
if(0 < (s = *i)) {
|
|
||||||
assert(T[s - 1] >= T[s]);
|
|
||||||
c0 = T[--s];
|
|
||||||
*i = c0;
|
|
||||||
if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
|
|
||||||
if(c0 != c2) {
|
|
||||||
BUCKET_A(c2) = k - SA;
|
|
||||||
k = SA + BUCKET_A(c2 = c0);
|
|
||||||
}
|
|
||||||
assert(i < k);
|
|
||||||
*k++ = s;
|
|
||||||
} else if(s != 0) {
|
|
||||||
*i = ~s;
|
|
||||||
} else {
|
|
||||||
orig = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return orig - SA;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/*- Function -*/
|
|
||||||
|
|
||||||
saint_t
|
|
||||||
divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
|
|
||||||
saidx_t *bucket_A, *bucket_B;
|
|
||||||
saidx_t m;
|
|
||||||
saint_t err = 0;
|
|
||||||
|
|
||||||
/* Check arguments. */
|
|
||||||
if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
|
|
||||||
else if(n == 0) { return 0; }
|
|
||||||
else if(n == 1) { SA[0] = 0; return 0; }
|
|
||||||
else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
|
|
||||||
|
|
||||||
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
|
|
||||||
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
|
|
||||||
|
|
||||||
/* Suffixsort. */
|
|
||||||
if((bucket_A != NULL) && (bucket_B != NULL)) {
|
|
||||||
m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
|
|
||||||
construct_SA(T, SA, bucket_A, bucket_B, n, m);
|
|
||||||
} else {
|
|
||||||
err = -2;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(bucket_B);
|
|
||||||
free(bucket_A);
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
saidx_t
|
|
||||||
divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
|
|
||||||
saidx_t *B;
|
|
||||||
saidx_t *bucket_A, *bucket_B;
|
|
||||||
saidx_t m, pidx, i;
|
|
||||||
|
|
||||||
/* Check arguments. */
|
|
||||||
if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
|
|
||||||
else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
|
|
||||||
|
|
||||||
if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
|
|
||||||
bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
|
|
||||||
bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
|
|
||||||
|
|
||||||
/* Burrows-Wheeler Transform. */
|
|
||||||
if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
|
|
||||||
m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
|
|
||||||
pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
|
|
||||||
|
|
||||||
/* Copy to output string. */
|
|
||||||
U[0] = T[n - 1];
|
|
||||||
for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
|
|
||||||
for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
|
|
||||||
pidx += 1;
|
|
||||||
} else {
|
|
||||||
pidx = -2;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(bucket_B);
|
|
||||||
free(bucket_A);
|
|
||||||
if(A == NULL) { free(B); }
|
|
||||||
|
|
||||||
return pidx;
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *
|
|
||||||
divsufsort_version(void) {
|
|
||||||
return PROJECT_VERSION_FULL;
|
|
||||||
}
|
|
@ -1,180 +0,0 @@
|
|||||||
/*
|
|
||||||
* divsufsort.h for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef _DIVSUFSORT_H
#define _DIVSUFSORT_H 1

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

#include <inttypes.h>

/* Export decoration for the public functions; expands to nothing unless
   the build defines it beforehand. */
#ifndef DIVSUFSORT_API
# define DIVSUFSORT_API
#endif


/*- Datatypes -*/

/* Character type of the input and output strings. */
#ifndef SAUCHAR_T
# define SAUCHAR_T
typedef uint8_t sauchar_t;
#endif /* SAUCHAR_T */

/* Type of status codes and of single-character values. */
#ifndef SAINT_T
# define SAINT_T
typedef int32_t saint_t;
#endif /* SAINT_T */

/* Type of suffix array entries, string lengths and indices. */
#ifndef SAIDX_T
# define SAIDX_T
typedef int32_t saidx_t;
#endif /* SAIDX_T */

/* printf conversion specifiers matching saint_t and saidx_t. */
#ifndef PRIdSAINT_T
# define PRIdSAINT_T PRId32
#endif /* PRIdSAINT_T */
#ifndef PRIdSAIDX_T
# define PRIdSAIDX_T PRId32
#endif /* PRIdSAIDX_T */


/*- Prototypes -*/

/**
 * Constructs the suffix array of a given string.
 * @param T[0..n-1] The input string.
 * @param SA[0..n-1] The output array of suffixes.
 * @param n The length of the given string.
 * @return 0 if no error occurred, -1 or -2 otherwise.
 */
DIVSUFSORT_API
saint_t
divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n);

/**
 * Constructs the burrows-wheeler transformed string of a given string.
 * @param T[0..n-1] The input string.
 * @param U[0..n-1] The output string. (can be T)
 * @param A[0..n-1] The temporary array. (can be NULL)
 * @param n The length of the given string.
 * @return The primary index if no error occurred, -1 or -2 otherwise.
 */
DIVSUFSORT_API
saidx_t
divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n);

/**
 * Returns the version of the divsufsort library.
 * @return The version number string.
 */
DIVSUFSORT_API
const char *
divsufsort_version(void);


/**
 * Constructs the burrows-wheeler transformed string of a given string
 * and suffix array.
 * @param T[0..n-1] The input string.
 * @param U[0..n-1] The output string. (can be T)
 * @param SA[0..n-1] The suffix array. (can be NULL)
 * @param n The length of the given string.
 * @param idx The output primary index.
 * @return 0 if no error occurred, -1 or -2 otherwise.
 */
DIVSUFSORT_API
saint_t
bw_transform(const sauchar_t *T, sauchar_t *U,
             saidx_t *SA /* can be NULL */,
             saidx_t n, saidx_t *idx);

/**
 * Inverse BW-transforms a given BWTed string.
 * @param T[0..n-1] The input string.
 * @param U[0..n-1] The output string. (can be T)
 * @param A[0..n-1] The temporary array. (can be NULL)
 * @param n The length of the given string.
 * @param idx The primary index.
 * @return 0 if no error occurred, -1 or -2 otherwise.
 */
DIVSUFSORT_API
saint_t
inverse_bw_transform(const sauchar_t *T, sauchar_t *U,
                     saidx_t *A /* can be NULL */,
                     saidx_t n, saidx_t idx);

/**
 * Checks the correctness of a given suffix array.
 * @param T[0..n-1] The input string.
 * @param SA[0..n-1] The input suffix array.
 * @param n The length of the given string.
 * @param verbose The verbose mode.
 * @return 0 if no error occurred.
 */
DIVSUFSORT_API
saint_t
sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose);

/**
 * Search for the pattern P in the string T.
 * @param T[0..Tsize-1] The input string.
 * @param Tsize The length of the given string.
 * @param P[0..Psize-1] The input pattern string.
 * @param Psize The length of the given pattern string.
 * @param SA[0..SAsize-1] The input suffix array.
 * @param SAsize The length of the given suffix array.
 * @param left The output index.
 * @return The count of matches if no error occurred, -1 otherwise.
 */
DIVSUFSORT_API
saidx_t
sa_search(const sauchar_t *T, saidx_t Tsize,
          const sauchar_t *P, saidx_t Psize,
          const saidx_t *SA, saidx_t SAsize,
          saidx_t *left);

/**
 * Search for the character c in the string T.
 * @param T[0..Tsize-1] The input string.
 * @param Tsize The length of the given string.
 * @param SA[0..SAsize-1] The input suffix array.
 * @param SAsize The length of the given suffix array.
 * @param c The input character.
 * @param left The output index.
 * @return The count of matches if no error occurred, -1 otherwise.
 */
DIVSUFSORT_API
saidx_t
sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
                const saidx_t *SA, saidx_t SAsize,
                saint_t c, saidx_t *left);


#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* _DIVSUFSORT_H */
|
|
@ -1,212 +0,0 @@
|
|||||||
/*
|
|
||||||
* divsufsort_private.h for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef _DIVSUFSORT_PRIVATE_H
#define _DIVSUFSORT_PRIVATE_H 1

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* *************************
*  Includes
***************************/
#include <assert.h>
#include <stdlib.h>   /* unconditional */
#include <stdio.h>
#include "config.h"   /* unconditional */


/* Optional headers, selected by the HAVE_* switches. */
#if HAVE_STRING_H
# include <string.h>
#endif
#if HAVE_MEMORY_H
# include <memory.h>
#endif
#if HAVE_STDDEF_H
# include <stddef.h>
#endif
#if HAVE_STRINGS_H
# ifdef _WIN32
#  include <string.h>
# else
#  include <strings.h>
# endif
#endif
#if HAVE_INTTYPES_H
# include <inttypes.h>
#elif HAVE_STDINT_H
# include <stdint.h>
#endif

/* The 64-bit build pulls in the 64-bit public header and renames every
   public and private entry point to its "64" variant. */
#ifdef BUILD_DIVSUFSORT64
# include "divsufsort64.h"
# ifndef SAIDX_T
#  define SAIDX_T
#  define saidx_t saidx64_t
# endif /* SAIDX_T */
# ifndef PRIdSAIDX_T
#  define PRIdSAIDX_T PRIdSAIDX64_T
# endif /* PRIdSAIDX_T */
# define divsufsort divsufsort64
# define divbwt divbwt64
# define divsufsort_version divsufsort64_version
# define bw_transform bw_transform64
# define inverse_bw_transform inverse_bw_transform64
# define sufcheck sufcheck64
# define sa_search sa_search64
# define sa_simplesearch sa_simplesearch64
# define sssort sssort64
# define trsort trsort64
#else
# include "divsufsort.h"
#endif


/*- Constants -*/
#ifndef UINT8_MAX
# define UINT8_MAX (255)
#endif /* UINT8_MAX */

/* A non-positive ALPHABET_SIZE is discarded and replaced by the default. */
#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
# undef ALPHABET_SIZE
#endif
#ifndef ALPHABET_SIZE
# define ALPHABET_SIZE (UINT8_MAX + 1)
#endif

/* for divsufsort.c */
#define BUCKET_A_SIZE (ALPHABET_SIZE)
#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)

/* for sssort.c */
/* Insertion sort threshold: defaults to 8, raised to 1 when set below 1. */
#ifndef SS_INSERTIONSORT_THRESHOLD
# define SS_INSERTIONSORT_THRESHOLD (8)
#elif SS_INSERTIONSORT_THRESHOLD < 1
# undef SS_INSERTIONSORT_THRESHOLD
# define SS_INSERTIONSORT_THRESHOLD (1)
#endif

/* Block size: defaults to 1024, clamped into [0, 32767] when user-set. */
#ifndef SS_BLOCKSIZE
# define SS_BLOCKSIZE (1024)
#elif SS_BLOCKSIZE < 0
# undef SS_BLOCKSIZE
# define SS_BLOCKSIZE (0)
#elif 32768 <= SS_BLOCKSIZE
# undef SS_BLOCKSIZE
# define SS_BLOCKSIZE (32767)
#endif

/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
#if (SS_BLOCKSIZE == 0) && defined(BUILD_DIVSUFSORT64)
# define SS_MISORT_STACKSIZE (96)
#elif SS_BLOCKSIZE == 0
# define SS_MISORT_STACKSIZE (64)
#elif SS_BLOCKSIZE <= 4096
# define SS_MISORT_STACKSIZE (16)
#else
# define SS_MISORT_STACKSIZE (24)
#endif

#ifdef BUILD_DIVSUFSORT64
# define SS_SMERGE_STACKSIZE (64)
#else
# define SS_SMERGE_STACKSIZE (32)
#endif

/* for trsort.c */
#define TR_INSERTIONSORT_THRESHOLD (8)
#ifdef BUILD_DIVSUFSORT64
# define TR_STACKSIZE (96)
#else
# define TR_STACKSIZE (64)
#endif


/*- Macros -*/

/* Exchanges two lvalues; the caller must have a suitably typed
   variable named `t` in scope. */
#ifndef SWAP
# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
#endif /* SWAP */

/* Note: MIN and MAX evaluate one of their arguments twice. */
#ifndef MIN
# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
#endif /* MIN */
#ifndef MAX
# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
#endif /* MAX */

/* The stack macros operate on caller-provided names: an array `stack`
   of records with fields a, b, c, d (and e), a counter `ssize` and a
   capacity constant STACK_SIZE. */
#define STACK_PUSH(_a, _b, _c, _d)\
  do {\
    assert(ssize < STACK_SIZE);\
    stack[ssize].a = (_a);\
    stack[ssize].b = (_b);\
    stack[ssize].c = (_c);\
    stack[ssize].d = (_d);\
    ++ssize;\
  } while(0)
#define STACK_PUSH5(_a, _b, _c, _d, _e)\
  do {\
    assert(ssize < STACK_SIZE);\
    stack[ssize].a = (_a);\
    stack[ssize].b = (_b);\
    stack[ssize].c = (_c);\
    stack[ssize].d = (_d);\
    stack[ssize].e = (_e);\
    ++ssize;\
  } while(0)

/* Popping from an empty stack returns from the enclosing (void)
   function: this is how the sorting loops terminate. */
#define STACK_POP(_a, _b, _c, _d)\
  do {\
    assert(0 <= ssize);\
    if(ssize == 0) { return; }\
    --ssize;\
    (_a) = stack[ssize].a;\
    (_b) = stack[ssize].b;\
    (_c) = stack[ssize].c;\
    (_d) = stack[ssize].d;\
  } while(0)
#define STACK_POP5(_a, _b, _c, _d, _e)\
  do {\
    assert(0 <= ssize);\
    if(ssize == 0) { return; }\
    --ssize;\
    (_a) = stack[ssize].a;\
    (_b) = stack[ssize].b;\
    (_c) = stack[ssize].c;\
    (_d) = stack[ssize].d;\
    (_e) = stack[ssize].e;\
  } while(0)

/* for divsufsort.c */
/* Accessors for the caller's `bucket_A` / `bucket_B` tables. BUCKET_B and
   BUCKET_BSTAR index the same table with the two characters swapped. */
#define BUCKET_A(_c0) bucket_A[(_c0)]
#if ALPHABET_SIZE == 256
# define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
# define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
#else
# define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
# define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
#endif


/*- Private Prototypes -*/
/* sssort.c */
void
sssort(const sauchar_t *Td, const saidx_t *PA,
       saidx_t *first, saidx_t *last,
       saidx_t *buf, saidx_t bufsize,
       saidx_t depth, saidx_t n, saint_t lastsuffix);
/* trsort.c */
void
trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth);


#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* _DIVSUFSORT_PRIVATE_H */
|
|
@ -1,56 +0,0 @@
|
|||||||
/*
|
|
||||||
* lfs.h for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef _LFS_H
#define _LFS_H 1

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/* File-offset helpers: the strict ANSI build falls back to long / ftell /
   fseek, every other build uses off_t / ftello / fseeko. */
#define LFS_FOPEN fopen
#ifdef __STRICT_ANSI__
# define LFS_OFF_T long
# define LFS_FTELL ftell
# define LFS_FSEEK fseek
# define LFS_PRId "ld"
#else
# define LFS_OFF_T off_t
# define LFS_FTELL ftello
# define LFS_FSEEK fseeko
# define LFS_PRId PRIdMAX
#endif

/* printf conversion specifier used for file offsets. */
#ifndef PRIdOFF_T
# define PRIdOFF_T LFS_PRId
#endif


#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* _LFS_H */
|
|
@ -1,844 +0,0 @@
|
|||||||
/*
|
|
||||||
* sssort.c for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*- Compiler specifics -*/
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
#endif

/* `inline` is a keyword in C++ and in C99 or later; for anything older it
   is mapped to the MSVC spelling or removed altogether. */
#if !defined (__cplusplus) && !(defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#  if defined(_MSC_VER)
#    define inline __inline
#  else
#    define inline /* disable inline */
#  endif
#endif

/* FORCE_INLINE: the strongest inlining request each compiler offers. */
#if defined(_MSC_VER)    /* Visual Studio */
#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
#  define FORCE_INLINE static __forceinline
#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L && defined(__GNUC__)
#  define FORCE_INLINE static inline __attribute__((always_inline))
#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
#  define FORCE_INLINE static inline
#else
#  define FORCE_INLINE static
#endif
|
|
||||||
|
|
||||||
/*- Dependencies -*/
|
|
||||||
#include "divsufsort_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
/*- Private Functions -*/
|
|
||||||
|
|
||||||
/* lg_table[v] is floor(log2(v)) for v in 1..255; entry 0 holds -1. */
static const saint_t lg_table[256]= {
  /*   0.. 15 */ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,
  /*  16.. 31 */  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  /*  32.. 47 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  48.. 63 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  64.. 79 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  80.. 95 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  96..111 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 112..127 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 128..143 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 144..159 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 160..175 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 176..191 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 192..207 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 208..223 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 224..239 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 240..255 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
};
|
|
||||||
|
|
||||||
#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
saint_t
|
|
||||||
ss_ilg(saidx_t n) {
|
|
||||||
#if SS_BLOCKSIZE == 0
|
|
||||||
# if defined(BUILD_DIVSUFSORT64)
|
|
||||||
return (n >> 32) ?
|
|
||||||
((n >> 48) ?
|
|
||||||
((n >> 56) ?
|
|
||||||
56 + lg_table[(n >> 56) & 0xff] :
|
|
||||||
48 + lg_table[(n >> 48) & 0xff]) :
|
|
||||||
((n >> 40) ?
|
|
||||||
40 + lg_table[(n >> 40) & 0xff] :
|
|
||||||
32 + lg_table[(n >> 32) & 0xff])) :
|
|
||||||
((n & 0xffff0000) ?
|
|
||||||
((n & 0xff000000) ?
|
|
||||||
24 + lg_table[(n >> 24) & 0xff] :
|
|
||||||
16 + lg_table[(n >> 16) & 0xff]) :
|
|
||||||
((n & 0x0000ff00) ?
|
|
||||||
8 + lg_table[(n >> 8) & 0xff] :
|
|
||||||
0 + lg_table[(n >> 0) & 0xff]));
|
|
||||||
# else
|
|
||||||
return (n & 0xffff0000) ?
|
|
||||||
((n & 0xff000000) ?
|
|
||||||
24 + lg_table[(n >> 24) & 0xff] :
|
|
||||||
16 + lg_table[(n >> 16) & 0xff]) :
|
|
||||||
((n & 0x0000ff00) ?
|
|
||||||
8 + lg_table[(n >> 8) & 0xff] :
|
|
||||||
0 + lg_table[(n >> 0) & 0xff]);
|
|
||||||
# endif
|
|
||||||
#elif SS_BLOCKSIZE < 256
|
|
||||||
return lg_table[n];
|
|
||||||
#else
|
|
||||||
return (n & 0xff00) ?
|
|
||||||
8 + lg_table[(n >> 8) & 0xff] :
|
|
||||||
0 + lg_table[(n >> 0) & 0xff];
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
|
|
||||||
|
|
||||||
#if SS_BLOCKSIZE != 0

/* Lookup table of seed values for ss_isqrt. */
static const saint_t sqq_table[256] = {
  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
};

/* Integer square root of x, capped at SS_BLOCKSIZE.
   A seed is read from sqq_table at the top bits of x; for large x it is
   refined with one or two steps of y = (y + 1 + x / y) / 2, and the
   result is finally corrected downwards by one if its square exceeds x. */
static INLINE
saidx_t
ss_isqrt(saidx_t x) {
  saidx_t root, lg;

  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }

  /* lg = position of the highest set bit of x (same scheme as ss_ilg). */
  if(x & 0xffff0000) {
    if(x & 0xff000000) { lg = 24 + lg_table[(x >> 24) & 0xff]; }
    else               { lg = 16 + lg_table[(x >> 16) & 0xff]; }
  } else {
    if(x & 0x0000ff00) { lg =  8 + lg_table[(x >>  8) & 0xff]; }
    else               { lg =  0 + lg_table[(x >>  0) & 0xff]; }
  }

  /* Below 2^8 the table answers directly. */
  if(lg < 8) { return sqq_table[x] >> 4; }

  if(lg < 16) {
    root = (sqq_table[x >> ((lg - 6) - (lg & 1))] >> (7 - (lg >> 1))) + 1;
  } else {
    root = sqq_table[x >> ((lg - 6) - (lg & 1))] << ((lg >> 1) - 7);
    if(lg >= 24) { root = (root + 1 + x / root) >> 1; }
    root = (root + 1 + x / root) >> 1;
  }

  if(x < (root * root)) { return root - 1; }
  return root;
}

#endif /* SS_BLOCKSIZE != 0 */
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Compares two suffixes. */
|
|
||||||
static INLINE
|
|
||||||
saint_t
|
|
||||||
ss_compare(const sauchar_t *T,
|
|
||||||
const saidx_t *p1, const saidx_t *p2,
|
|
||||||
saidx_t depth) {
|
|
||||||
const sauchar_t *U1, *U2, *U1n, *U2n;
|
|
||||||
|
|
||||||
for(U1 = T + depth + *p1,
|
|
||||||
U2 = T + depth + *p2,
|
|
||||||
U1n = T + *(p1 + 1) + 2,
|
|
||||||
U2n = T + *(p2 + 1) + 2;
|
|
||||||
(U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
|
|
||||||
++U1, ++U2) {
|
|
||||||
}
|
|
||||||
|
|
||||||
return U1 < U1n ?
|
|
||||||
(U2 < U2n ? *U1 - *U2 : 1) :
|
|
||||||
(U2 < U2n ? -1 : 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
|
|
||||||
|
|
||||||
/* Insertionsort for small size groups */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_insertionsort(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *last, saidx_t depth) {
|
|
||||||
saidx_t *i, *j;
|
|
||||||
saidx_t t;
|
|
||||||
saint_t r;
|
|
||||||
|
|
||||||
for(i = last - 2; first <= i; --i) {
|
|
||||||
for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
|
|
||||||
do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
|
|
||||||
if(last <= j) { break; }
|
|
||||||
}
|
|
||||||
if(r == 0) { *j = ~*j; }
|
|
||||||
*(j - 1) = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
ss_fixdown(const sauchar_t *Td, const saidx_t *PA,
|
|
||||||
saidx_t *SA, saidx_t i, saidx_t size) {
|
|
||||||
saidx_t j, k;
|
|
||||||
saidx_t v;
|
|
||||||
saint_t c, d, e;
|
|
||||||
|
|
||||||
for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
|
|
||||||
d = Td[PA[SA[k = j++]]];
|
|
||||||
if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
|
|
||||||
if(d <= c) { break; }
|
|
||||||
}
|
|
||||||
SA[i] = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Simple top-down heapsort. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_heapsort(const sauchar_t *Td, const saidx_t *PA, saidx_t *SA, saidx_t size) {
|
|
||||||
saidx_t i, m;
|
|
||||||
saidx_t t;
|
|
||||||
|
|
||||||
m = size;
|
|
||||||
if((size % 2) == 0) {
|
|
||||||
m--;
|
|
||||||
if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
|
|
||||||
}
|
|
||||||
|
|
||||||
for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
|
|
||||||
if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
|
|
||||||
for(i = m - 1; 0 < i; --i) {
|
|
||||||
t = SA[0], SA[0] = SA[i];
|
|
||||||
ss_fixdown(Td, PA, SA, 0, i);
|
|
||||||
SA[i] = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Returns the median of three elements. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
ss_median3(const sauchar_t *Td, const saidx_t *PA,
|
|
||||||
saidx_t *v1, saidx_t *v2, saidx_t *v3) {
|
|
||||||
saidx_t *t;
|
|
||||||
if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
|
|
||||||
if(Td[PA[*v2]] > Td[PA[*v3]]) {
|
|
||||||
if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
|
|
||||||
else { return v3; }
|
|
||||||
}
|
|
||||||
return v2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns the median of five elements. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
ss_median5(const sauchar_t *Td, const saidx_t *PA,
|
|
||||||
saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) {
|
|
||||||
saidx_t *t;
|
|
||||||
if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
|
|
||||||
if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
|
|
||||||
if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
|
|
||||||
if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
|
|
||||||
if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
|
|
||||||
if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
|
|
||||||
return v3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns the pivot element. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
ss_pivot(const sauchar_t *Td, const saidx_t *PA, saidx_t *first, saidx_t *last) {
|
|
||||||
saidx_t *middle;
|
|
||||||
saidx_t t;
|
|
||||||
|
|
||||||
t = last - first;
|
|
||||||
middle = first + t / 2;
|
|
||||||
|
|
||||||
if(t <= 512) {
|
|
||||||
if(t <= 32) {
|
|
||||||
return ss_median3(Td, PA, first, middle, last - 1);
|
|
||||||
} else {
|
|
||||||
t >>= 2;
|
|
||||||
return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t >>= 3;
|
|
||||||
first = ss_median3(Td, PA, first, first + t, first + (t << 1));
|
|
||||||
middle = ss_median3(Td, PA, middle - t, middle, middle + t);
|
|
||||||
last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
|
|
||||||
return ss_median3(Td, PA, first, middle, last);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Binary partition for substrings. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
ss_partition(const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *last, saidx_t depth) {
|
|
||||||
saidx_t *a, *b;
|
|
||||||
saidx_t t;
|
|
||||||
for(a = first - 1, b = last;;) {
|
|
||||||
for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
|
|
||||||
for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { }
|
|
||||||
if(b <= a) { break; }
|
|
||||||
t = ~*b;
|
|
||||||
*b = *a;
|
|
||||||
*a = t;
|
|
||||||
}
|
|
||||||
if(first < a) { *first = ~*first; }
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Multikey introsort for medium size groups. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_mintrosort(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *last,
|
|
||||||
saidx_t depth) {
|
|
||||||
#define STACK_SIZE SS_MISORT_STACKSIZE
|
|
||||||
struct { saidx_t *a, *b, c; saint_t d; } stack[STACK_SIZE];
|
|
||||||
const sauchar_t *Td;
|
|
||||||
saidx_t *a, *b, *c, *d, *e, *f;
|
|
||||||
saidx_t s, t;
|
|
||||||
saint_t ssize;
|
|
||||||
saint_t limit;
|
|
||||||
saint_t v, x = 0;
|
|
||||||
|
|
||||||
for(ssize = 0, limit = ss_ilg(last - first);;) {
|
|
||||||
|
|
||||||
if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
|
|
||||||
#if 1 < SS_INSERTIONSORT_THRESHOLD
|
|
||||||
if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
|
|
||||||
#endif
|
|
||||||
STACK_POP(first, last, depth, limit);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Td = T + depth;
|
|
||||||
if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
|
|
||||||
if(limit < 0) {
|
|
||||||
for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
|
|
||||||
if((x = Td[PA[*a]]) != v) {
|
|
||||||
if(1 < (a - first)) { break; }
|
|
||||||
v = x;
|
|
||||||
first = a;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(Td[PA[*first] - 1] < v) {
|
|
||||||
first = ss_partition(PA, first, a, depth);
|
|
||||||
}
|
|
||||||
if((a - first) <= (last - a)) {
|
|
||||||
if(1 < (a - first)) {
|
|
||||||
STACK_PUSH(a, last, depth, -1);
|
|
||||||
last = a, depth += 1, limit = ss_ilg(a - first);
|
|
||||||
} else {
|
|
||||||
first = a, limit = -1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(1 < (last - a)) {
|
|
||||||
STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
|
|
||||||
first = a, limit = -1;
|
|
||||||
} else {
|
|
||||||
last = a, depth += 1, limit = ss_ilg(a - first);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* choose pivot */
|
|
||||||
a = ss_pivot(Td, PA, first, last);
|
|
||||||
v = Td[PA[*a]];
|
|
||||||
SWAP(*first, *a);
|
|
||||||
|
|
||||||
/* partition */
|
|
||||||
for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
|
|
||||||
if(((a = b) < last) && (x < v)) {
|
|
||||||
for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
|
|
||||||
if(x == v) { SWAP(*b, *a); ++a; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
|
|
||||||
if((b < (d = c)) && (x > v)) {
|
|
||||||
for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
|
|
||||||
if(x == v) { SWAP(*c, *d); --d; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(; b < c;) {
|
|
||||||
SWAP(*b, *c);
|
|
||||||
for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
|
|
||||||
if(x == v) { SWAP(*b, *a); ++a; }
|
|
||||||
}
|
|
||||||
for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
|
|
||||||
if(x == v) { SWAP(*c, *d); --d; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(a <= d) {
|
|
||||||
c = b - 1;
|
|
||||||
|
|
||||||
if((s = a - first) > (t = b - a)) { s = t; }
|
|
||||||
for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
|
|
||||||
if((s = d - c) > (t = last - d - 1)) { s = t; }
|
|
||||||
for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
|
|
||||||
|
|
||||||
a = first + (b - a), c = last - (d - c);
|
|
||||||
b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
|
|
||||||
|
|
||||||
if((a - first) <= (last - c)) {
|
|
||||||
if((last - c) <= (c - b)) {
|
|
||||||
STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
|
|
||||||
STACK_PUSH(c, last, depth, limit);
|
|
||||||
last = a;
|
|
||||||
} else if((a - first) <= (c - b)) {
|
|
||||||
STACK_PUSH(c, last, depth, limit);
|
|
||||||
STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
|
|
||||||
last = a;
|
|
||||||
} else {
|
|
||||||
STACK_PUSH(c, last, depth, limit);
|
|
||||||
STACK_PUSH(first, a, depth, limit);
|
|
||||||
first = b, last = c, depth += 1, limit = ss_ilg(c - b);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if((a - first) <= (c - b)) {
|
|
||||||
STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
|
|
||||||
STACK_PUSH(first, a, depth, limit);
|
|
||||||
first = c;
|
|
||||||
} else if((last - c) <= (c - b)) {
|
|
||||||
STACK_PUSH(first, a, depth, limit);
|
|
||||||
STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
|
|
||||||
first = c;
|
|
||||||
} else {
|
|
||||||
STACK_PUSH(first, a, depth, limit);
|
|
||||||
STACK_PUSH(c, last, depth, limit);
|
|
||||||
first = b, last = c, depth += 1, limit = ss_ilg(c - b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
limit += 1;
|
|
||||||
if(Td[PA[*first] - 1] < v) {
|
|
||||||
first = ss_partition(PA, first, last, depth);
|
|
||||||
limit = ss_ilg(last - first);
|
|
||||||
}
|
|
||||||
depth += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#undef STACK_SIZE
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
#if SS_BLOCKSIZE != 0
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
ss_blockswap(saidx_t *a, saidx_t *b, saidx_t n) {
|
|
||||||
saidx_t t;
|
|
||||||
for(; 0 < n; --n, ++a, ++b) {
|
|
||||||
t = *a, *a = *b, *b = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
ss_rotate(saidx_t *first, saidx_t *middle, saidx_t *last) {
|
|
||||||
saidx_t *a, *b, t;
|
|
||||||
saidx_t l, r;
|
|
||||||
l = middle - first, r = last - middle;
|
|
||||||
for(; (0 < l) && (0 < r);) {
|
|
||||||
if(l == r) { ss_blockswap(first, middle, l); break; }
|
|
||||||
if(l < r) {
|
|
||||||
a = last - 1, b = middle - 1;
|
|
||||||
t = *a;
|
|
||||||
do {
|
|
||||||
*a-- = *b, *b-- = *a;
|
|
||||||
if(b < first) {
|
|
||||||
*a = t;
|
|
||||||
last = a;
|
|
||||||
if((r -= l + 1) <= l) { break; }
|
|
||||||
a -= 1, b = middle - 1;
|
|
||||||
t = *a;
|
|
||||||
}
|
|
||||||
} while(1);
|
|
||||||
} else {
|
|
||||||
a = first, b = middle;
|
|
||||||
t = *a;
|
|
||||||
do {
|
|
||||||
*a++ = *b, *b++ = *a;
|
|
||||||
if(last <= b) {
|
|
||||||
*a = t;
|
|
||||||
first = a + 1;
|
|
||||||
if((l -= r + 1) <= r) { break; }
|
|
||||||
a += 1, b = middle;
|
|
||||||
t = *a;
|
|
||||||
}
|
|
||||||
} while(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_inplacemerge(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *middle, saidx_t *last,
|
|
||||||
saidx_t depth) {
|
|
||||||
const saidx_t *p;
|
|
||||||
saidx_t *a, *b;
|
|
||||||
saidx_t len, half;
|
|
||||||
saint_t q, r;
|
|
||||||
saint_t x;
|
|
||||||
|
|
||||||
for(;;) {
|
|
||||||
if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
|
|
||||||
else { x = 0; p = PA + *(last - 1); }
|
|
||||||
for(a = first, len = middle - first, half = len >> 1, r = -1;
|
|
||||||
0 < len;
|
|
||||||
len = half, half >>= 1) {
|
|
||||||
b = a + half;
|
|
||||||
q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
|
|
||||||
if(q < 0) {
|
|
||||||
a = b + 1;
|
|
||||||
half -= (len & 1) ^ 1;
|
|
||||||
} else {
|
|
||||||
r = q;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(a < middle) {
|
|
||||||
if(r == 0) { *a = ~*a; }
|
|
||||||
ss_rotate(a, middle, last);
|
|
||||||
last -= middle - a;
|
|
||||||
middle = a;
|
|
||||||
if(first == middle) { break; }
|
|
||||||
}
|
|
||||||
--last;
|
|
||||||
if(x != 0) { while(*--last < 0) { } }
|
|
||||||
if(middle == last) { break; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Merge-forward with internal buffer. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_mergeforward(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *middle, saidx_t *last,
|
|
||||||
saidx_t *buf, saidx_t depth) {
|
|
||||||
saidx_t *a, *b, *c, *bufend;
|
|
||||||
saidx_t t;
|
|
||||||
saint_t r;
|
|
||||||
|
|
||||||
bufend = buf + (middle - first) - 1;
|
|
||||||
ss_blockswap(buf, first, middle - first);
|
|
||||||
|
|
||||||
for(t = *(a = first), b = buf, c = middle;;) {
|
|
||||||
r = ss_compare(T, PA + *b, PA + *c, depth);
|
|
||||||
if(r < 0) {
|
|
||||||
do {
|
|
||||||
*a++ = *b;
|
|
||||||
if(bufend <= b) { *bufend = t; return; }
|
|
||||||
*b++ = *a;
|
|
||||||
} while(*b < 0);
|
|
||||||
} else if(r > 0) {
|
|
||||||
do {
|
|
||||||
*a++ = *c, *c++ = *a;
|
|
||||||
if(last <= c) {
|
|
||||||
while(b < bufend) { *a++ = *b, *b++ = *a; }
|
|
||||||
*a = *b, *b = t;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} while(*c < 0);
|
|
||||||
} else {
|
|
||||||
*c = ~*c;
|
|
||||||
do {
|
|
||||||
*a++ = *b;
|
|
||||||
if(bufend <= b) { *bufend = t; return; }
|
|
||||||
*b++ = *a;
|
|
||||||
} while(*b < 0);
|
|
||||||
|
|
||||||
do {
|
|
||||||
*a++ = *c, *c++ = *a;
|
|
||||||
if(last <= c) {
|
|
||||||
while(b < bufend) { *a++ = *b, *b++ = *a; }
|
|
||||||
*a = *b, *b = t;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} while(*c < 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Merge-backward with internal buffer. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_mergebackward(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *middle, saidx_t *last,
|
|
||||||
saidx_t *buf, saidx_t depth) {
|
|
||||||
const saidx_t *p1, *p2;
|
|
||||||
saidx_t *a, *b, *c, *bufend;
|
|
||||||
saidx_t t;
|
|
||||||
saint_t r;
|
|
||||||
saint_t x;
|
|
||||||
|
|
||||||
bufend = buf + (last - middle) - 1;
|
|
||||||
ss_blockswap(buf, middle, last - middle);
|
|
||||||
|
|
||||||
x = 0;
|
|
||||||
if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; }
|
|
||||||
else { p1 = PA + *bufend; }
|
|
||||||
if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
|
|
||||||
else { p2 = PA + *(middle - 1); }
|
|
||||||
for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
|
|
||||||
r = ss_compare(T, p1, p2, depth);
|
|
||||||
if(0 < r) {
|
|
||||||
if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
|
|
||||||
*a-- = *b;
|
|
||||||
if(b <= buf) { *buf = t; break; }
|
|
||||||
*b-- = *a;
|
|
||||||
if(*b < 0) { p1 = PA + ~*b; x |= 1; }
|
|
||||||
else { p1 = PA + *b; }
|
|
||||||
} else if(r < 0) {
|
|
||||||
if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
|
|
||||||
*a-- = *c, *c-- = *a;
|
|
||||||
if(c < first) {
|
|
||||||
while(buf < b) { *a-- = *b, *b-- = *a; }
|
|
||||||
*a = *b, *b = t;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(*c < 0) { p2 = PA + ~*c; x |= 2; }
|
|
||||||
else { p2 = PA + *c; }
|
|
||||||
} else {
|
|
||||||
if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
|
|
||||||
*a-- = ~*b;
|
|
||||||
if(b <= buf) { *buf = t; break; }
|
|
||||||
*b-- = *a;
|
|
||||||
if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
|
|
||||||
*a-- = *c, *c-- = *a;
|
|
||||||
if(c < first) {
|
|
||||||
while(buf < b) { *a-- = *b, *b-- = *a; }
|
|
||||||
*a = *b, *b = t;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(*b < 0) { p1 = PA + ~*b; x |= 1; }
|
|
||||||
else { p1 = PA + *b; }
|
|
||||||
if(*c < 0) { p2 = PA + ~*c; x |= 2; }
|
|
||||||
else { p2 = PA + *c; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* D&C based merge. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
ss_swapmerge(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *middle, saidx_t *last,
|
|
||||||
saidx_t *buf, saidx_t bufsize, saidx_t depth) {
|
|
||||||
#define STACK_SIZE SS_SMERGE_STACKSIZE
|
|
||||||
#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
|
|
||||||
#define MERGE_CHECK(a, b, c)\
|
|
||||||
do {\
|
|
||||||
if(((c) & 1) ||\
|
|
||||||
(((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
|
|
||||||
*(a) = ~*(a);\
|
|
||||||
}\
|
|
||||||
if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
|
|
||||||
*(b) = ~*(b);\
|
|
||||||
}\
|
|
||||||
} while(0)
|
|
||||||
struct { saidx_t *a, *b, *c; saint_t d; } stack[STACK_SIZE];
|
|
||||||
saidx_t *l, *r, *lm, *rm;
|
|
||||||
saidx_t m, len, half;
|
|
||||||
saint_t ssize;
|
|
||||||
saint_t check, next;
|
|
||||||
|
|
||||||
for(check = 0, ssize = 0;;) {
|
|
||||||
if((last - middle) <= bufsize) {
|
|
||||||
if((first < middle) && (middle < last)) {
|
|
||||||
ss_mergebackward(T, PA, first, middle, last, buf, depth);
|
|
||||||
}
|
|
||||||
MERGE_CHECK(first, last, check);
|
|
||||||
STACK_POP(first, middle, last, check);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if((middle - first) <= bufsize) {
|
|
||||||
if(first < middle) {
|
|
||||||
ss_mergeforward(T, PA, first, middle, last, buf, depth);
|
|
||||||
}
|
|
||||||
MERGE_CHECK(first, last, check);
|
|
||||||
STACK_POP(first, middle, last, check);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
|
|
||||||
0 < len;
|
|
||||||
len = half, half >>= 1) {
|
|
||||||
if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
|
|
||||||
PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
|
|
||||||
m += half + 1;
|
|
||||||
half -= (len & 1) ^ 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(0 < m) {
|
|
||||||
lm = middle - m, rm = middle + m;
|
|
||||||
ss_blockswap(lm, middle, m);
|
|
||||||
l = r = middle, next = 0;
|
|
||||||
if(rm < last) {
|
|
||||||
if(*rm < 0) {
|
|
||||||
*rm = ~*rm;
|
|
||||||
if(first < lm) { for(; *--l < 0;) { } next |= 4; }
|
|
||||||
next |= 1;
|
|
||||||
} else if(first < lm) {
|
|
||||||
for(; *r < 0; ++r) { }
|
|
||||||
next |= 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if((l - first) <= (last - r)) {
|
|
||||||
STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
|
|
||||||
middle = lm, last = l, check = (check & 3) | (next & 4);
|
|
||||||
} else {
|
|
||||||
if((next & 2) && (r == middle)) { next ^= 6; }
|
|
||||||
STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
|
|
||||||
first = r, middle = rm, check = (next & 3) | (check & 4);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
|
|
||||||
*middle = ~*middle;
|
|
||||||
}
|
|
||||||
MERGE_CHECK(first, last, check);
|
|
||||||
STACK_POP(first, middle, last, check);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#undef STACK_SIZE
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* SS_BLOCKSIZE != 0 */
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/*- Function -*/
|
|
||||||
|
|
||||||
/* Substring sort */
|
|
||||||
void
|
|
||||||
sssort(const sauchar_t *T, const saidx_t *PA,
|
|
||||||
saidx_t *first, saidx_t *last,
|
|
||||||
saidx_t *buf, saidx_t bufsize,
|
|
||||||
saidx_t depth, saidx_t n, saint_t lastsuffix) {
|
|
||||||
saidx_t *a;
|
|
||||||
#if SS_BLOCKSIZE != 0
|
|
||||||
saidx_t *b, *middle, *curbuf;
|
|
||||||
saidx_t j, k, curbufsize, limit;
|
|
||||||
#endif
|
|
||||||
saidx_t i;
|
|
||||||
|
|
||||||
if(lastsuffix != 0) { ++first; }
|
|
||||||
|
|
||||||
#if SS_BLOCKSIZE == 0
|
|
||||||
ss_mintrosort(T, PA, first, last, depth);
|
|
||||||
#else
|
|
||||||
if((bufsize < SS_BLOCKSIZE) &&
|
|
||||||
(bufsize < (last - first)) &&
|
|
||||||
(bufsize < (limit = ss_isqrt(last - first)))) {
|
|
||||||
if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
|
|
||||||
buf = middle = last - limit, bufsize = limit;
|
|
||||||
} else {
|
|
||||||
middle = last, limit = 0;
|
|
||||||
}
|
|
||||||
for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
|
|
||||||
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
|
|
||||||
ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
|
|
||||||
#elif 1 < SS_BLOCKSIZE
|
|
||||||
ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
|
|
||||||
#endif
|
|
||||||
curbufsize = last - (a + SS_BLOCKSIZE);
|
|
||||||
curbuf = a + SS_BLOCKSIZE;
|
|
||||||
if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
|
|
||||||
for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
|
|
||||||
ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
|
|
||||||
ss_mintrosort(T, PA, a, middle, depth);
|
|
||||||
#elif 1 < SS_BLOCKSIZE
|
|
||||||
ss_insertionsort(T, PA, a, middle, depth);
|
|
||||||
#endif
|
|
||||||
for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
|
|
||||||
if(i & 1) {
|
|
||||||
ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
|
|
||||||
a -= k;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(limit != 0) {
|
|
||||||
#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
|
|
||||||
ss_mintrosort(T, PA, middle, last, depth);
|
|
||||||
#elif 1 < SS_BLOCKSIZE
|
|
||||||
ss_insertionsort(T, PA, middle, last, depth);
|
|
||||||
#endif
|
|
||||||
ss_inplacemerge(T, PA, first, middle, last, depth);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if(lastsuffix != 0) {
|
|
||||||
/* Insert last type B* suffix. */
|
|
||||||
saidx_t PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
|
|
||||||
for(a = first, i = *(first - 1);
|
|
||||||
(a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
|
|
||||||
++a) {
|
|
||||||
*(a - 1) = *a;
|
|
||||||
}
|
|
||||||
*(a - 1) = i;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,615 +0,0 @@
|
|||||||
/*
|
|
||||||
* trsort.c for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*- Compiler specifics -*/
|
|
||||||
#ifdef __clang__
|
|
||||||
#pragma clang diagnostic ignored "-Wshorten-64-to-32"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
|
|
||||||
/* inline is defined */
|
|
||||||
#elif defined(_MSC_VER)
|
|
||||||
# define inline __inline
|
|
||||||
#else
|
|
||||||
# define inline /* disable inline */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef _MSC_VER /* Visual Studio */
|
|
||||||
# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
|
|
||||||
# define FORCE_INLINE static __forceinline
|
|
||||||
#else
|
|
||||||
# if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
|
|
||||||
# ifdef __GNUC__
|
|
||||||
# define FORCE_INLINE static inline __attribute__((always_inline))
|
|
||||||
# else
|
|
||||||
# define FORCE_INLINE static inline
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define FORCE_INLINE static
|
|
||||||
# endif /* __STDC_VERSION__ */
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*- Dependencies -*/
|
|
||||||
#include "divsufsort_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
/*- Private Functions -*/
|
|
||||||
|
|
||||||
/* lg_table[b] is floor(log2(b)) for b = 1..255; lg_table[0] is -1. */
static const saint_t lg_table[256]= {
  /*   0.. 15 */ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,
  /*  16.. 31 */  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  /*  32.. 47 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  48.. 63 */  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  /*  64.. 79 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  80.. 95 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /*  96..111 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 112..127 */  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  /* 128..143 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 144..159 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 160..175 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 176..191 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 192..207 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 208..223 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 224..239 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  /* 240..255 */  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
};
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
saint_t
|
|
||||||
tr_ilg(saidx_t n) {
|
|
||||||
#if defined(BUILD_DIVSUFSORT64)
|
|
||||||
return (n >> 32) ?
|
|
||||||
((n >> 48) ?
|
|
||||||
((n >> 56) ?
|
|
||||||
56 + lg_table[(n >> 56) & 0xff] :
|
|
||||||
48 + lg_table[(n >> 48) & 0xff]) :
|
|
||||||
((n >> 40) ?
|
|
||||||
40 + lg_table[(n >> 40) & 0xff] :
|
|
||||||
32 + lg_table[(n >> 32) & 0xff])) :
|
|
||||||
((n & 0xffff0000) ?
|
|
||||||
((n & 0xff000000) ?
|
|
||||||
24 + lg_table[(n >> 24) & 0xff] :
|
|
||||||
16 + lg_table[(n >> 16) & 0xff]) :
|
|
||||||
((n & 0x0000ff00) ?
|
|
||||||
8 + lg_table[(n >> 8) & 0xff] :
|
|
||||||
0 + lg_table[(n >> 0) & 0xff]));
|
|
||||||
#else
|
|
||||||
return (n & 0xffff0000) ?
|
|
||||||
((n & 0xff000000) ?
|
|
||||||
24 + lg_table[(n >> 24) & 0xff] :
|
|
||||||
16 + lg_table[(n >> 16) & 0xff]) :
|
|
||||||
((n & 0x0000ff00) ?
|
|
||||||
8 + lg_table[(n >> 8) & 0xff] :
|
|
||||||
0 + lg_table[(n >> 0) & 0xff]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Simple insertionsort for small size groups. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
tr_insertionsort(const saidx_t *ISAd, saidx_t *first, saidx_t *last) {
|
|
||||||
saidx_t *a, *b;
|
|
||||||
saidx_t t, r;
|
|
||||||
|
|
||||||
for(a = first + 1; a < last; ++a) {
|
|
||||||
for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
|
|
||||||
do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
|
|
||||||
if(b < first) { break; }
|
|
||||||
}
|
|
||||||
if(r == 0) { *b = ~*b; }
|
|
||||||
*(b + 1) = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
tr_fixdown(const saidx_t *ISAd, saidx_t *SA, saidx_t i, saidx_t size) {
|
|
||||||
saidx_t j, k;
|
|
||||||
saidx_t v;
|
|
||||||
saidx_t c, d, e;
|
|
||||||
|
|
||||||
for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
|
|
||||||
d = ISAd[SA[k = j++]];
|
|
||||||
if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
|
|
||||||
if(d <= c) { break; }
|
|
||||||
}
|
|
||||||
SA[i] = v;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Simple top-down heapsort. */
|
|
||||||
static
|
|
||||||
void
|
|
||||||
tr_heapsort(const saidx_t *ISAd, saidx_t *SA, saidx_t size) {
|
|
||||||
saidx_t i, m;
|
|
||||||
saidx_t t;
|
|
||||||
|
|
||||||
m = size;
|
|
||||||
if((size % 2) == 0) {
|
|
||||||
m--;
|
|
||||||
if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
|
|
||||||
}
|
|
||||||
|
|
||||||
for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
|
|
||||||
if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
|
|
||||||
for(i = m - 1; 0 < i; --i) {
|
|
||||||
t = SA[0], SA[0] = SA[i];
|
|
||||||
tr_fixdown(ISAd, SA, 0, i);
|
|
||||||
SA[i] = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/* Returns the median of three elements. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
tr_median3(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2, saidx_t *v3) {
|
|
||||||
saidx_t *t;
|
|
||||||
if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
|
|
||||||
if(ISAd[*v2] > ISAd[*v3]) {
|
|
||||||
if(ISAd[*v1] > ISAd[*v3]) { return v1; }
|
|
||||||
else { return v3; }
|
|
||||||
}
|
|
||||||
return v2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns the median of five elements. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
tr_median5(const saidx_t *ISAd,
|
|
||||||
saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) {
|
|
||||||
saidx_t *t;
|
|
||||||
if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
|
|
||||||
if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
|
|
||||||
if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
|
|
||||||
if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
|
|
||||||
if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
|
|
||||||
if(ISAd[*v3] > ISAd[*v4]) { return v4; }
|
|
||||||
return v3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns the pivot element. */
|
|
||||||
static INLINE
|
|
||||||
saidx_t *
|
|
||||||
tr_pivot(const saidx_t *ISAd, saidx_t *first, saidx_t *last) {
|
|
||||||
saidx_t *middle;
|
|
||||||
saidx_t t;
|
|
||||||
|
|
||||||
t = last - first;
|
|
||||||
middle = first + t / 2;
|
|
||||||
|
|
||||||
if(t <= 512) {
|
|
||||||
if(t <= 32) {
|
|
||||||
return tr_median3(ISAd, first, middle, last - 1);
|
|
||||||
} else {
|
|
||||||
t >>= 2;
|
|
||||||
return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t >>= 3;
|
|
||||||
first = tr_median3(ISAd, first, first + t, first + (t << 1));
|
|
||||||
middle = tr_median3(ISAd, middle - t, middle, middle + t);
|
|
||||||
last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
|
|
||||||
return tr_median3(ISAd, first, middle, last);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
typedef struct _trbudget_t trbudget_t;
|
|
||||||
struct _trbudget_t {
|
|
||||||
saidx_t chance;
|
|
||||||
saidx_t remain;
|
|
||||||
saidx_t incval;
|
|
||||||
saidx_t count;
|
|
||||||
};
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
trbudget_init(trbudget_t *budget, saidx_t chance, saidx_t incval) {
|
|
||||||
budget->chance = chance;
|
|
||||||
budget->remain = budget->incval = incval;
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
saint_t
|
|
||||||
trbudget_check(trbudget_t *budget, saidx_t size) {
|
|
||||||
if(size <= budget->remain) { budget->remain -= size; return 1; }
|
|
||||||
if(budget->chance == 0) { budget->count += size; return 0; }
|
|
||||||
budget->remain += budget->incval - size;
|
|
||||||
budget->chance -= 1;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
static INLINE
|
|
||||||
void
|
|
||||||
tr_partition(const saidx_t *ISAd,
|
|
||||||
saidx_t *first, saidx_t *middle, saidx_t *last,
|
|
||||||
saidx_t **pa, saidx_t **pb, saidx_t v) {
|
|
||||||
saidx_t *a, *b, *c, *d, *e, *f;
|
|
||||||
saidx_t t, s;
|
|
||||||
saidx_t x = 0;
|
|
||||||
|
|
||||||
for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
|
|
||||||
if(((a = b) < last) && (x < v)) {
|
|
||||||
for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
|
|
||||||
if(x == v) { SWAP(*b, *a); ++a; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
|
|
||||||
if((b < (d = c)) && (x > v)) {
|
|
||||||
for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
|
|
||||||
if(x == v) { SWAP(*c, *d); --d; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(; b < c;) {
|
|
||||||
SWAP(*b, *c);
|
|
||||||
for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
|
|
||||||
if(x == v) { SWAP(*b, *a); ++a; }
|
|
||||||
}
|
|
||||||
for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
|
|
||||||
if(x == v) { SWAP(*c, *d); --d; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(a <= d) {
|
|
||||||
c = b - 1;
|
|
||||||
if((s = a - first) > (t = b - a)) { s = t; }
|
|
||||||
for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
|
|
||||||
if((s = d - c) > (t = last - d - 1)) { s = t; }
|
|
||||||
for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
|
|
||||||
first += (b - a), last -= (d - c);
|
|
||||||
}
|
|
||||||
*pa = first, *pb = last;
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
|
||||||
void
|
|
||||||
tr_copy(saidx_t *ISA, const saidx_t *SA,
|
|
||||||
saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last,
|
|
||||||
saidx_t depth) {
|
|
||||||
/* sort suffixes of middle partition
|
|
||||||
by using sorted order of suffixes of left and right partition. */
|
|
||||||
saidx_t *c, *d, *e;
|
|
||||||
saidx_t s, v;
|
|
||||||
|
|
||||||
v = b - SA - 1;
|
|
||||||
for(c = first, d = a - 1; c <= d; ++c) {
|
|
||||||
if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
|
|
||||||
*++d = s;
|
|
||||||
ISA[s] = d - SA;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(c = last - 1, e = d + 1, d = b; e < d; --c) {
|
|
||||||
if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
|
|
||||||
*--d = s;
|
|
||||||
ISA[s] = d - SA;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
|
||||||
void
|
|
||||||
tr_partialcopy(saidx_t *ISA, const saidx_t *SA,
|
|
||||||
saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last,
|
|
||||||
saidx_t depth) {
|
|
||||||
saidx_t *c, *d, *e;
|
|
||||||
saidx_t s, v;
|
|
||||||
saidx_t rank, lastrank, newrank = -1;
|
|
||||||
|
|
||||||
v = b - SA - 1;
|
|
||||||
lastrank = -1;
|
|
||||||
for(c = first, d = a - 1; c <= d; ++c) {
|
|
||||||
if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
|
|
||||||
*++d = s;
|
|
||||||
rank = ISA[s + depth];
|
|
||||||
if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
|
|
||||||
ISA[s] = newrank;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
lastrank = -1;
|
|
||||||
for(e = d; first <= e; --e) {
|
|
||||||
rank = ISA[*e];
|
|
||||||
if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
|
|
||||||
if(newrank != rank) { ISA[*e] = newrank; }
|
|
||||||
}
|
|
||||||
|
|
||||||
lastrank = -1;
|
|
||||||
for(c = last - 1, e = d + 1, d = b; e < d; --c) {
|
|
||||||
if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
|
|
||||||
*--d = s;
|
|
||||||
rank = ISA[s + depth];
|
|
||||||
if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
|
|
||||||
ISA[s] = newrank;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static
|
|
||||||
void
|
|
||||||
tr_introsort(saidx_t *ISA, const saidx_t *ISAd,
|
|
||||||
saidx_t *SA, saidx_t *first, saidx_t *last,
|
|
||||||
trbudget_t *budget) {
|
|
||||||
#define STACK_SIZE TR_STACKSIZE
|
|
||||||
struct { const saidx_t *a; saidx_t *b, *c; saint_t d, e; }stack[STACK_SIZE];
|
|
||||||
saidx_t *a, *b, *c;
|
|
||||||
saidx_t t;
|
|
||||||
saidx_t v, x = 0;
|
|
||||||
saidx_t incr = ISAd - ISA;
|
|
||||||
saint_t limit, next;
|
|
||||||
saint_t ssize, trlink = -1;
|
|
||||||
|
|
||||||
for(ssize = 0, limit = tr_ilg(last - first);;) {
|
|
||||||
|
|
||||||
if(limit < 0) {
|
|
||||||
if(limit == -1) {
|
|
||||||
/* tandem repeat partition */
|
|
||||||
tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
|
|
||||||
|
|
||||||
/* update ranks */
|
|
||||||
if(a < last) {
|
|
||||||
for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
|
|
||||||
}
|
|
||||||
if(b < last) {
|
|
||||||
for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
|
|
||||||
}
|
|
||||||
|
|
||||||
/* push */
|
|
||||||
if(1 < (b - a)) {
|
|
||||||
STACK_PUSH5(NULL, a, b, 0, 0);
|
|
||||||
STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
|
|
||||||
trlink = ssize - 2;
|
|
||||||
}
|
|
||||||
if((a - first) <= (last - b)) {
|
|
||||||
if(1 < (a - first)) {
|
|
||||||
STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
|
|
||||||
last = a, limit = tr_ilg(a - first);
|
|
||||||
} else if(1 < (last - b)) {
|
|
||||||
first = b, limit = tr_ilg(last - b);
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(1 < (last - b)) {
|
|
||||||
STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
|
|
||||||
first = b, limit = tr_ilg(last - b);
|
|
||||||
} else if(1 < (a - first)) {
|
|
||||||
last = a, limit = tr_ilg(a - first);
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if(limit == -2) {
|
|
||||||
/* tandem repeat copy */
|
|
||||||
a = stack[--ssize].b, b = stack[ssize].c;
|
|
||||||
if(stack[ssize].d == 0) {
|
|
||||||
tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
|
|
||||||
} else {
|
|
||||||
if(0 <= trlink) { stack[trlink].d = -1; }
|
|
||||||
tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
|
|
||||||
}
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
} else {
|
|
||||||
/* sorted partition */
|
|
||||||
if(0 <= *first) {
|
|
||||||
a = first;
|
|
||||||
do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
|
|
||||||
first = a;
|
|
||||||
}
|
|
||||||
if(first < last) {
|
|
||||||
a = first; do { *a = ~*a; } while(*++a < 0);
|
|
||||||
next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
|
|
||||||
if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
|
|
||||||
|
|
||||||
/* push */
|
|
||||||
if(trbudget_check(budget, a - first)) {
|
|
||||||
if((a - first) <= (last - a)) {
|
|
||||||
STACK_PUSH5(ISAd, a, last, -3, trlink);
|
|
||||||
ISAd += incr, last = a, limit = next;
|
|
||||||
} else {
|
|
||||||
if(1 < (last - a)) {
|
|
||||||
STACK_PUSH5(ISAd + incr, first, a, next, trlink);
|
|
||||||
first = a, limit = -3;
|
|
||||||
} else {
|
|
||||||
ISAd += incr, last = a, limit = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(0 <= trlink) { stack[trlink].d = -1; }
|
|
||||||
if(1 < (last - a)) {
|
|
||||||
first = a, limit = -3;
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
|
|
||||||
tr_insertionsort(ISAd, first, last);
|
|
||||||
limit = -3;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(limit-- == 0) {
|
|
||||||
tr_heapsort(ISAd, first, last - first);
|
|
||||||
for(a = last - 1; first < a; a = b) {
|
|
||||||
for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
|
|
||||||
}
|
|
||||||
limit = -3;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* choose pivot */
|
|
||||||
a = tr_pivot(ISAd, first, last);
|
|
||||||
SWAP(*first, *a);
|
|
||||||
v = ISAd[*first];
|
|
||||||
|
|
||||||
/* partition */
|
|
||||||
tr_partition(ISAd, first, first + 1, last, &a, &b, v);
|
|
||||||
if((last - first) != (b - a)) {
|
|
||||||
next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
|
|
||||||
|
|
||||||
/* update ranks */
|
|
||||||
for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
|
|
||||||
if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
|
|
||||||
|
|
||||||
/* push */
|
|
||||||
if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
|
|
||||||
if((a - first) <= (last - b)) {
|
|
||||||
if((last - b) <= (b - a)) {
|
|
||||||
if(1 < (a - first)) {
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
last = a;
|
|
||||||
} else if(1 < (last - b)) {
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
first = b;
|
|
||||||
} else {
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
} else if((a - first) <= (b - a)) {
|
|
||||||
if(1 < (a - first)) {
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
last = a;
|
|
||||||
} else {
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if((a - first) <= (b - a)) {
|
|
||||||
if(1 < (last - b)) {
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
first = b;
|
|
||||||
} else if(1 < (a - first)) {
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
last = a;
|
|
||||||
} else {
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
} else if((last - b) <= (b - a)) {
|
|
||||||
if(1 < (last - b)) {
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
STACK_PUSH5(ISAd + incr, a, b, next, trlink);
|
|
||||||
first = b;
|
|
||||||
} else {
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
ISAd += incr, first = a, last = b, limit = next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
|
|
||||||
if((a - first) <= (last - b)) {
|
|
||||||
if(1 < (a - first)) {
|
|
||||||
STACK_PUSH5(ISAd, b, last, limit, trlink);
|
|
||||||
last = a;
|
|
||||||
} else if(1 < (last - b)) {
|
|
||||||
first = b;
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(1 < (last - b)) {
|
|
||||||
STACK_PUSH5(ISAd, first, a, limit, trlink);
|
|
||||||
first = b;
|
|
||||||
} else if(1 < (a - first)) {
|
|
||||||
last = a;
|
|
||||||
} else {
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(trbudget_check(budget, last - first)) {
|
|
||||||
limit = tr_ilg(last - first), ISAd += incr;
|
|
||||||
} else {
|
|
||||||
if(0 <= trlink) { stack[trlink].d = -1; }
|
|
||||||
STACK_POP5(ISAd, first, last, limit, trlink);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#undef STACK_SIZE
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------------------------------*/
|
|
||||||
|
|
||||||
/*- Function -*/
|
|
||||||
|
|
||||||
/* Tandem repeat sort */
|
|
||||||
void
|
|
||||||
trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) {
|
|
||||||
saidx_t *ISAd;
|
|
||||||
saidx_t *first, *last;
|
|
||||||
trbudget_t budget;
|
|
||||||
saidx_t t, skip, unsorted;
|
|
||||||
|
|
||||||
trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
|
|
||||||
/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
|
|
||||||
for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
|
|
||||||
first = SA;
|
|
||||||
skip = 0;
|
|
||||||
unsorted = 0;
|
|
||||||
do {
|
|
||||||
if((t = *first) < 0) { first -= t; skip += t; }
|
|
||||||
else {
|
|
||||||
if(skip != 0) { *(first + skip) = skip; skip = 0; }
|
|
||||||
last = SA + ISA[t] + 1;
|
|
||||||
if(1 < (last - first)) {
|
|
||||||
budget.count = 0;
|
|
||||||
tr_introsort(ISA, ISAd, SA, first, last, &budget);
|
|
||||||
if(budget.count != 0) { unsorted += budget.count; }
|
|
||||||
else { skip = first - last; }
|
|
||||||
} else if((last - first) == 1) {
|
|
||||||
skip = -1;
|
|
||||||
}
|
|
||||||
first = last;
|
|
||||||
}
|
|
||||||
} while(first < (SA + n));
|
|
||||||
if(skip != 0) { *(first + skip) = skip; }
|
|
||||||
if(unsorted == 0) { break; }
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,381 +0,0 @@
|
|||||||
/*
|
|
||||||
* utils.c for libdivsufsort
|
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person
|
|
||||||
* obtaining a copy of this software and associated documentation
|
|
||||||
* files (the "Software"), to deal in the Software without
|
|
||||||
* restriction, including without limitation the rights to use,
|
|
||||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the
|
|
||||||
* Software is furnished to do so, subject to the following
|
|
||||||
* conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be
|
|
||||||
* included in all copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
||||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
||||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
||||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
||||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "divsufsort_private.h"
|
|
||||||
|
|
||||||
|
|
||||||
/*- Private Function -*/
|
|
||||||
|
|
||||||
/* Binary search for inverse bwt. */
|
|
||||||
static
|
|
||||||
saidx_t
|
|
||||||
binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) {
|
|
||||||
saidx_t half, i;
|
|
||||||
for(i = 0, half = size >> 1;
|
|
||||||
0 < size;
|
|
||||||
size = half, half >>= 1) {
|
|
||||||
if(A[i + half] < value) {
|
|
||||||
i += half + 1;
|
|
||||||
half -= (size & 1) ^ 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*- Functions -*/
|
|
||||||
|
|
||||||
/* Burrows-Wheeler transform. */
|
|
||||||
saint_t
|
|
||||||
bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA,
|
|
||||||
saidx_t n, saidx_t *idx) {
|
|
||||||
saidx_t *A, i, j, p, t;
|
|
||||||
saint_t c;
|
|
||||||
|
|
||||||
/* Check arguments. */
|
|
||||||
if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; }
|
|
||||||
if(n <= 1) {
|
|
||||||
if(n == 1) { U[0] = T[0]; }
|
|
||||||
*idx = n;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if((A = SA) == NULL) {
|
|
||||||
i = divbwt(T, U, NULL, n);
|
|
||||||
if(0 <= i) { *idx = i; i = 0; }
|
|
||||||
return (saint_t)i;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* BW transform. */
|
|
||||||
if(T == U) {
|
|
||||||
t = n;
|
|
||||||
for(i = 0, j = 0; i < n; ++i) {
|
|
||||||
p = t - 1;
|
|
||||||
t = A[i];
|
|
||||||
if(0 <= p) {
|
|
||||||
c = T[j];
|
|
||||||
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
|
||||||
A[j] = c;
|
|
||||||
j++;
|
|
||||||
} else {
|
|
||||||
*idx = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
p = t - 1;
|
|
||||||
if(0 <= p) {
|
|
||||||
c = T[j];
|
|
||||||
U[j] = (j <= p) ? T[p] : (sauchar_t)A[p];
|
|
||||||
A[j] = c;
|
|
||||||
} else {
|
|
||||||
*idx = i;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
U[0] = T[n - 1];
|
|
||||||
for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; }
|
|
||||||
*idx = i + 1;
|
|
||||||
for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; }
|
|
||||||
}
|
|
||||||
|
|
||||||
if(SA == NULL) {
|
|
||||||
/* Deallocate memory. */
|
|
||||||
free(A);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Inverse Burrows-Wheeler transform. */
|
|
||||||
saint_t
|
|
||||||
inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A,
|
|
||||||
saidx_t n, saidx_t idx) {
|
|
||||||
saidx_t C[ALPHABET_SIZE];
|
|
||||||
sauchar_t D[ALPHABET_SIZE];
|
|
||||||
saidx_t *B;
|
|
||||||
saidx_t i, p;
|
|
||||||
saint_t c, d;
|
|
||||||
|
|
||||||
/* Check arguments. */
|
|
||||||
if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) ||
|
|
||||||
(n < idx) || ((0 < n) && (idx == 0))) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if(n <= 1) { return 0; }
|
|
||||||
|
|
||||||
if((B = A) == NULL) {
|
|
||||||
/* Allocate n*sizeof(saidx_t) bytes of memory. */
|
|
||||||
if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; }
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Inverse BW transform. */
|
|
||||||
for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; }
|
|
||||||
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
|
||||||
for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) {
|
|
||||||
p = C[c];
|
|
||||||
if(0 < p) {
|
|
||||||
C[c] = i;
|
|
||||||
D[d++] = (sauchar_t)c;
|
|
||||||
i += p;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; }
|
|
||||||
for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; }
|
|
||||||
for(c = 0; c < d; ++c) { C[c] = C[D[c]]; }
|
|
||||||
for(i = 0, p = idx; i < n; ++i) {
|
|
||||||
U[i] = D[binarysearch_lower(C, d, p)];
|
|
||||||
p = B[p - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
if(A == NULL) {
|
|
||||||
/* Deallocate memory. */
|
|
||||||
free(B);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Checks the suffix array SA of the string T. */
|
|
||||||
saint_t
|
|
||||||
sufcheck(const sauchar_t *T, const saidx_t *SA,
|
|
||||||
saidx_t n, saint_t verbose) {
|
|
||||||
saidx_t C[ALPHABET_SIZE];
|
|
||||||
saidx_t i, p, q, t;
|
|
||||||
saint_t c;
|
|
||||||
|
|
||||||
if(verbose) { fprintf(stderr, "sufcheck: "); }
|
|
||||||
|
|
||||||
/* Check arguments. */
|
|
||||||
if((T == NULL) || (SA == NULL) || (n < 0)) {
|
|
||||||
if(verbose) { fprintf(stderr, "Invalid arguments.\n"); }
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if(n == 0) {
|
|
||||||
if(verbose) { fprintf(stderr, "Done.\n"); }
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check range: [0..n-1] */
|
|
||||||
for(i = 0; i < n; ++i) {
|
|
||||||
if((SA[i] < 0) || (n <= SA[i])) {
|
|
||||||
if(verbose) {
|
|
||||||
fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n"
|
|
||||||
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
|
||||||
n - 1, i, SA[i]);
|
|
||||||
}
|
|
||||||
return -2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check first characters. */
|
|
||||||
for(i = 1; i < n; ++i) {
|
|
||||||
if(T[SA[i - 1]] > T[SA[i]]) {
|
|
||||||
if(verbose) {
|
|
||||||
fprintf(stderr, "Suffixes in wrong order.\n"
|
|
||||||
" T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d"
|
|
||||||
" > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n",
|
|
||||||
i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]);
|
|
||||||
}
|
|
||||||
return -3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* check suffixes. */
|
|
||||||
for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; }
|
|
||||||
for(i = 0; i < n; ++i) { ++C[T[i]]; }
|
|
||||||
for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) {
|
|
||||||
t = C[i];
|
|
||||||
C[i] = p;
|
|
||||||
p += t;
|
|
||||||
}
|
|
||||||
|
|
||||||
q = C[T[n - 1]];
|
|
||||||
C[T[n - 1]] += 1;
|
|
||||||
for(i = 0; i < n; ++i) {
|
|
||||||
p = SA[i];
|
|
||||||
if(0 < p) {
|
|
||||||
c = T[--p];
|
|
||||||
t = C[c];
|
|
||||||
} else {
|
|
||||||
c = T[p = n - 1];
|
|
||||||
t = q;
|
|
||||||
}
|
|
||||||
if((t < 0) || (p != SA[t])) {
|
|
||||||
if(verbose) {
|
|
||||||
fprintf(stderr, "Suffix in wrong position.\n"
|
|
||||||
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n"
|
|
||||||
" SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n",
|
|
||||||
t, (0 <= t) ? SA[t] : -1, i, SA[i]);
|
|
||||||
}
|
|
||||||
return -4;
|
|
||||||
}
|
|
||||||
if(t != q) {
|
|
||||||
++C[c];
|
|
||||||
if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(1 <= verbose) { fprintf(stderr, "Done.\n"); }
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static
|
|
||||||
int
|
|
||||||
_compare(const sauchar_t *T, saidx_t Tsize,
|
|
||||||
const sauchar_t *P, saidx_t Psize,
|
|
||||||
saidx_t suf, saidx_t *match) {
|
|
||||||
saidx_t i, j;
|
|
||||||
saint_t r;
|
|
||||||
for(i = suf + *match, j = *match, r = 0;
|
|
||||||
(i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { }
|
|
||||||
*match = j;
|
|
||||||
return (r == 0) ? -(j != Psize) : r;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Search for the pattern P in the string T. */
|
|
||||||
saidx_t
|
|
||||||
sa_search(const sauchar_t *T, saidx_t Tsize,
|
|
||||||
const sauchar_t *P, saidx_t Psize,
|
|
||||||
const saidx_t *SA, saidx_t SAsize,
|
|
||||||
saidx_t *idx) {
|
|
||||||
saidx_t size, lsize, rsize, half;
|
|
||||||
saidx_t match, lmatch, rmatch;
|
|
||||||
saidx_t llmatch, lrmatch, rlmatch, rrmatch;
|
|
||||||
saidx_t i, j, k;
|
|
||||||
saint_t r;
|
|
||||||
|
|
||||||
if(idx != NULL) { *idx = -1; }
|
|
||||||
if((T == NULL) || (P == NULL) || (SA == NULL) ||
|
|
||||||
(Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; }
|
|
||||||
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
|
||||||
if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; }
|
|
||||||
|
|
||||||
for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1;
|
|
||||||
0 < size;
|
|
||||||
size = half, half >>= 1) {
|
|
||||||
match = MIN(lmatch, rmatch);
|
|
||||||
r = _compare(T, Tsize, P, Psize, SA[i + half], &match);
|
|
||||||
if(r < 0) {
|
|
||||||
i += half + 1;
|
|
||||||
half -= (size & 1) ^ 1;
|
|
||||||
lmatch = match;
|
|
||||||
} else if(r > 0) {
|
|
||||||
rmatch = match;
|
|
||||||
} else {
|
|
||||||
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
|
||||||
|
|
||||||
/* left part */
|
|
||||||
for(llmatch = lmatch, lrmatch = match, half = lsize >> 1;
|
|
||||||
0 < lsize;
|
|
||||||
lsize = half, half >>= 1) {
|
|
||||||
lmatch = MIN(llmatch, lrmatch);
|
|
||||||
r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch);
|
|
||||||
if(r < 0) {
|
|
||||||
j += half + 1;
|
|
||||||
half -= (lsize & 1) ^ 1;
|
|
||||||
llmatch = lmatch;
|
|
||||||
} else {
|
|
||||||
lrmatch = lmatch;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* right part */
|
|
||||||
for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1;
|
|
||||||
0 < rsize;
|
|
||||||
rsize = half, half >>= 1) {
|
|
||||||
rmatch = MIN(rlmatch, rrmatch);
|
|
||||||
r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch);
|
|
||||||
if(r <= 0) {
|
|
||||||
k += half + 1;
|
|
||||||
half -= (rsize & 1) ^ 1;
|
|
||||||
rlmatch = rmatch;
|
|
||||||
} else {
|
|
||||||
rrmatch = rmatch;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
|
||||||
return k - j;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Search for the character c in the string T. */
|
|
||||||
saidx_t
|
|
||||||
sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
|
|
||||||
const saidx_t *SA, saidx_t SAsize,
|
|
||||||
saint_t c, saidx_t *idx) {
|
|
||||||
saidx_t size, lsize, rsize, half;
|
|
||||||
saidx_t i, j, k, p;
|
|
||||||
saint_t r;
|
|
||||||
|
|
||||||
if(idx != NULL) { *idx = -1; }
|
|
||||||
if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; }
|
|
||||||
if((Tsize == 0) || (SAsize == 0)) { return 0; }
|
|
||||||
|
|
||||||
for(i = j = k = 0, size = SAsize, half = size >> 1;
|
|
||||||
0 < size;
|
|
||||||
size = half, half >>= 1) {
|
|
||||||
p = SA[i + half];
|
|
||||||
r = (p < Tsize) ? T[p] - c : -1;
|
|
||||||
if(r < 0) {
|
|
||||||
i += half + 1;
|
|
||||||
half -= (size & 1) ^ 1;
|
|
||||||
} else if(r == 0) {
|
|
||||||
lsize = half, j = i, rsize = size - half - 1, k = i + half + 1;
|
|
||||||
|
|
||||||
/* left part */
|
|
||||||
for(half = lsize >> 1;
|
|
||||||
0 < lsize;
|
|
||||||
lsize = half, half >>= 1) {
|
|
||||||
p = SA[j + half];
|
|
||||||
r = (p < Tsize) ? T[p] - c : -1;
|
|
||||||
if(r < 0) {
|
|
||||||
j += half + 1;
|
|
||||||
half -= (lsize & 1) ^ 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* right part */
|
|
||||||
for(half = rsize >> 1;
|
|
||||||
0 < rsize;
|
|
||||||
rsize = half, half >>= 1) {
|
|
||||||
p = SA[k + half];
|
|
||||||
r = (p < Tsize) ? T[p] - c : -1;
|
|
||||||
if(r <= 0) {
|
|
||||||
k += half + 1;
|
|
||||||
half -= (rsize & 1) ^ 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; }
|
|
||||||
return k - j;
|
|
||||||
}
|
|
@ -1,6 +1,6 @@
|
|||||||
# ################################################################
|
# ################################################################
|
||||||
# ZSTD library - Makefile
|
# ZSTD library - Makefile
|
||||||
# Copyright (C) Yann Collet 2015
|
# Copyright (C) Yann Collet 2015-2016
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
#
|
#
|
||||||
# BSD license
|
# BSD license
|
||||||
@ -28,7 +28,6 @@
|
|||||||
#
|
#
|
||||||
# You can contact the author at :
|
# You can contact the author at :
|
||||||
# - ZSTD homepage : http://www.zstd.net
|
# - ZSTD homepage : http://www.zstd.net
|
||||||
# - ZSTD source repository : https://github.com/Cyan4973/zstd
|
|
||||||
# ################################################################
|
# ################################################################
|
||||||
|
|
||||||
# Version numbers
|
# Version numbers
|
||||||
@ -52,7 +51,7 @@ FLAGS = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(MOREFLAGS)
|
|||||||
LIBDIR ?= $(PREFIX)/lib
|
LIBDIR ?= $(PREFIX)/lib
|
||||||
INCLUDEDIR=$(PREFIX)/include
|
INCLUDEDIR=$(PREFIX)/include
|
||||||
|
|
||||||
ZSTD_FILES := zstd_compress.c zstd_decompress.c fse.c huff0.c
|
ZSTD_FILES := zstd_compress.c zstd_decompress.c fse.c huff0.c dictBuilder.c divsufsort.c
|
||||||
ZSTD_LEGACY:= legacy/zstd_v01.c legacy/zstd_v02.c legacy/zstd_v03.c legacy/zstd_v04.c
|
ZSTD_LEGACY:= legacy/zstd_v01.c legacy/zstd_v02.c legacy/zstd_v03.c legacy/zstd_v04.c
|
||||||
|
|
||||||
ifeq ($(ZSTD_LEGACY_SUPPORT), 0)
|
ifeq ($(ZSTD_LEGACY_SUPPORT), 0)
|
||||||
@ -119,6 +118,7 @@ install: libzstd libzstd.pc
|
|||||||
@cp -a libzstd.pc $(DESTDIR)$(LIBDIR)/pkgconfig/
|
@cp -a libzstd.pc $(DESTDIR)$(LIBDIR)/pkgconfig/
|
||||||
@install -m 644 libzstd.a $(DESTDIR)$(LIBDIR)/libzstd.a
|
@install -m 644 libzstd.a $(DESTDIR)$(LIBDIR)/libzstd.a
|
||||||
@install -m 644 zstd.h $(DESTDIR)$(INCLUDEDIR)/zstd.h
|
@install -m 644 zstd.h $(DESTDIR)$(INCLUDEDIR)/zstd.h
|
||||||
|
@install -m 644 zstd_buffered.h $(DESTDIR)$(INCLUDEDIR)/zstd_buffered.h
|
||||||
@echo zstd static and shared library installed
|
@echo zstd static and shared library installed
|
||||||
|
|
||||||
uninstall:
|
uninstall:
|
||||||
|
@ -1,28 +1,37 @@
|
|||||||
/*
|
/*
|
||||||
dictBuilder - dictionary builder for LZ algorithms
|
dictBuilder - dictionary builder for zstd
|
||||||
Copyright (C) Yann Collet 2016
|
Copyright (C) Yann Collet 2016
|
||||||
|
|
||||||
GPL v2 License
|
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
Redistribution and use in source and binary forms, with or without
|
||||||
it under the terms of the GNU General Public License as published by
|
modification, are permitted provided that the following conditions are
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
met:
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
* Redistributions of source code must retain the above copyright
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
notice, this list of conditions and the following disclaimer.
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
* Redistributions in binary form must reproduce the above
|
||||||
GNU General Public License for more details.
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
You can contact the author at :
|
You can contact the author at :
|
||||||
- zstd source repository : https://github.com/Cyan4973/zstd
|
- Zstd homepage : https://www.zstd.net
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* **************************************
|
/*-**************************************
|
||||||
* Compiler Options
|
* Compiler Options
|
||||||
****************************************/
|
****************************************/
|
||||||
/* Disable some Visual warning messages */
|
/* Disable some Visual warning messages */
|
||||||
@ -41,7 +50,7 @@
|
|||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Includes
|
* Dependencies
|
||||||
***************************************/
|
***************************************/
|
||||||
#include <stdlib.h> /* malloc, free */
|
#include <stdlib.h> /* malloc, free */
|
||||||
#include <string.h> /* memset */
|
#include <string.h> /* memset */
|
||||||
@ -53,9 +62,10 @@
|
|||||||
#include "mem.h" /* read */
|
#include "mem.h" /* read */
|
||||||
#include "error_private.h"
|
#include "error_private.h"
|
||||||
#include "divsufsort.h"
|
#include "divsufsort.h"
|
||||||
#include "dictBuilder.h"
|
#include "dictBuilder_static.h"
|
||||||
#include "zstd_compress.c"
|
#include "fse.h"
|
||||||
#include "huff0_static.h"
|
#include "huff0_static.h"
|
||||||
|
#include "zstd_internal.h"
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
@ -94,16 +104,16 @@ static const size_t g_min_fast_dictContent = 192;
|
|||||||
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
|
#define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
|
||||||
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
|
||||||
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
static unsigned g_displayLevel = 0; /* 0 : no display; 1: errors; 2: default; 4: full information */
|
||||||
void DiB_setNotificationLevel(unsigned l) { g_displayLevel=l; }
|
void ZDICT_setNotificationLevel(unsigned l) { g_displayLevel=l; }
|
||||||
|
|
||||||
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \
|
||||||
if (DiB_GetMilliSpan(g_time) > refreshRate) \
|
if (ZDICT_GetMilliSpan(g_time) > refreshRate) \
|
||||||
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
{ g_time = clock(); DISPLAY(__VA_ARGS__); \
|
||||||
if (g_displayLevel>=4) fflush(stdout); } }
|
if (g_displayLevel>=4) fflush(stdout); } }
|
||||||
static const unsigned refreshRate = 300;
|
static const unsigned refreshRate = 300;
|
||||||
static clock_t g_time = 0;
|
static clock_t g_time = 0;
|
||||||
|
|
||||||
void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
|
void ZDICT_printHex(U32 dlevel, const void* ptr, size_t length)
|
||||||
{
|
{
|
||||||
const BYTE* const b = (const BYTE*)ptr;
|
const BYTE* const b = (const BYTE*)ptr;
|
||||||
size_t u;
|
size_t u;
|
||||||
@ -133,81 +143,25 @@ void DiB_printHex(U32 dlevel, const void* ptr, size_t length)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* ********************************************************
|
/*-********************************************************
|
||||||
* Helper functions
|
* Helper functions
|
||||||
**********************************************************/
|
**********************************************************/
|
||||||
unsigned DiB_versionNumber (void) { return DiB_VERSION_NUMBER; }
|
static unsigned ZDICT_GetMilliSpan(clock_t nPrevious)
|
||||||
|
|
||||||
static unsigned DiB_GetMilliSpan(clock_t nPrevious)
|
|
||||||
{
|
{
|
||||||
clock_t nCurrent = clock();
|
clock_t nCurrent = clock();
|
||||||
unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
|
unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC);
|
||||||
return nSpan;
|
return nSpan;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
||||||
|
|
||||||
const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
||||||
|
|
||||||
|
|
||||||
/* ********************************************************
|
|
||||||
* File related operations
|
|
||||||
**********************************************************/
|
|
||||||
/*! DiB_getFileSize() :
    @return : size in bytes of the regular file `infilename`,
              or 0 when it cannot be stat'ed or is not a regular file.
*/
static unsigned long long DiB_getFileSize(const char* infilename)
{
    int statFailed;
#if defined(_MSC_VER)
    struct _stat64 statbuf;
    statFailed = _stat64(infilename, &statbuf);
#else
    struct stat statbuf;
    statFailed = stat(infilename, &statbuf);
#endif
    if (statFailed) return 0;                    /* cannot query the file */
    if (!S_ISREG(statbuf.st_mode)) return 0;     /* not a regular file */
    return (unsigned long long)statbuf.st_size;
}
|
|
||||||
|
|
||||||
|
|
||||||
/*! DiB_getTotalFileSize() :
    @return : sum of DiB_getFileSize() over the `nbFiles` names in
              `fileNamesTable` (names that cannot be sized contribute 0).
*/
static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles)
{
    unsigned long long sum = 0;
    unsigned fileNb = 0;
    while (fileNb < nbFiles) {
        sum += DiB_getFileSize(fileNamesTable[fileNb]);
        fileNb++;
    }
    return sum;
}
|
|
||||||
|
|
||||||
|
|
||||||
/*! DiB_loadFiles() :
    Reads the files named in `fileNamesTable` one after another into `buffer`.
    fileSizes[n] receives the number of bytes stored for file n; a file that
    no longer fits in the remaining space is recorded with size 0 and
    contributes no data.
    Failure to open or to fully read a file is reported through EXM_THROW.
*/
static void DiB_loadFiles(void* buffer, size_t bufferSize,
                          size_t* fileSizes,
                          const char** fileNamesTable, unsigned nbFiles)
{
    char* const dst = (char*)buffer;
    size_t filled = 0;
    unsigned fileNb = 0;

    while (fileNb < nbFiles) {
        const char* const fileName = fileNamesTable[fileNb];
        unsigned long long fileSize = DiB_getFileSize(fileName);
        size_t nbRead;
        FILE* const src = fopen(fileName, "rb");
        if (src==NULL) EXM_THROW(10, "impossible to open file %s", fileName);
        DISPLAYLEVEL(2, "Loading %s... \r", fileName);
        if (fileSize > bufferSize-filled) fileSize = 0;   /* not enough room left : load nothing from this file */
        nbRead = fread(dst+filled, 1, (size_t)fileSize, src);
        if (nbRead != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileName);
        filled += nbRead;
        fileSizes[fileNb] = (size_t)fileSize;
        fclose(src);
        fileNb++;
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/*-********************************************************
|
/*-********************************************************
|
||||||
* Dictionary training functions
|
* Dictionary training functions
|
||||||
**********************************************************/
|
**********************************************************/
|
||||||
static size_t DiB_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; }
|
static unsigned ZDICT_NbCommonBytes (register size_t val)
|
||||||
|
|
||||||
static unsigned DiB_NbCommonBytes (register size_t val)
|
|
||||||
{
|
{
|
||||||
if (MEM_isLittleEndian()) {
|
if (MEM_isLittleEndian()) {
|
||||||
if (MEM_64bits()) {
|
if (MEM_64bits()) {
|
||||||
@ -266,17 +220,17 @@ static unsigned DiB_NbCommonBytes (register size_t val)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*! DiB_count() :
|
/*! ZDICT_count() :
|
||||||
Count the nb of common bytes between 2 pointers.
|
Count the nb of common bytes between 2 pointers.
|
||||||
Note : this function presumes end of buffer followed by noisy guard band.
|
Note : this function presumes end of buffer followed by noisy guard band.
|
||||||
*/
|
*/
|
||||||
static size_t DiB_count(const void* pIn, const void* pMatch)
|
static size_t ZDICT_count(const void* pIn, const void* pMatch)
|
||||||
{
|
{
|
||||||
const char* const pStart = (const char*)pIn;
|
const char* const pStart = (const char*)pIn;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
size_t diff = DiB_read_ARCH(pMatch) ^ DiB_read_ARCH(pIn);
|
size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
|
||||||
if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
|
if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; }
|
||||||
pIn = (const char*)pIn+DiB_NbCommonBytes(diff);
|
pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
|
||||||
return (size_t)((const char*)pIn - pStart);
|
return (size_t)((const char*)pIn - pStart);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -288,7 +242,7 @@ typedef struct {
|
|||||||
U32 savings;
|
U32 savings;
|
||||||
} dictItem;
|
} dictItem;
|
||||||
|
|
||||||
void DiB_initDictItem(dictItem* d)
|
static void ZDICT_initDictItem(dictItem* d)
|
||||||
{
|
{
|
||||||
d->pos = 1;
|
d->pos = 1;
|
||||||
d->length = 0;
|
d->length = 0;
|
||||||
@ -298,9 +252,9 @@ void DiB_initDictItem(dictItem* d)
|
|||||||
|
|
||||||
#define LLIMIT 64 /* heuristic determined experimentally */
|
#define LLIMIT 64 /* heuristic determined experimentally */
|
||||||
#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
|
#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
|
||||||
static dictItem DiB_analyzePos(
|
static dictItem ZDICT_analyzePos(
|
||||||
BYTE* doneMarks,
|
BYTE* doneMarks,
|
||||||
const saidx_t* suffix, U32 start,
|
const int* suffix, U32 start,
|
||||||
const void* buffer, U32 minRatio)
|
const void* buffer, U32 minRatio)
|
||||||
{
|
{
|
||||||
U32 lengthList[LLIMIT] = {0};
|
U32 lengthList[LLIMIT] = {0};
|
||||||
@ -334,12 +288,12 @@ static dictItem DiB_analyzePos(
|
|||||||
/* look forward */
|
/* look forward */
|
||||||
do {
|
do {
|
||||||
end++;
|
end++;
|
||||||
length = DiB_count(b + pos, b + suffix[end]);
|
length = ZDICT_count(b + pos, b + suffix[end]);
|
||||||
} while (length >=MINMATCHLENGTH);
|
} while (length >=MINMATCHLENGTH);
|
||||||
|
|
||||||
/* look backward */
|
/* look backward */
|
||||||
do {
|
do {
|
||||||
length = DiB_count(b + pos, b + *(suffix+start-1));
|
length = ZDICT_count(b + pos, b + *(suffix+start-1));
|
||||||
if (length >=MINMATCHLENGTH) start--;
|
if (length >=MINMATCHLENGTH) start--;
|
||||||
} while(length >= MINMATCHLENGTH);
|
} while(length >= MINMATCHLENGTH);
|
||||||
|
|
||||||
@ -400,14 +354,14 @@ static dictItem DiB_analyzePos(
|
|||||||
/* look forward */
|
/* look forward */
|
||||||
do {
|
do {
|
||||||
end++;
|
end++;
|
||||||
length = DiB_count(b + pos, b + suffix[end]);
|
length = ZDICT_count(b + pos, b + suffix[end]);
|
||||||
if (length >= LLIMIT) length = LLIMIT-1;
|
if (length >= LLIMIT) length = LLIMIT-1;
|
||||||
lengthList[length]++;
|
lengthList[length]++;
|
||||||
} while (length >=MINMATCHLENGTH);
|
} while (length >=MINMATCHLENGTH);
|
||||||
|
|
||||||
/* look backward */
|
/* look backward */
|
||||||
do {
|
do {
|
||||||
length = DiB_count(b + pos, b + suffix[start-1]);
|
length = ZDICT_count(b + pos, b + suffix[start-1]);
|
||||||
if (length >= LLIMIT) length = LLIMIT-1;
|
if (length >= LLIMIT) length = LLIMIT-1;
|
||||||
lengthList[length]++;
|
lengthList[length]++;
|
||||||
if (length >=MINMATCHLENGTH) start--;
|
if (length >=MINMATCHLENGTH) start--;
|
||||||
@ -453,7 +407,7 @@ static dictItem DiB_analyzePos(
|
|||||||
if (testedPos == pos)
|
if (testedPos == pos)
|
||||||
length = solution.length;
|
length = solution.length;
|
||||||
else {
|
else {
|
||||||
length = DiB_count(b+pos, b+testedPos);
|
length = ZDICT_count(b+pos, b+testedPos);
|
||||||
if (length > solution.length) length = solution.length;
|
if (length > solution.length) length = solution.length;
|
||||||
}
|
}
|
||||||
pEnd = (U32)(testedPos + length);
|
pEnd = (U32)(testedPos + length);
|
||||||
@ -465,11 +419,11 @@ static dictItem DiB_analyzePos(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*! DiB_checkMerge
|
/*! ZDICT_checkMerge
|
||||||
check if dictItem can be merged, do it if possible
|
check if dictItem can be merged, do it if possible
|
||||||
@return : id of destination elt, 0 if not merged
|
@return : id of destination elt, 0 if not merged
|
||||||
*/
|
*/
|
||||||
static U32 DiB_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
static U32 ZDICT_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
||||||
{
|
{
|
||||||
const U32 tableSize = table->pos;
|
const U32 tableSize = table->pos;
|
||||||
const U32 max = elt.pos + (elt.length-1);
|
const U32 max = elt.pos + (elt.length-1);
|
||||||
@ -513,7 +467,7 @@ static U32 DiB_checkMerge(dictItem* table, dictItem elt, U32 eltNbToSkip)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void DiB_removeDictItem(dictItem* table, U32 id)
|
static void ZDICT_removeDictItem(dictItem* table, U32 id)
|
||||||
{
|
{
|
||||||
/* convention : first element is nb of elts */
|
/* convention : first element is nb of elts */
|
||||||
U32 max = table->pos;
|
U32 max = table->pos;
|
||||||
@ -525,15 +479,15 @@ static void DiB_removeDictItem(dictItem* table, U32 id)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void DiB_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
||||||
{
|
{
|
||||||
/* merge if possible */
|
/* merge if possible */
|
||||||
U32 mergeId = DiB_checkMerge(table, elt, 0);
|
U32 mergeId = ZDICT_checkMerge(table, elt, 0);
|
||||||
if (mergeId) {
|
if (mergeId) {
|
||||||
U32 newMerge = 1;
|
U32 newMerge = 1;
|
||||||
while (newMerge) {
|
while (newMerge) {
|
||||||
newMerge = DiB_checkMerge(table, table[mergeId], mergeId);
|
newMerge = ZDICT_checkMerge(table, table[mergeId], mergeId);
|
||||||
if (newMerge) DiB_removeDictItem(table, mergeId);
|
if (newMerge) ZDICT_removeDictItem(table, mergeId);
|
||||||
mergeId = newMerge;
|
mergeId = newMerge;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -555,7 +509,7 @@ static void DiB_insertDictItem(dictItem* table, U32 maxSize, dictItem elt)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static U32 DiB_dictSize(const dictItem* dictList)
|
static U32 ZDICT_dictSize(const dictItem* dictList)
|
||||||
{
|
{
|
||||||
U32 u, dictSize = 0;
|
U32 u, dictSize = 0;
|
||||||
for (u=1; u<dictList[0].pos; u++)
|
for (u=1; u<dictList[0].pos; u++)
|
||||||
@ -564,32 +518,32 @@ static U32 DiB_dictSize(const dictItem* dictList)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
static void ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
|
||||||
const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
|
const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
|
||||||
const size_t* fileSizes, unsigned nbFiles,
|
const size_t* fileSizes, unsigned nbFiles,
|
||||||
U32 shiftRatio, unsigned maxDictSize)
|
U32 shiftRatio, unsigned maxDictSize)
|
||||||
{
|
{
|
||||||
saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0));
|
int* const suffix0 = (int*)malloc((bufferSize+2)*sizeof(*suffix0));
|
||||||
saidx_t* const suffix = suffix0+1;
|
int* const suffix = suffix0+1;
|
||||||
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
U32* reverseSuffix = (U32*)malloc((bufferSize)*sizeof(*reverseSuffix));
|
||||||
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
BYTE* doneMarks = (BYTE*)malloc((bufferSize+16)*sizeof(*doneMarks)); /* +16 for overflow security */
|
||||||
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
U32* filePos = (U32*)malloc(nbFiles * sizeof(*filePos));
|
||||||
U32 minRatio = nbFiles >> shiftRatio;
|
U32 minRatio = nbFiles >> shiftRatio;
|
||||||
saint_t errorCode;
|
int divSuftSortResult;
|
||||||
|
|
||||||
/* init */
|
/* init */
|
||||||
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
||||||
if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
|
if (!suffix0 || !reverseSuffix || !doneMarks || !filePos)
|
||||||
EXM_THROW(1, "not enough memory for DiB_trainBuffer");
|
EXM_THROW(1, "not enough memory for ZDICT_trainBuffer");
|
||||||
if (minRatio < MINRATIO) minRatio = MINRATIO;
|
if (minRatio < MINRATIO) minRatio = MINRATIO;
|
||||||
memset(doneMarks, 0, bufferSize+16);
|
memset(doneMarks, 0, bufferSize+16);
|
||||||
|
|
||||||
/* sort */
|
/* sort */
|
||||||
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
|
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
|
||||||
errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize);
|
divSuftSortResult = divsufsort((const unsigned char*)buffer, suffix, (int)bufferSize, 0);
|
||||||
if (errorCode != 0) EXM_THROW(2, "sort failed");
|
if (divSuftSortResult != 0) EXM_THROW(2, "sort failed");
|
||||||
suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */
|
suffix[bufferSize] = (int)bufferSize; /* leads into noise */
|
||||||
suffix0[0] = (saidx_t)bufferSize; /* leads into noise */
|
suffix0[0] = (int)bufferSize; /* leads into noise */
|
||||||
{
|
{
|
||||||
/* build reverse suffix sort */
|
/* build reverse suffix sort */
|
||||||
size_t pos;
|
size_t pos;
|
||||||
@ -608,9 +562,9 @@ static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|||||||
U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
U32 cursor; for (cursor=0; cursor < bufferSize; ) {
|
||||||
dictItem solution;
|
dictItem solution;
|
||||||
if (doneMarks[cursor]) { cursor++; continue; }
|
if (doneMarks[cursor]) { cursor++; continue; }
|
||||||
solution = DiB_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio);
|
||||||
if (solution.length==0) { cursor++; continue; }
|
if (solution.length==0) { cursor++; continue; }
|
||||||
DiB_insertDictItem(dictList, dictListSize, solution);
|
ZDICT_insertDictItem(dictList, dictListSize, solution);
|
||||||
cursor += solution.length;
|
cursor += solution.length;
|
||||||
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor / bufferSize * 100);
|
||||||
} }
|
} }
|
||||||
@ -633,26 +587,7 @@ static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static size_t DiB_findMaxMem(unsigned long long requiredMem)
|
static void ZDICT_fillNoise(void* buffer, size_t length)
|
||||||
{
|
|
||||||
size_t step = 8 MB;
|
|
||||||
void* testmem = NULL;
|
|
||||||
|
|
||||||
requiredMem = (((requiredMem >> 23) + 1) << 23);
|
|
||||||
requiredMem += 2 * step;
|
|
||||||
if (requiredMem > maxMemory) requiredMem = maxMemory;
|
|
||||||
|
|
||||||
while (!testmem) {
|
|
||||||
requiredMem -= step;
|
|
||||||
testmem = malloc((size_t)requiredMem);
|
|
||||||
}
|
|
||||||
|
|
||||||
free(testmem);
|
|
||||||
return (size_t)(requiredMem - step);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void DiB_fillNoise(void* buffer, size_t length)
|
|
||||||
{
|
{
|
||||||
unsigned acc = PRIME1;
|
unsigned acc = PRIME1;
|
||||||
size_t p=0;;
|
size_t p=0;;
|
||||||
@ -672,34 +607,36 @@ typedef struct
|
|||||||
} EStats_ress_t;
|
} EStats_ress_t;
|
||||||
|
|
||||||
|
|
||||||
static void DiB_countEStats(EStats_ress_t esr,
|
static void ZDICT_countEStats(EStats_ress_t esr,
|
||||||
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
|
U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount,
|
||||||
const void* src, size_t srcSize)
|
const void* src, size_t srcSize)
|
||||||
{
|
{
|
||||||
const BYTE* bytePtr;
|
const BYTE* bytePtr;
|
||||||
const U32* u32Ptr;
|
const U32* u32Ptr;
|
||||||
|
seqStore_t seqStore;
|
||||||
|
|
||||||
if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
|
if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */
|
||||||
ZSTD_copyCCtx(esr.zc, esr.ref);
|
ZSTD_copyCCtx(esr.zc, esr.ref);
|
||||||
ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
|
ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize);
|
||||||
|
seqStore = ZSTD_copySeqStore(esr.zc);
|
||||||
|
|
||||||
/* count stats */
|
/* count stats */
|
||||||
for(bytePtr = esr.zc->seqStore.litStart; bytePtr < esr.zc->seqStore.lit; bytePtr++)
|
for(bytePtr = seqStore.litStart; bytePtr < seqStore.lit; bytePtr++)
|
||||||
countLit[*bytePtr]++;
|
countLit[*bytePtr]++;
|
||||||
for(u32Ptr = esr.zc->seqStore.offsetStart; u32Ptr < esr.zc->seqStore.offset; u32Ptr++) {
|
for(u32Ptr = seqStore.offsetStart; u32Ptr < seqStore.offset; u32Ptr++) {
|
||||||
BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
|
BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1;
|
||||||
if (*u32Ptr==0) offcode=0;
|
if (*u32Ptr==0) offcode=0;
|
||||||
offsetcodeCount[offcode]++;
|
offsetcodeCount[offcode]++;
|
||||||
}
|
}
|
||||||
for(bytePtr = esr.zc->seqStore.matchLengthStart; bytePtr < esr.zc->seqStore.matchLength; bytePtr++)
|
for(bytePtr = seqStore.matchLengthStart; bytePtr < seqStore.matchLength; bytePtr++)
|
||||||
matchlengthCount[*bytePtr]++;
|
matchlengthCount[*bytePtr]++;
|
||||||
for(bytePtr = esr.zc->seqStore.litLengthStart; bytePtr < esr.zc->seqStore.litLength; bytePtr++)
|
for(bytePtr = seqStore.litLengthStart; bytePtr < seqStore.litLength; bytePtr++)
|
||||||
litlengthCount[*bytePtr]++;
|
litlengthCount[*bytePtr]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define OFFCODE_MAX 18
|
#define OFFCODE_MAX 18 /* only applicable to first block */
|
||||||
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
||||||
unsigned compressionLevel,
|
unsigned compressionLevel,
|
||||||
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
||||||
const void* dictBuffer, size_t dictBufferSize)
|
const void* dictBuffer, size_t dictBufferSize)
|
||||||
@ -734,7 +671,7 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|||||||
|
|
||||||
/* collect stats on all files */
|
/* collect stats on all files */
|
||||||
for (u=0; u<nbFiles; u++) {
|
for (u=0; u<nbFiles; u++) {
|
||||||
DiB_countEStats(esr,
|
ZDICT_countEStats(esr,
|
||||||
countLit, offcodeCount, matchLengthCount, litlengthCount,
|
countLit, offcodeCount, matchLengthCount, litlengthCount,
|
||||||
(const char*)srcBuffer + pos, fileSizes[u]);
|
(const char*)srcBuffer + pos, fileSizes[u]);
|
||||||
pos += fileSizes[u];
|
pos += fileSizes[u];
|
||||||
@ -794,33 +731,16 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void DiB_saveDict(const char* dictFileName,
|
|
||||||
const void* buff, size_t buffSize)
|
|
||||||
{
|
|
||||||
FILE* f;
|
|
||||||
size_t n;
|
|
||||||
|
|
||||||
f = fopen(dictFileName, "wb");
|
|
||||||
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
|
|
||||||
|
|
||||||
n = fwrite(buff, 1, buffSize, f);
|
|
||||||
if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName)
|
|
||||||
|
|
||||||
n = (size_t)fclose(f);
|
|
||||||
if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define DIB_FASTSEGMENTSIZE 64
|
#define DIB_FASTSEGMENTSIZE 64
|
||||||
/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
|
/*! ZDICT_fastSampling (based on an idea by Giuseppe Ottaviano)
|
||||||
Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer
|
Fill `dictBuffer` with stripes of size DIB_FASTSEGMENTSIZE from `samplesBuffer`
|
||||||
up to @dictSize.
|
up to `dictSize`.
|
||||||
Filling starts from the end of @dictBuffer, down to maximum possible.
|
Filling starts from the end of `dictBuffer`, down to maximum possible.
|
||||||
if @dictSize is not a multiply of DIB_FASTSEGMENTSIZE, some bytes at beginning of @dictBuffer won't be used.
|
if `dictSize` is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of `dictBuffer` won't be used.
|
||||||
@return : amount of data written into @dictBuffer
|
@return : amount of data written into `dictBuffer`
|
||||||
or an error Code (if @dictSize or @samplesSize too small)
|
or an error code
|
||||||
*/
|
*/
|
||||||
static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
|
static size_t ZDICT_fastSampling(void* dictBuffer, size_t dictSize,
|
||||||
const void* samplesBuffer, size_t samplesSize)
|
const void* samplesBuffer, size_t samplesSize)
|
||||||
{
|
{
|
||||||
char* dstPtr = (char*)dictBuffer + dictSize;
|
char* dstPtr = (char*)dictBuffer + dictSize;
|
||||||
@ -851,10 +771,10 @@ static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static size_t DiB_trainFromBuffer_internal(
|
size_t ZDICT_trainFromBuffer_unsafe(
|
||||||
void* dictBuffer, size_t maxDictSize,
|
void* dictBuffer, size_t maxDictSize,
|
||||||
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
||||||
DiB_params_t params)
|
ZDICT_params_t params)
|
||||||
{
|
{
|
||||||
const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
|
const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
|
||||||
dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
||||||
@ -869,14 +789,14 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
|
|
||||||
/* init */
|
/* init */
|
||||||
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
|
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
|
||||||
if (!dictList) { DISPLAYLEVEL(1, "not enough memory for DiB_trainFromBuffer"); return ERROR(memory_allocation); }
|
if (!dictList) return ERROR(memory_allocation);
|
||||||
DiB_initDictItem(dictList);
|
ZDICT_initDictItem(dictList);
|
||||||
if (selectivity==0) selectivity = g_selectivity_default;
|
if (selectivity==0) selectivity = g_selectivity_default;
|
||||||
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
||||||
|
|
||||||
/* select stripes */
|
/* build dictionary */
|
||||||
if (selectivity>1) {
|
if (selectivity>1) { /* selectivity == 1 => fast mode */
|
||||||
DiB_trainBuffer(dictList, dictListSize,
|
ZDICT_trainBuffer(dictList, dictListSize,
|
||||||
samplesBuffer, sBuffSize,
|
samplesBuffer, sBuffSize,
|
||||||
sampleSizes, nbSamples,
|
sampleSizes, nbSamples,
|
||||||
selectivity, (U32)targetDictSize);
|
selectivity, (U32)targetDictSize);
|
||||||
@ -885,7 +805,7 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
if (g_displayLevel>= 3) {
|
if (g_displayLevel>= 3) {
|
||||||
const U32 nb = 25;
|
const U32 nb = 25;
|
||||||
U32 u;
|
U32 u;
|
||||||
U32 dictContentSize = DiB_dictSize(dictList);
|
U32 dictContentSize = ZDICT_dictSize(dictList);
|
||||||
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
|
||||||
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
DISPLAYLEVEL(3, "list %u best segments \n", nb);
|
||||||
for (u=1; u<=nb; u++) {
|
for (u=1; u<=nb; u++) {
|
||||||
@ -894,13 +814,13 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
U32 d = MIN(40, l);
|
U32 d = MIN(40, l);
|
||||||
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
||||||
u, l, p, dictList[u].savings);
|
u, l, p, dictList[u].savings);
|
||||||
DiB_printHex(3, (const char*)samplesBuffer+p, d);
|
ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
|
||||||
DISPLAYLEVEL(3, "| \n");
|
DISPLAYLEVEL(3, "| \n");
|
||||||
} } }
|
} } }
|
||||||
|
|
||||||
/* create dictionary */
|
/* create dictionary */
|
||||||
{
|
{
|
||||||
U32 dictContentSize = DiB_dictSize(dictList);
|
U32 dictContentSize = ZDICT_dictSize(dictList);
|
||||||
size_t hSize;
|
size_t hSize;
|
||||||
BYTE* ptr;
|
BYTE* ptr;
|
||||||
U32 u;
|
U32 u;
|
||||||
@ -918,7 +838,7 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
||||||
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
||||||
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
||||||
dictContentSize = (U32)DiB_fastSampling((char*)dictBuffer + g_provision_entropySize,
|
dictContentSize = (U32)ZDICT_fastSampling((char*)dictBuffer + g_provision_entropySize,
|
||||||
targetDictSize, samplesBuffer, sBuffSize);
|
targetDictSize, samplesBuffer, sBuffSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -929,7 +849,7 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
/* entropic tables */
|
/* entropic tables */
|
||||||
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
||||||
DISPLAYLEVEL(2, "statistics ... \n");
|
DISPLAYLEVEL(2, "statistics ... \n");
|
||||||
hSize += DiB_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
|
hSize += ZDICT_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
|
||||||
compressionLevel,
|
compressionLevel,
|
||||||
samplesBuffer, sampleSizes, nbSamples,
|
samplesBuffer, sampleSizes, nbSamples,
|
||||||
(char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
|
(char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
|
||||||
@ -945,76 +865,38 @@ static size_t DiB_trainFromBuffer_internal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* issue : samplesBuffer need to be followed by a noisy guard band.
|
size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
|
||||||
* work around : duplicate the buffer, and add the noise ? */
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
||||||
size_t DiB_trainFromBuffer(void* dictBuffer, size_t maxDictSize,
|
ZDICT_params_t params)
|
||||||
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
|
||||||
DiB_params_t params)
|
|
||||||
{
|
{
|
||||||
size_t sBuffSize;
|
size_t sBuffSize;
|
||||||
void* newBuff;
|
void* newBuff;
|
||||||
size_t result;
|
size_t result;
|
||||||
|
|
||||||
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
|
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += samplesSizes[u]; }
|
||||||
newBuff = malloc(sBuffSize + NOISELENGTH);
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
||||||
if (!newBuff) return ERROR(memory_allocation);
|
if (!newBuff) return ERROR(memory_allocation);
|
||||||
|
|
||||||
memcpy(newBuff, samplesBuffer, sBuffSize);
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
||||||
DiB_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
|
|
||||||
result = DiB_trainFromBuffer_internal(dictBuffer, maxDictSize,
|
result = ZDICT_trainFromBuffer_unsafe(dictBuffer, dictBufferCapacity,
|
||||||
newBuff, sampleSizes, nbSamples,
|
newBuff, samplesSizes, nbSamples,
|
||||||
params);
|
params);
|
||||||
free(newBuff);
|
free(newBuff);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
/* issue : samplesBuffer needs to be followed by a noisy guard band.
|
||||||
const char** fileNamesTable, unsigned nbFiles,
|
* work around : duplicate the buffer, and add the noise ? */
|
||||||
DiB_params_t params)
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
||||||
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
|
||||||
{
|
{
|
||||||
void* srcBuffer;
|
ZDICT_params_t params;
|
||||||
size_t benchedSize;
|
memset(&params, 0, sizeof(params));
|
||||||
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
|
||||||
unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
|
samplesBuffer, samplesSizes, nbSamples,
|
||||||
void* dictBuffer = malloc(maxDictSize);
|
|
||||||
size_t dictSize;
|
|
||||||
int result = 0;
|
|
||||||
|
|
||||||
/* init */
|
|
||||||
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
|
||||||
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
|
|
||||||
if (benchedSize < totalSizeToLoad)
|
|
||||||
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
|
||||||
|
|
||||||
/* Memory allocation & restrictions */
|
|
||||||
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
|
|
||||||
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
|
||||||
|
|
||||||
/* Load input buffer */
|
|
||||||
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
|
||||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
|
||||||
|
|
||||||
/* call buffer version */
|
|
||||||
dictSize = DiB_trainFromBuffer_internal(dictBuffer, maxDictSize,
|
|
||||||
srcBuffer, fileSizes, nbFiles,
|
|
||||||
params);
|
params);
|
||||||
if (DiB_isError(dictSize))
|
|
||||||
{
|
|
||||||
DISPLAYLEVEL(1, "dictionary training failed : %s", DiB_getErrorName(dictSize)); /* should not happen */
|
|
||||||
result = 1;
|
|
||||||
goto _cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* save dict */
|
|
||||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
|
||||||
DiB_saveDict(dictFileName, dictBuffer, dictSize);
|
|
||||||
|
|
||||||
/* clean up */
|
|
||||||
_cleanup:
|
|
||||||
free(srcBuffer);
|
|
||||||
free(dictBuffer);
|
|
||||||
free(fileSizes);
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
67
lib/dictBuilder.h
Normal file
67
lib/dictBuilder.h
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
/*
|
||||||
|
dictBuilder header file
|
||||||
|
Copyright (C) Yann Collet 2016
|
||||||
|
|
||||||
|
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
You can contact the author at :
|
||||||
|
- Zstd homepage : https://www.zstd.net
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef DICTBUILDER_H_001
|
||||||
|
#define DICTBUILDER_H_001
|
||||||
|
|
||||||
|
#if defined (__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Public functions
|
||||||
|
***************************************/
|
||||||
|
/*! ZDICT_trainFromBuffer() :
|
||||||
|
Train a dictionary from a memory buffer `samplesBuffer`,
|
||||||
|
where `nbSamples` samples have been stored concatenated.
|
||||||
|
The size of each sample is provided, in order, in the table `samplesSizes`.
|
||||||
|
Resulting dictionary will be saved into `dictBuffer`.
|
||||||
|
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
|
||||||
|
or an error code, which can be tested by ZDICT_isError().
|
||||||
|
*/
|
||||||
|
size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
|
||||||
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Helper functions
|
||||||
|
***************************************/
|
||||||
|
unsigned ZDICT_isError(size_t errorCode);
|
||||||
|
const char* ZDICT_getErrorName(size_t errorCode);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined (__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
104
lib/dictBuilder_static.h
Normal file
104
lib/dictBuilder_static.h
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
/*
|
||||||
|
dictBuilder header file
|
||||||
|
for static linking only
|
||||||
|
Copyright (C) Yann Collet 2016
|
||||||
|
|
||||||
|
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following disclaimer
|
||||||
|
in the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
You can contact the author at :
|
||||||
|
- Zstd source repository : https://www.zstd.net
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* This library is EXPERIMENTAL, below API is not yet stable */
|
||||||
|
|
||||||
|
#ifndef DICTBUILDER_STATIC_H_002
|
||||||
|
#define DICTBUILDER_STATIC_H_002
|
||||||
|
|
||||||
|
#if defined (__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Dependencies
|
||||||
|
***************************************/
|
||||||
|
#include "dictBuilder.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Public type
|
||||||
|
***************************************/
|
||||||
|
typedef struct {
|
||||||
|
unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */
|
||||||
|
unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */
|
||||||
|
} ZDICT_params_t;
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Public functions
|
||||||
|
***************************************/
|
||||||
|
/*! ZDICT_trainFromBuffer_advanced() :
|
||||||
|
Same as ZDICT_trainFromBuffer() with control over more parameters.
|
||||||
|
`parameters` is optional and can be provided with values set to 0 to mean "default".
|
||||||
|
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
|
||||||
|
or an error code, which can be tested by ZDICT_isError().
|
||||||
|
note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using ZDICT_setNotificationLevel()
|
||||||
|
*/
|
||||||
|
size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
|
||||||
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
||||||
|
ZDICT_params_t parameters);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Helper functions
|
||||||
|
***************************************/
|
||||||
|
/*! ZDICT_setNotificationLevel() :
|
||||||
|
Set amount of notification to be displayed on the console.
|
||||||
|
default : 0 = no console notification.
|
||||||
|
1 = errors; 2 = notifications; 3 = details; 4 = debug;
|
||||||
|
Note : not thread-safe (uses a global variable)
|
||||||
|
*/
|
||||||
|
void ZDICT_setNotificationLevel(unsigned l);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Private functions
|
||||||
|
***************************************/
|
||||||
|
/*! ZDICT_trainFromBuffer_unsafe() :
|
||||||
|
Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`.
|
||||||
|
note : `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
|
||||||
|
@return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
|
||||||
|
or an error code.
|
||||||
|
*/
|
||||||
|
size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
|
||||||
|
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
|
||||||
|
ZDICT_params_t parameters);
|
||||||
|
|
||||||
|
|
||||||
|
#if defined (__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* DICTBUILDER_STATIC_H_002 */
|
1905
lib/divsufsort.c
Normal file
1905
lib/divsufsort.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* config.h for libdivsufsort
|
* divsufsort.h for libdivsufsort-lite
|
||||||
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person
|
* Permission is hereby granted, free of charge, to any person
|
||||||
@ -24,60 +24,44 @@
|
|||||||
* OTHER DEALINGS IN THE SOFTWARE.
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _CONFIG_H
|
#ifndef _DIVSUFSORT_H
|
||||||
#define _CONFIG_H 1
|
#define _DIVSUFSORT_H 1
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
/** Define to the version of this package. **/
|
|
||||||
#define PROJECT_VERSION_FULL "2.0.1"
|
|
||||||
|
|
||||||
/** Define to 1 if you have the header files. **/
|
/*- Prototypes -*/
|
||||||
#define HAVE_INTTYPES_H 1
|
|
||||||
#define HAVE_STDDEF_H 1
|
|
||||||
#define HAVE_STDINT_H 1
|
|
||||||
#define HAVE_STDLIB_H 1
|
|
||||||
#define HAVE_STRING_H 1
|
|
||||||
#define HAVE_STRINGS_H 1
|
|
||||||
#define HAVE_MEMORY_H 1
|
|
||||||
#define HAVE_SYS_TYPES_H 1
|
|
||||||
|
|
||||||
/** for WinIO **/
|
/**
|
||||||
/* #undef HAVE_IO_H */
|
* Constructs the suffix array of a given string.
|
||||||
/* #undef HAVE_FCNTL_H */
|
* @param T[0..n-1] The input string.
|
||||||
/* #undef HAVE__SETMODE */
|
* @param SA[0..n-1] The output array of suffixes.
|
||||||
/* #undef HAVE_SETMODE */
|
* @param n The length of the given string.
|
||||||
/* #undef HAVE__FILENO */
|
* @param openMP enables OpenMP optimization.
|
||||||
/* #undef HAVE_FOPEN_S */
|
* @return 0 if no error occurred, -1 or -2 otherwise.
|
||||||
/* #undef HAVE__O_BINARY */
|
*/
|
||||||
/*
|
int
|
||||||
#ifndef HAVE__SETMODE
|
divsufsort(const unsigned char *T, int *SA, int n, int openMP);
|
||||||
# if HAVE_SETMODE
|
|
||||||
# define _setmode setmode
|
|
||||||
# define HAVE__SETMODE 1
|
|
||||||
# endif
|
|
||||||
# if HAVE__SETMODE && !HAVE__O_BINARY
|
|
||||||
# define _O_BINARY 0
|
|
||||||
# define HAVE__O_BINARY 1
|
|
||||||
# endif
|
|
||||||
#endif
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** for inline **/
|
/**
|
||||||
#ifndef INLINE
|
* Constructs the burrows-wheeler transformed string of a given string.
|
||||||
# define INLINE inline
|
* @param T[0..n-1] The input string.
|
||||||
#endif
|
* @param U[0..n-1] The output string. (can be T)
|
||||||
|
* @param A[0..n-1] The temporary array. (can be NULL)
|
||||||
/** for VC++ warning **/
|
* @param n The length of the given string.
|
||||||
#ifdef _MSC_VER
|
* @param num_indexes The length of secondary indexes array. (can be NULL)
|
||||||
#pragma warning(disable: 4127)
|
* @param indexes The secondary indexes array. (can be NULL)
|
||||||
#endif
|
* @param openMP enables OpenMP optimization.
|
||||||
|
* @return The primary index if no error occurred, -1 or -2 otherwise.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} /* extern "C" */
|
} /* extern "C" */
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
#endif /* _CONFIG_H */
|
#endif /* _DIVSUFSORT_H */
|
11
lib/mem.h
11
lib/mem.h
@ -119,11 +119,12 @@ MEM_STATIC unsigned MEM_isLittleEndian(void)
|
|||||||
|
|
||||||
#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
|
#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
|
||||||
|
|
||||||
/* violates C standard on structure alignment.
|
/* violates C standard, by lying about structure alignment.
|
||||||
Only use if no other choice to achieve best performance on target platform */
|
Only use if no other choice to achieve best performance on target platform */
|
||||||
MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
|
MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
|
||||||
MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
|
MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
|
||||||
MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
|
MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
|
||||||
|
/* MEM_readST() : reads one `size_t` from `memPtr` through a direct pointer cast. Returns `size_t` (not U64), i.e. the same type that is actually read. */
MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
|
||||||
|
|
||||||
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
|
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
|
||||||
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
|
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
|
||||||
@ -133,11 +134,12 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
|
|||||||
|
|
||||||
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
|
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
|
||||||
/* currently only defined for gcc and icc */
|
/* currently only defined for gcc and icc */
|
||||||
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
|
typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
|
||||||
|
|
||||||
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
|
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
|
||||||
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
|
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
|
||||||
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
|
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
|
||||||
|
/* MEM_readST() : reads one `size_t` from `ptr` through the packed `unalign` union. Returns `size_t` (not U64), i.e. the type of the `st` member being read. */
MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalign*)ptr)->st; }
|
||||||
|
|
||||||
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
|
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
|
||||||
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
|
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
|
||||||
@ -163,6 +165,11 @@ MEM_STATIC U64 MEM_read64(const void* memPtr)
|
|||||||
U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
|
U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MEM_STATIC size_t MEM_readST(const void* memPtr)
|
||||||
|
{
|
||||||
|
size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
|
||||||
|
}
|
||||||
|
|
||||||
MEM_STATIC void MEM_write16(void* memPtr, U16 value)
|
MEM_STATIC void MEM_write16(void* memPtr, U16 value)
|
||||||
{
|
{
|
||||||
memcpy(memPtr, &value, sizeof(value));
|
memcpy(memPtr, &value, sizeof(value));
|
||||||
|
@ -61,7 +61,7 @@ extern "C" {
|
|||||||
***************************************/
|
***************************************/
|
||||||
#define ZSTD_VERSION_MAJOR 0 /* for breaking interface changes */
|
#define ZSTD_VERSION_MAJOR 0 /* for breaking interface changes */
|
||||||
#define ZSTD_VERSION_MINOR 5 /* for new (non-breaking) interface capabilities */
|
#define ZSTD_VERSION_MINOR 5 /* for new (non-breaking) interface capabilities */
|
||||||
#define ZSTD_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */
|
#define ZSTD_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */
|
||||||
#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
|
#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
|
||||||
ZSTDLIB_API unsigned ZSTD_versionNumber (void);
|
ZSTDLIB_API unsigned ZSTD_versionNumber (void);
|
||||||
|
|
||||||
|
@ -48,7 +48,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Dependencies
|
* Dependencies
|
||||||
***************************************/
|
***************************************/
|
||||||
#include <stdlib.h> /* malloc */
|
#include <stdlib.h> /* malloc */
|
||||||
@ -59,46 +59,21 @@
|
|||||||
#include "zstd_internal.h"
|
#include "zstd_internal.h"
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Constants
|
* Constants
|
||||||
***************************************/
|
***************************************/
|
||||||
static const U32 g_searchStrength = 8;
|
static const U32 g_searchStrength = 8;
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Helper functions
|
* Helper functions
|
||||||
***************************************/
|
***************************************/
|
||||||
size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
|
size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Sequence storage
|
* Sequence storage
|
||||||
***************************************/
|
***************************************/
|
||||||
typedef struct {
|
|
||||||
void* buffer;
|
|
||||||
U32* offsetStart;
|
|
||||||
U32* offset;
|
|
||||||
BYTE* offCodeStart;
|
|
||||||
BYTE* offCode;
|
|
||||||
BYTE* litStart;
|
|
||||||
BYTE* lit;
|
|
||||||
BYTE* litLengthStart;
|
|
||||||
BYTE* litLength;
|
|
||||||
BYTE* matchLengthStart;
|
|
||||||
BYTE* matchLength;
|
|
||||||
BYTE* dumpsStart;
|
|
||||||
BYTE* dumps;
|
|
||||||
/* opt */
|
|
||||||
U32* matchLengthFreq;
|
|
||||||
U32* litLengthFreq;
|
|
||||||
U32* litFreq;
|
|
||||||
U32* offCodeFreq;
|
|
||||||
U32 matchLengthSum;
|
|
||||||
U32 litLengthSum;
|
|
||||||
U32 litSum;
|
|
||||||
U32 offCodeSum;
|
|
||||||
} seqStore_t;
|
|
||||||
|
|
||||||
static void ZSTD_resetFreqs(seqStore_t* ssPtr)
|
static void ZSTD_resetFreqs(seqStore_t* ssPtr)
|
||||||
{
|
{
|
||||||
unsigned u;
|
unsigned u;
|
||||||
@ -129,7 +104,7 @@ static void ZSTD_resetSeqStore(seqStore_t* ssPtr)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Context memory management
|
* Context memory management
|
||||||
***************************************/
|
***************************************/
|
||||||
struct ZSTD_CCtx_s
|
struct ZSTD_CCtx_s
|
||||||
@ -159,7 +134,6 @@ struct ZSTD_CCtx_s
|
|||||||
FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
|
FSE_CTable litlengthCTable [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
ZSTD_CCtx* ZSTD_createCCtx(void)
|
ZSTD_CCtx* ZSTD_createCCtx(void)
|
||||||
{
|
{
|
||||||
return (ZSTD_CCtx*) calloc(1, sizeof(ZSTD_CCtx));
|
return (ZSTD_CCtx*) calloc(1, sizeof(ZSTD_CCtx));
|
||||||
@ -172,14 +146,19 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
|
|||||||
return 0; /* reserved as a potential error code in the future */
|
return 0; /* reserved as a potential error code in the future */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*! ZSTD_copySeqStore() :
    Returns, by value, a copy of the `seqStore` member of `ctx`.
    The copy is shallow : any pointer members of the returned structure
    keep the same values as in `ctx`, so they refer to the same memory. */
seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx)
|
||||||
|
{
|
||||||
|
return ctx->seqStore;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static unsigned ZSTD_highbit(U32 val);
|
static unsigned ZSTD_highbit(U32 val);
|
||||||
|
|
||||||
#define CLAMP(val,min,max) { if (val<min) val=min; else if (val>max) val=max; }
|
#define CLAMP(val,min,max) { if (val<min) val=min; else if (val>max) val=max; }
|
||||||
|
|
||||||
/** ZSTD_validateParams()
|
/** ZSTD_validateParams() :
|
||||||
correct params value to remain within authorized range
|
correct params value to remain within authorized range,
|
||||||
optimize for srcSize if srcSize > 0 */
|
optimize for `srcSize` if srcSize > 0 */
|
||||||
void ZSTD_validateParams(ZSTD_parameters* params)
|
void ZSTD_validateParams(ZSTD_parameters* params)
|
||||||
{
|
{
|
||||||
const U32 btPlus = (params->strategy == ZSTD_btlazy2) || (params->strategy == ZSTD_opt_bt);
|
const U32 btPlus = (params->strategy == ZSTD_btlazy2) || (params->strategy == ZSTD_opt_bt);
|
||||||
@ -800,33 +779,9 @@ MEM_STATIC void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const B
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Match length counter
|
* Match length counter
|
||||||
***************************************/
|
***************************************/
|
||||||
static size_t ZSTD_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; }
|
|
||||||
|
|
||||||
static unsigned ZSTD_highbit(U32 val)
|
|
||||||
{
|
|
||||||
# if defined(_MSC_VER) /* Visual */
|
|
||||||
unsigned long r=0;
|
|
||||||
_BitScanReverse(&r, val);
|
|
||||||
return (unsigned)r;
|
|
||||||
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
|
|
||||||
return 31 - __builtin_clz(val);
|
|
||||||
# else /* Software version */
|
|
||||||
static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
|
|
||||||
U32 v = val;
|
|
||||||
int r;
|
|
||||||
v |= v >> 1;
|
|
||||||
v |= v >> 2;
|
|
||||||
v |= v >> 4;
|
|
||||||
v |= v >> 8;
|
|
||||||
v |= v >> 16;
|
|
||||||
r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];
|
|
||||||
return r;
|
|
||||||
# endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned ZSTD_NbCommonBytes (register size_t val)
|
static unsigned ZSTD_NbCommonBytes (register size_t val)
|
||||||
{
|
{
|
||||||
if (MEM_isLittleEndian()) {
|
if (MEM_isLittleEndian()) {
|
||||||
@ -891,12 +846,11 @@ static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLim
|
|||||||
const BYTE* const pStart = pIn;
|
const BYTE* const pStart = pIn;
|
||||||
|
|
||||||
while ((pIn<pInLimit-(sizeof(size_t)-1))) {
|
while ((pIn<pInLimit-(sizeof(size_t)-1))) {
|
||||||
size_t diff = ZSTD_read_ARCH(pMatch) ^ ZSTD_read_ARCH(pIn);
|
size_t diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
|
||||||
if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
|
if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
|
||||||
pIn += ZSTD_NbCommonBytes(diff);
|
pIn += ZSTD_NbCommonBytes(diff);
|
||||||
return (size_t)(pIn - pStart);
|
return (size_t)(pIn - pStart);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
|
if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
|
||||||
if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
|
if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
|
||||||
if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
|
if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
|
||||||
@ -904,7 +858,7 @@ static size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLim
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** ZSTD_count_2segments() :
|
/** ZSTD_count_2segments() :
|
||||||
* can count match length with ip & match in potentially 2 different segments.
|
* can count match length with `ip` & `match` in 2 different segments.
|
||||||
* convention : on reaching mEnd, match count continue starting from iStart
|
* convention : on reaching mEnd, match count continue starting from iStart
|
||||||
*/
|
*/
|
||||||
static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
|
static size_t ZSTD_count_2segments(const BYTE* ip, const BYTE* match, const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
|
||||||
|
@ -32,11 +32,7 @@
|
|||||||
#ifndef ZSTD_CCOMMON_H_MODULE
|
#ifndef ZSTD_CCOMMON_H_MODULE
|
||||||
#define ZSTD_CCOMMON_H_MODULE
|
#define ZSTD_CCOMMON_H_MODULE
|
||||||
|
|
||||||
#if defined (__cplusplus)
|
/*-*************************************
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* *************************************
|
|
||||||
* Dependencies
|
* Dependencies
|
||||||
***************************************/
|
***************************************/
|
||||||
#include "mem.h"
|
#include "mem.h"
|
||||||
@ -44,14 +40,14 @@ extern "C" {
|
|||||||
#include "zstd_static.h"
|
#include "zstd_static.h"
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Common macros
|
* Common macros
|
||||||
***************************************/
|
***************************************/
|
||||||
#define MIN(a,b) ((a)<(b) ? (a) : (b))
|
#define MIN(a,b) ((a)<(b) ? (a) : (b))
|
||||||
#define MAX(a,b) ((a)>(b) ? (a) : (b))
|
#define MAX(a,b) ((a)>(b) ? (a) : (b))
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Common constants
|
* Common constants
|
||||||
***************************************/
|
***************************************/
|
||||||
#define ZSTD_MAGICNUMBER 0xFD2FB525 /* v0.5 */
|
#define ZSTD_MAGICNUMBER 0xFD2FB525 /* v0.5 */
|
||||||
@ -130,9 +126,58 @@ MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, size_t length)
|
|||||||
while (op < oend);
|
while (op < oend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MEM_STATIC unsigned ZSTD_highbit(U32 val)
|
||||||
#if defined (__cplusplus)
|
{
|
||||||
|
# if defined(_MSC_VER) /* Visual */
|
||||||
|
unsigned long r=0;
|
||||||
|
_BitScanReverse(&r, val);
|
||||||
|
return (unsigned)r;
|
||||||
|
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
|
||||||
|
return 31 - __builtin_clz(val);
|
||||||
|
# else /* Software version */
|
||||||
|
static const int DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
|
||||||
|
U32 v = val;
|
||||||
|
int r;
|
||||||
|
v |= v >> 1;
|
||||||
|
v |= v >> 2;
|
||||||
|
v |= v >> 4;
|
||||||
|
v |= v >> 8;
|
||||||
|
v |= v >> 16;
|
||||||
|
r = DeBruijnClz[(U32)(v * 0x07C4ACDDU) >> 27];
|
||||||
|
return r;
|
||||||
|
# endif
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
|
/*-*******************************************
|
||||||
|
* Private interfaces
|
||||||
|
*********************************************/
|
||||||
|
typedef struct {
|
||||||
|
void* buffer;
|
||||||
|
U32* offsetStart;
|
||||||
|
U32* offset;
|
||||||
|
BYTE* offCodeStart;
|
||||||
|
BYTE* offCode;
|
||||||
|
BYTE* litStart;
|
||||||
|
BYTE* lit;
|
||||||
|
BYTE* litLengthStart;
|
||||||
|
BYTE* litLength;
|
||||||
|
BYTE* matchLengthStart;
|
||||||
|
BYTE* matchLength;
|
||||||
|
BYTE* dumpsStart;
|
||||||
|
BYTE* dumps;
|
||||||
|
/* opt */
|
||||||
|
U32* matchLengthFreq;
|
||||||
|
U32* litLengthFreq;
|
||||||
|
U32* litFreq;
|
||||||
|
U32* offCodeFreq;
|
||||||
|
U32 matchLengthSum;
|
||||||
|
U32 litLengthSum;
|
||||||
|
U32 litSum;
|
||||||
|
U32 offCodeSum;
|
||||||
|
} seqStore_t;
|
||||||
|
|
||||||
|
seqStore_t ZSTD_copySeqStore(const ZSTD_CCtx* ctx);
|
||||||
|
|
||||||
|
|
||||||
#endif /* ZSTD_CCOMMON_H_MODULE */
|
#endif /* ZSTD_CCOMMON_H_MODULE */
|
||||||
|
@ -205,7 +205,7 @@ ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t ds
|
|||||||
* Block functions
|
* Block functions
|
||||||
****************************************/
|
****************************************/
|
||||||
/*! Block functions produce and decode raw zstd blocks, without frame metadata.
|
/*! Block functions produce and decode raw zstd blocks, without frame metadata.
|
||||||
User will have to save and regenerate necessary information to regenerate data, such as block sizes.
|
User is responsible for saving the information required to regenerate data, such as block sizes.
|
||||||
|
|
||||||
A few rules to respect :
|
A few rules to respect :
|
||||||
- Uncompressed block size must be <= 128 KB
|
- Uncompressed block size must be <= 128 KB
|
||||||
@ -222,18 +222,17 @@ ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t ds
|
|||||||
+ ZSTD_decompressBlock() doesn't accept uncompressed data as input !!
|
+ ZSTD_decompressBlock() doesn't accept uncompressed data as input !!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
||||||
size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Error management
|
* Error management
|
||||||
***************************************/
|
***************************************/
|
||||||
#include "error_public.h"
|
#include "error_public.h"
|
||||||
/*! ZSTD_getErrorCode() :
|
/*! ZSTD_getErrorCode() :
|
||||||
convert a `size_t` function result into a `ZSTD_error_code` enum type,
|
convert a `size_t` function result into a `ZSTD_error_code` enum type,
|
||||||
which can be used to compare directly with enum list within "error_public.h" */
|
which can be used to compare directly with enum list published into "error_public.h" */
|
||||||
ZSTD_ErrorCode ZSTD_getError(size_t code);
|
ZSTD_ErrorCode ZSTD_getError(size_t code);
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user