diff --git a/NEWS b/NEWS index f146eb14..cfbd6867 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,6 @@ v0.5.0 Changed streaming & dictionary API +New : dictionary builder utility v0.4.7 Improved : small compression speed improvement in HC mode diff --git a/dictBuilder/COPYING b/dictBuilder/COPYING new file mode 100644 index 00000000..d159169d --- /dev/null +++ b/dictBuilder/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/dictBuilder/Makefile b/dictBuilder/Makefile new file mode 100644 index 00000000..c2ebdb88 --- /dev/null +++ b/dictBuilder/Makefile @@ -0,0 +1,67 @@ +# ########################################################################## +# Dict Builder - Makefile +# Copyright (C) Yann Collet 2015 +# +# GPL v2 License +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# You can contact the author at : +# - ZSTD source repository : http://code.google.com/p/zstd/ +# - Public forum : https://groups.google.com/forum/#!forum/lz4c +# ########################################################################## + +CPPFLAGS= -I../lib +CFLAGS ?= -O3 +CFLAGS += -std=c99 -Wall -Wextra -Wshadow -Wcast-qual -Wcast-align -Wundef -Wstrict-prototypes -Wstrict-aliasing=1 +FLAGS = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(MOREFLAGS) + +ZSTDDIR = ../lib + + +# Define *.exe as extension for Windows systems +ifneq (,$(filter Windows%,$(OS))) +EXT =.exe +VOID = nul +else +EXT = +VOID = /dev/null +endif + + +.PHONY: default all test + +default: dictBuilder + +all: dictBuilder + +dictBuilder: dictBuilder.c dibcli.c divsufsort.c sssort.c trsort.c $(ZSTDDIR)/huff0.c $(ZSTDDIR)/fse.c $(ZSTDDIR)/zstd_decompress.c + $(CC) $(FLAGS) $^ -o $@$(EXT) + +clean: + @rm -f core *.o tmp* result* *.gcda \ + dictBuilder$(EXT) + @echo Cleaning completed + +test: clean dictBuilder + +clangtest: CC = clang +clangtest: CFLAGS += -Werror +clangtest: clean dictBuilder + +gpptest: CC = g++ +gpptest: CFLAGS=-O3 -Wall -Wextra -Wshadow -Wcast-align -Wcast-qual -Wundef -Werror +gpptest: clean dictBuilder + diff --git a/dictBuilder/config.h b/dictBuilder/config.h new file mode 100644 index 00000000..c2925d33 --- /dev/null +++ b/dictBuilder/config.h @@ -0,0 +1,83 @@ +/* + * config.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _CONFIG_H +#define _CONFIG_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** Define to the version of this package. **/ +#define PROJECT_VERSION_FULL "2.0.1" + +/** Define to 1 if you have the header files. **/ +#define HAVE_INTTYPES_H 1 +#define HAVE_STDDEF_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_SYS_TYPES_H 1 + +/** for WinIO **/ +/* #undef HAVE_IO_H */ +/* #undef HAVE_FCNTL_H */ +/* #undef HAVE__SETMODE */ +/* #undef HAVE_SETMODE */ +/* #undef HAVE__FILENO */ +/* #undef HAVE_FOPEN_S */ +/* #undef HAVE__O_BINARY */ +/* +#ifndef HAVE__SETMODE +# if HAVE_SETMODE +# define _setmode setmode +# define HAVE__SETMODE 1 +# endif +# if HAVE__SETMODE && !HAVE__O_BINARY +# define _O_BINARY 0 +# define HAVE__O_BINARY 1 +# endif +#endif +*/ + +/** for inline **/ +#ifndef INLINE +# define INLINE inline +#endif + +/** for VC++ warning **/ +#ifdef _MSC_VER +#pragma warning(disable: 4127) +#endif + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _CONFIG_H */ diff --git a/dictBuilder/dibcli.c b/dictBuilder/dibcli.c new file mode 100644 index 00000000..50369c22 --- /dev/null +++ b/dictBuilder/dibcli.c @@ -0,0 +1,256 @@ +/* + dibcli - Command Line Interface (cli) for Dictionary Builder + Copyright (C) Yann Collet 2016 + + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + You can contact the author at : + - zstd source repository : https://github.com/Cyan4973/zstd + - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +/************************************** +* Compiler Options +**************************************/ +#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */ +#define _POSIX_SOURCE 1 /* triggers fileno() within on unix */ + + +/************************************** +* Includes +**************************************/ +#include /* fprintf, getchar */ +#include /* exit, calloc, free */ +#include /* strcmp, strlen */ + +#include "dictBuilder.h" + + +/************************************** +* OS-specific Includes +**************************************/ +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) +# include /* _O_BINARY */ +# include /* _setmode, _isatty */ +# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) +# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) +#else +# include /* isatty */ +# define SET_BINARY_MODE(file) +# define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) +#endif + + +/************************************** +* Constants +**************************************/ +#define PROGRAM_DESCRIPTION "Dictionary builder" +#ifndef PROGRAM_VERSION +# define QUOTE(str) #str +# define EXP_Q(str) QUOTE(str) +# define PROGRAM_VERSION "v" EXP_Q(DiB_VERSION_MAJOR) "." EXP_Q(DiB_VERSION_MINOR) "." EXP_Q(DiB_VERSION_RELEASE) +#endif +#define AUTHOR "Yann Collet" +#define WELCOME_MESSAGE "*** %s %s %i-bits, by %s ***\n", PROGRAM_DESCRIPTION, PROGRAM_VERSION, (int)(sizeof(void*)*8), AUTHOR + +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +static const unsigned selectionLevelDefault = 9; /* determined experimentally */ +static const unsigned maxDictSizeDefault = 110 KB; +static const char* dictFileNameDefault = "dictionary"; + + +/************************************** +* Display Macros +**************************************/ +#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static FILE* displayOut; +static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information + + +/************************************** +* Exceptions +**************************************/ +#define DEBUG 0 +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAYLEVEL(1, "Error %i : ", error); \ + DISPLAYLEVEL(1, __VA_ARGS__); \ + DISPLAYLEVEL(1, "\n"); \ + exit(error); \ +} + + +/************************************** +* Command Line +**************************************/ +static int usage(const char* programName) +{ + DISPLAY( "Usage :\n"); + DISPLAY( " %s [arg] [filenames]\n", programName); + DISPLAY( "\n"); + DISPLAY( "Arguments :\n"); + DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault); + DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault); + DISPLAY( " -h/-H : display help/long help and exit\n"); + return 0; +} + +static int usage_advanced(const char* programName) +{ + DISPLAY(WELCOME_MESSAGE); + usage(programName); + DISPLAY( "\n"); + DISPLAY( "Advanced arguments :\n"); + DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault); + DISPLAY( " -V : display Version number and exit\n"); + DISPLAY( " -v : verbose mode\n"); + DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n"); + return 0; +} + +static int badusage(const char* programName) +{ + DISPLAYLEVEL(1, "Incorrect parameters\n"); + if (displayLevel >= 1) usage(programName); + return 1; +} + + +static void waitEnter(void) +{ + int unused; + DISPLAY("Press enter to continue...\n"); + unused = getchar(); + (void)unused; +} + + +int main(int argCount, const char** argv) +{ + int i, + main_pause=0, + operationResult=0, + nextArgumentIsMaxDict=0, + nextArgumentIsDictFileName=0; + size_t maxDictSize = maxDictSizeDefault; + unsigned selectionLevel = selectionLevelDefault; + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */ + unsigned filenameIdx = 0; + const char* programName = argv[0]; + const char* dictFileName = dictFileNameDefault; + + /* init */ + displayOut = stderr; /* unfortunately, cannot be set at declaration */ + if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n"); + /* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */ + for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } } + programName += i; + + /* command switches */ + for(i=1; i='0') && (*argument<='9')) + maxDictSize = maxDictSize * 10 + (*argument - '0'), argument++; + if (*argument=='k' || *argument=='K') + maxDictSize <<= 10; + continue; + } + + /* long commands (--long-word) */ + if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; } + if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); } + if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; } + if (!strcmp(argument, "--quiet")) { displayLevel--; continue; } + if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } + + /* Decode commands (note : aggregated commands are allowed) */ + if (argument[0]=='-') { + argument++; + + while (argument[0]!=0) { + /* selection Level */ + if ((*argument>='0') && (*argument<='9')) { + selectionLevel = 0; + while ((*argument >= '0') && (*argument <= '9')) { + selectionLevel *= 10; + selectionLevel += *argument - '0'; + argument++; + } + continue; + } + + switch(argument[0]) + { + /* Display help */ + case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */ + case 'H': + case 'h': displayOut=stdout; return usage_advanced(programName); + + /* Verbose mode */ + case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break; + + /* Quiet mode */ + case 'q': displayLevel--; argument++; break; + + /* dictionary name */ + case 'o': nextArgumentIsDictFileName=1; argument++; break; + + /* Pause at the end (hidden option) */ + case 'p': main_pause=1; argument++; break; + + /* unknown command */ + default : return badusage(programName); + } } + continue; + } + + /* add filename to list */ + filenameTable[filenameIdx++] = argument; + } + + /* Welcome message (if verbose) */ + DISPLAYLEVEL(3, WELCOME_MESSAGE); + + if (filenameIdx==0) return badusage(programName); + + /* building ... */ + DiB_setDisplayLevel(displayLevel); + operationResult = DiB_trainFiles(dictFileName, maxDictSize, filenameTable, filenameIdx, selectionLevel); + + if (main_pause) waitEnter(); + free((void*)filenameTable); + return operationResult; +} diff --git a/dictBuilder/dictBuilder.c b/dictBuilder/dictBuilder.c new file mode 100644 index 00000000..ed41d7c5 --- /dev/null +++ b/dictBuilder/dictBuilder.c @@ -0,0 +1,946 @@ +/* + dictBuilder.c + Copyright (C) Yann Collet 2016 + + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + You can contact the author at : + - zstd source repository : https://github.com/Cyan4973/zstd + - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/* ************************************** +* Compiler Options +****************************************/ +/* Disable some Visual warning messages */ +#ifdef _MSC_VER +# define _CRT_SECURE_NO_WARNINGS /* fopen */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +/* Unix Large Files support (>4GB) */ +#define _FILE_OFFSET_BITS 64 +#if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ +# define _LARGEFILE_SOURCE +#elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ +# define _LARGEFILE64_SOURCE +#endif + +/* S_ISREG & gettimeofday() are not supported by MSVC */ +#if defined(_MSC_VER) || defined(_WIN32) +# define BMK_LEGACY_TIMER 1 +#endif + + +/* ************************************* +* Includes +***************************************/ +#include /* malloc, free */ +#include /* memset */ +#include /* fprintf, fopen, ftello64 */ +#include /* stat64 */ +#include /* stat64 */ +#include /* clock */ + +#include "mem.h" /* read */ +#include "divsufsort.h" +#include "dictBuilder.h" +#include "zstd_compress.c" +#include "huff0_static.h" + + +/* ************************************* +* Compiler specifics +***************************************/ +#if !defined(S_ISREG) +# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) +#endif + +#ifdef _MSC_VER +#define snprintf sprintf_s +#endif + + +/* ************************************* +* Constants +***************************************/ +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#define DICTLISTSIZE 10000 +#define MEMMULT 11 +static const size_t maxMemory = (sizeof(size_t)==4) ? (2 GB - 64 MB) : (size_t)(3 GB) * MEMMULT; + +#define NOISELENGTH 32 +#define PRIME1 2654435761U +#define PRIME2 2246822519U + +#define MINRATIO 4 + + +/* ************************************* +* console display +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static unsigned g_displayLevel = 2; /* 0 : no display; 1: errors; 2: default; 4: full information */ +void DiB_setDisplayLevel(unsigned l) { g_displayLevel=l; } + +#define DISPLAYUPDATE(l, ...) if (g_displayLevel>=l) { \ + if (DiB_GetMilliSpan(g_time) > refreshRate) \ + { g_time = clock(); DISPLAY(__VA_ARGS__); \ + if (g_displayLevel>=4) fflush(stdout); } } +static const unsigned refreshRate = 300; +static clock_t g_time = 0; + +void DiB_printHex(U32 dlevel, const void* ptr, size_t length) +{ + const BYTE* const b = (const BYTE*)ptr; + size_t u; + for (u=0; u126) c = '.'; /* non-printable char */ + DISPLAYLEVEL(dlevel, "%c", c); + } +} + + +/* ************************************* +* Exceptions +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) \ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAYLEVEL(1, "Error %i : ", error); \ + DISPLAYLEVEL(1, __VA_ARGS__); \ + DISPLAYLEVEL(1, "\n"); \ + exit(error); \ +} + + +/* ******************************************************** +* Helper functions +**********************************************************/ +unsigned DiB_versionNumber (void) { return DiB_VERSION_NUMBER; } + +static unsigned DiB_GetMilliSpan(clock_t nPrevious) +{ + clock_t nCurrent = clock(); + unsigned nSpan = (unsigned)(((nCurrent - nPrevious) * 1000) / CLOCKS_PER_SEC); + return nSpan; +} + + +/* ******************************************************** +* File related operations +**********************************************************/ +static unsigned long long DiB_getFileSize(const char* infilename) +{ + int r; +#if defined(_MSC_VER) + struct _stat64 statbuf; + r = _stat64(infilename, &statbuf); +#else + struct stat statbuf; + r = stat(infilename, &statbuf); +#endif + if (r || !S_ISREG(statbuf.st_mode)) return 0; /* No good... */ + return (unsigned long long)statbuf.st_size; +} + + +static unsigned long long DiB_getTotalFileSize(const char** fileNamesTable, unsigned nbFiles) +{ + unsigned long long total = 0; + unsigned n; + for (n=0; n bufferSize-pos) fileSize = 0; /* stop there, not enough memory to load all files */ + readSize = fread(buff+pos, 1, (size_t)fileSize, f); + if (readSize != (size_t)fileSize) EXM_THROW(11, "could not read %s", fileNamesTable[n]); + pos += readSize; + fileSizes[n] = (size_t)fileSize; + fclose(f); + } +} + + +/*-******************************************************** +* Dictionary training functions +**********************************************************/ +static size_t DiB_read_ARCH(const void* p) { size_t r; memcpy(&r, p, sizeof(r)); return r; } + +static unsigned DiB_NbCommonBytes (register size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanForward64( &r, (U64)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + _BitScanForward( &r, (U32)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } } +} + + +/*! DiB_count() : + Count the nb of common bytes between 2 pointers. + Note : this function presumes end of buffer followed by noisy guard band. +*/ +static size_t DiB_count(const void* pIn, const void* pMatch) +{ + const char* const pStart = (const char*)pIn; + for (;;) { + size_t diff = DiB_read_ARCH(pMatch) ^ DiB_read_ARCH(pIn); + if (!diff) { pIn = (const char*)pIn+sizeof(size_t); pMatch = (const char*)pMatch+sizeof(size_t); continue; } + pIn = (const char*)pIn+DiB_NbCommonBytes(diff); + return (size_t)((const char*)pIn - pStart); + } +} + + +typedef struct { + U32 pos; + U32 length; + U32 savings; +} dictItem; + +void DiB_initDictItem(dictItem* d) +{ + d->pos = 1; + d->length = 0; + d->savings = (U32)(-1); +} + + +#define LLIMIT 64 /* heuristic determined experimentally */ +#define MINMATCHLENGTH 7 /* heuristic determined experimentally */ +static dictItem DiB_analyzePos( + BYTE* doneMarks, + const saidx_t* suffix, U32 start, + const void* buffer, U32 minRatio) +{ + U32 lengthList[LLIMIT] = {0}; + U32 cumulLength[LLIMIT] = {0}; + U32 savings[LLIMIT] = {0}; + const BYTE* b = (const BYTE*)buffer; + size_t length; + size_t maxLength = LLIMIT; + size_t pos = suffix[start]; + U32 end = start; + dictItem solution; + + + /* init */ + memset(&solution, 0, sizeof(solution)); + doneMarks[pos] = 1; + + /* trivial repetition cases */ + if ( (MEM_read16(b+pos+0) == MEM_read16(b+pos+2)) + ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) + ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { + /* skip and mark segment */ + U16 u16 = MEM_read16(b+pos+4); + U32 u, e = 6; + while (MEM_read16(b+pos+e) == u16) e+=2 ; + if (b[pos+e] == b[pos+e-1]) e++; + for (u=1; u=MINMATCHLENGTH); + + /* look backward */ + do { + length = DiB_count(b + pos, b + *(suffix+start-1)); + if (length >=MINMATCHLENGTH) start--; + } while(length >= MINMATCHLENGTH); + + /* exit if not found a minimum nb of repetitions */ + if (end-start < minRatio) { + U32 idx; + for(idx=start; idx= %u at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos); + DISPLAYLEVEL(4, "\n"); + + for (searchLength = MINMATCHLENGTH ; ; searchLength++) { + BYTE currentChar = 0; + U32 currentCount = 0; + U32 currentID = refinedStart; + U32 id; + U32 selectedCount = 0; + U32 selectedID = currentID; + for (id =refinedStart; id < refinedEnd; id++) { + if (b[ suffix[id] + searchLength] != currentChar) { + if (currentCount > selectedCount) { + selectedCount = currentCount; + selectedID = currentID; + } + currentID = id; + currentChar = b[ suffix[id] + searchLength]; + currentCount = 0; + } + currentCount ++; + } + if (currentCount > selectedCount) { /* for last */ + selectedCount = currentCount; + selectedID = currentID; + } + + if (selectedCount < minRatio) + break; + //DISPLAYLEVEL(4, "best char at length %u: %02X (seen %u times) (pos %u) \n", searchLength+1, selectedChar, selectedCount, selectedRef); + refinedStart = selectedID; + refinedEnd = refinedStart + selectedCount; + } + + /* evaluate gain based on new ref */ + start = refinedStart; + pos = suffix[refinedStart]; + end = start; + memset(lengthList, 0, sizeof(lengthList)); + + /* look forward */ + do { + end++; + length = DiB_count(b + pos, b + suffix[end]); + if (length >= LLIMIT) length = LLIMIT-1; + lengthList[length]++; + } while (length >=MINMATCHLENGTH); + + /* look backward */ + do { + length = DiB_count(b + pos, b + suffix[start-1]); + if (length >= LLIMIT) length = LLIMIT-1; + lengthList[length]++; + if (length >=MINMATCHLENGTH) start--; + } while(length >= MINMATCHLENGTH); + + /* largest useful length */ + memset(cumulLength, 0, sizeof(cumulLength)); + cumulLength[maxLength-1] = lengthList[maxLength-1]; + for (i=maxLength-2; i>=0; i--) + cumulLength[i] = cumulLength[i+1] + lengthList[i]; + + for (i=LLIMIT-1; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break; + maxLength = i; + + /* reduce maxLength in case of final into repetitive data */ + { + U32 l = maxLength; + BYTE c = b[pos + maxLength-1]; + while (b[pos+l-2]==c) l--; + maxLength = l; + } + if (maxLength < MINMATCHLENGTH) return solution; /* skip : no long-enough solution */ + + /* calculate savings */ + savings[5] = 0; + for (i=MINMATCHLENGTH; i<=(int)maxLength; i++) + savings[i] = savings[i-1] + (lengthList[i] * (i-3)); + + DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n", + (U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength); + + solution.pos = pos; + solution.length = maxLength; + solution.savings = savings[maxLength]; + + /* mark positions done */ + { + U32 id; + U32 testedPos; + for (id=start; id solution.length) length = solution.length; + } + pEnd = testedPos + length; + for (p=testedPos; ppos; + const U32 max = elt.pos + (elt.length-1); + + /* tail overlap */ + U32 u; for (u=1; u elt.pos) && (table[u].pos < max)) { /* overlap */ + /* append */ + U32 addedLength = table[u].pos - elt.pos; + table[u].length += addedLength; + table[u].pos = elt.pos; + table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ + table[u].savings += elt.length / 8; /* rough approx */ + elt = table[u]; + while ((u>1) && (table[u-1].savings < elt.savings)) + table[u] = table[u-1], u--; + table[u] = elt; + return u; + } } + + /* front overlap */ + for (u=1; u elt.pos) && (table[u].pos < elt.pos)) { /* overlap */ + /* append */ + int addedLength = (elt.pos + elt.length) - (table[u].pos + table[u].length); + table[u].savings += elt.length / 8; /* rough approx */ + if (addedLength > 0) { /* otherwise, already included */ + table[u].length += addedLength; + table[u].savings += elt.savings * addedLength / elt.length; /* rough approx */ + } + elt = table[u]; + while ((u>1) && (table[u-1].savings < elt.savings)) + table[u] = table[u-1], u--; + table[u] = elt; + return u; + } } + + return 0; +} + + +static void DiB_removeDictItem(dictItem* table, U32 id) +{ + /* convention : first element is nb of elts */ + U32 max = table->pos; + U32 u; + if (!id) return; /* protection, should never happen */ + for (u=id; upos--; +} + + +static void DiB_insertDictItem(dictItem* table, U32 maxSize, dictItem elt) +{ + /* merge if possible */ + U32 mergeId = DiB_checkMerge(table, elt, 0); + if (mergeId) { + U32 newMerge = 1; + while (newMerge) { + newMerge = DiB_checkMerge(table, table[mergeId], mergeId); + if (newMerge) DiB_removeDictItem(table, mergeId); + mergeId = newMerge; + } + return; + } + + /* insert */ + { + U32 current; + U32 nextElt = table->pos; + if (nextElt >= maxSize) nextElt = maxSize-1; + current = nextElt-1; + while (table[current].savings < elt.savings) { + table[current+1] = table[current]; + current--; + } + table[current+1] = elt; + table->pos = nextElt+1; + } +} + + +static U32 DiB_dictSize(const dictItem* dictList) +{ + U32 u, dictSize = 0; + for (u=1; u> shiftRatio; + saint_t errorCode; + + /* init */ + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + if (!suffix0 || !reverseSuffix || !doneMarks || !filePos) + EXM_THROW(1, "not enough memory for DiB_trainBuffer"); + if (minRatio < MINRATIO) minRatio = MINRATIO; + memset(doneMarks, 0, bufferSize+16); + + /* sort */ + DISPLAYLEVEL(2, "sorting %s ...\n", displayName); + errorCode = divsufsort((const sauchar_t*)buffer, suffix, bufferSize); + if (errorCode != 0) EXM_THROW(2, "sort failed"); + suffix[bufferSize] = bufferSize; /* leads into noise */ + suffix0[0] = bufferSize; /* leads into noise */ + { + /* build reverse suffix sort */ + size_t pos; + for (pos=0; pos < bufferSize; pos++) + reverseSuffix[suffix[pos]] = pos; + /* build file pos */ + filePos[0] = 0; + for (pos=1; pospos; /* convention : nb of useful elts within dictList */ + U32 currentSize = 0; + U32 n; for (n=1; n maxDictSize) break; + } + dictList->pos = n; + } + + free(suffix0); + free(reverseSuffix); + free(doneMarks); + free(filePos); +} + + +static size_t DiB_findMaxMem(unsigned long long requiredMem) +{ + size_t step = 8 MB; + void* testmem = NULL; + + requiredMem = (((requiredMem >> 23) + 1) << 23); + requiredMem += 2 * step; + if (requiredMem > maxMemory) requiredMem = maxMemory; + + while (!testmem) { + requiredMem -= step; + testmem = malloc((size_t)requiredMem); + } + + free(testmem); + return (size_t)(requiredMem - step); +} + + +static void DiB_fillNoise(void* buffer, size_t length) +{ + unsigned acc = PRIME1; + size_t p=0;; + + for (p=0; p> 21); + } +} + + +typedef struct +{ + ZSTD_CCtx* ref; + ZSTD_CCtx* zc; + void* workPlace; /* must be BLOCKSIZE allocated */ +} EStats_ress_t; + + +static void DiB_countEStats(EStats_ress_t esr, + U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, + const void* src, size_t srcSize) +{ + const BYTE* bytePtr; + const U32* u32Ptr; + + if (srcSize > BLOCKSIZE) srcSize = BLOCKSIZE; /* protection vs large samples */ + ZSTD_copyCCtx(esr.zc, esr.ref); + ZSTD_compressBlock(esr.zc, esr.workPlace, BLOCKSIZE, src, srcSize); + + /* count stats */ + for(bytePtr = esr.zc->seqStore.litStart; bytePtr < esr.zc->seqStore.lit; bytePtr++) + countLit[*bytePtr]++; + for(u32Ptr = esr.zc->seqStore.offsetStart; u32Ptr < esr.zc->seqStore.offset; u32Ptr++) { + BYTE offcode = (BYTE)ZSTD_highbit(*u32Ptr) + 1; + if (*u32Ptr==0) offcode=0; + offsetcodeCount[offcode]++; + } + for(bytePtr = esr.zc->seqStore.matchLengthStart; bytePtr < esr.zc->seqStore.matchLength; bytePtr++) + matchlengthCount[*bytePtr]++; + for(bytePtr = esr.zc->seqStore.litLengthStart; bytePtr < esr.zc->seqStore.litLength; bytePtr++) + litlengthCount[*bytePtr]++; +} + + +#define OFFCODE_MAX 18 +static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, + const void* srcBuffer, size_t* fileSizes, unsigned nbFiles, + const void* dictBuffer, size_t dictBufferSize) +{ + U32 countLit[256]; + U32 offcodeCount[MaxOff+1]; + HUF_CREATE_STATIC_CTABLE(hufTable, 255); + short offcodeNCount[MaxOff+1]; + U32 matchLengthCount[MaxML+1]; + short matchLengthNCount[MaxML+1]; + U32 litlengthCount[MaxLL+1]; + short litlengthNCount[MaxLL+1]; + EStats_ress_t esr; + ZSTD_parameters params; + U32 u, huffLog = 12, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total; + size_t pos = 0, errorCode; + size_t eSize = 0; + + /* init */ + for (u=0; u<256; u++) countLit[u]=1; /* any character must be described */ + for (u=0; u<=OFFCODE_MAX; u++) offcodeCount[u]=1; + for (u=0; u<=MaxML; u++) matchLengthCount[u]=1; + for (u=0; u<=MaxLL; u++) litlengthCount[u]=1; + esr.ref = ZSTD_createCCtx(); + esr.zc = ZSTD_createCCtx(); + esr.workPlace = malloc(BLOCKSIZE); + if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory"); + params = ZSTD_getParams(5, dictBufferSize + 15 KB); + params.strategy = ZSTD_greedy; + ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params); + + /* collect stats on all files */ + for (u=0; u totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad; + if (benchedSize < totalSizeToLoad) + DISPLAY("Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20)); + + /* Memory allocation & restrictions */ + srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */ + if ((!fileSizes) || (!srcBuffer) || (!dictList)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ + DiB_initDictItem(dictList); + + /* Load input buffer */ + DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); + DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */ + + /* Train */ + snprintf (mfName, sizeof(mfName), " %u files", nbFiles); + if (nbFiles > 1) displayName = mfName; + else displayName = fileNamesTable[0]; + + DiB_trainBuffer(dictList, dictListSize, + srcBuffer, benchedSize, + displayName, + fileSizes, nbFiles, maxDictSize, + shiftRatio); + + /* display best matches */ + if (g_displayLevel>= 3) { + const U32 nb = 25; + U32 u; + U32 dictContentSize = DiB_dictSize(dictList); + DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); + DISPLAYLEVEL(3, "list %u best segments \n", nb); + for (u=1; u<=nb; u++) { + U32 p = dictList[u].pos; + U32 l = dictList[u].length; + U32 d = MIN(40, l); + DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", + u, l, p, dictList[u].savings); + DiB_printHex(3, (char*)srcBuffer+p, d); + DISPLAYLEVEL(3, "| \n"); + } } + + /* create dictionary */ + { + void* dictContent; + U32 dictContentSize = DiB_dictSize(dictList); + void* dictHeader; + size_t dictHeaderSize, hSize; + BYTE* ptr; + U32 u; + + /* build dict */ + #define EBSIZE (2 KB) + dictHeaderSize = EBSIZE; + dictHeader = malloc(dictHeaderSize); + dictContent = malloc(dictContentSize); + if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory"); + + /* build dict content */ + ptr = (BYTE*)dictContent + dictContentSize; + + for (u=1; upos; u++) { + U32 l = dictList[u].length; + ptr -= l; + memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l); + } + + /* dictionary header */ + MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC); + hSize = 4; + dictHeaderSize -= 4; + + /* entropic tables */ + DISPLAYLEVEL(2, "statistics ... \n"); + hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize, + srcBuffer, fileSizes, nbFiles, + dictContent, dictContentSize); + + /* save dict */ + { + size_t dictSize = hSize + dictContentSize; + DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); + DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize); + //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only + } + /* clean */ + free(dictHeader); + free(dictContent); + } + + /* clean up */ + free(srcBuffer); + free(fileSizes); + free(dictList); + return 0; +} + diff --git a/dictBuilder/dictBuilder.h b/dictBuilder/dictBuilder.h new file mode 100644 index 00000000..76efed29 --- /dev/null +++ b/dictBuilder/dictBuilder.h @@ -0,0 +1,44 @@ +/* + dictBuilder.h + Copyright (C) Yann Collet 2016 + + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + You can contact the author at : + - zstd source repository : https://github.com/Cyan4973/zstd + - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +/* ************************************* +* Version +***************************************/ +#define DiB_VERSION_MAJOR 0 /* for breaking interface changes */ +#define DiB_VERSION_MINOR 0 /* for new (non-breaking) interface capabilities */ +#define DiB_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */ +#define DiB_VERSION_NUMBER (DiB_VERSION_MAJOR *100*100 + DiB_VERSION_MINOR *100 + DiB_VERSION_RELEASE) +unsigned DiB_versionNumber (void); + + +/* ************************************* +* Main functions +***************************************/ +int DiB_trainFiles(const char* dictFileName, unsigned maxDictSize, + const char** fileNamesTable, unsigned nbFiles, + unsigned selectionLevel); + + +void DiB_setDisplayLevel(unsigned l); diff --git a/dictBuilder/divsufsort.c b/dictBuilder/divsufsort.c new file mode 100644 index 00000000..9f64b4f4 --- /dev/null +++ b/dictBuilder/divsufsort.c @@ -0,0 +1,398 @@ +/* + * divsufsort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" +#ifdef _OPENMP +# include +#endif + + +/*- Private Functions -*/ + +/* Sorts suffixes of type B*. */ +static +saidx_t +sort_typeBstar(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n) { + saidx_t *PAb, *ISAb, *buf; +#ifdef _OPENMP + saidx_t *curbuf; + saidx_t l; +#endif + saidx_t i, j, k, t, m, bufsize; + saint_t c0, c1; +#ifdef _OPENMP + saint_t d0, d1; + int tmp; +#endif + + /* Initialize bucket arrays. */ + for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; } + for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; } + + /* Count the number of occurrences of the first one or two characters of each + type A, B and B* suffix. Moreover, store the beginning position of all + type B* suffixes into the array SA. */ + for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) { + /* type A suffix. */ + do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1)); + if(0 <= i) { + /* type B* suffix. */ + ++BUCKET_BSTAR(c0, c1); + SA[--m] = i; + /* type B suffix. */ + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { + ++BUCKET_B(c0, c1); + } + } + } + m = n - m; +/* +note: + A type B* suffix is lexicographically smaller than a type B suffix that + begins with the same first two characters. +*/ + + /* Calculate the index of start/end point of each bucket. */ + for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) { + t = i + BUCKET_A(c0); + BUCKET_A(c0) = i + j; /* start point */ + i = t + BUCKET_B(c0, c0); + for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) { + j += BUCKET_BSTAR(c0, c1); + BUCKET_BSTAR(c0, c1) = j; /* end point */ + i += BUCKET_B(c0, c1); + } + } + + if(0 < m) { + /* Sort the type B* suffixes by their first two characters. */ + PAb = SA + n - m; ISAb = SA + m; + for(i = m - 2; 0 <= i; --i) { + t = PAb[i], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = i; + } + t = PAb[m - 1], c0 = T[t], c1 = T[t + 1]; + SA[--BUCKET_BSTAR(c0, c1)] = m - 1; + + /* Sort the type B* substrings using sssort. */ +#ifdef _OPENMP + tmp = omp_get_max_threads(); + buf = SA + m, bufsize = (n - (2 * m)) / tmp; + c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m; +#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp) + { + tmp = omp_get_thread_num(); + curbuf = buf + tmp * bufsize; + k = 0; + for(;;) { + #pragma omp critical(sssort_lock) + { + if(0 < (l = j)) { + d0 = c0, d1 = c1; + do { + k = BUCKET_BSTAR(d0, d1); + if(--d1 <= d0) { + d1 = ALPHABET_SIZE - 1; + if(--d0 < 0) { break; } + } + } while(((l - k) <= 1) && (0 < (l = k))); + c0 = d0, c1 = d1, j = k; + } + } + if(l == 0) { break; } + sssort(T, PAb, SA + k, SA + l, + curbuf, bufsize, 2, n, *(SA + k) == (m - 1)); + } + } +#else + buf = SA + m, bufsize = n - (2 * m); + for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) { + for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) { + i = BUCKET_BSTAR(c0, c1); + if(1 < (j - i)) { + sssort(T, PAb, SA + i, SA + j, + buf, bufsize, 2, n, *(SA + i) == (m - 1)); + } + } + } +#endif + + /* Compute ranks of type B* substrings. */ + for(i = m - 1; 0 <= i; --i) { + if(0 <= SA[i]) { + j = i; + do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i])); + SA[i + 1] = i - j; + if(i <= 0) { break; } + } + j = i; + do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0); + ISAb[SA[i]] = j; + } + + /* Construct the inverse suffix array of type B* suffixes using trsort. */ + trsort(ISAb, SA, m, 1); + + /* Set the sorted order of tyoe B* suffixes. */ + for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } + if(0 <= i) { + t = i; + for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { } + SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t; + } + } + + /* Calculate the index of start/end point of each bucket. */ + BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */ + for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) { + i = BUCKET_A(c0 + 1) - 1; + for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) { + t = i - BUCKET_B(c0, c1); + BUCKET_B(c0, c1) = i; /* end point */ + + /* Move all type B* suffixes to the correct position. */ + for(i = t, j = BUCKET_BSTAR(c0, c1); + j <= k; + --i, --k) { SA[i] = SA[k]; } + } + BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */ + BUCKET_B(c0, c0) = i; /* end point */ + } + } + + return m; +} + +/* Constructs the suffix array by using the sorted order of type B* suffixes. */ +static +void +construct_SA(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n, saidx_t m) { + saidx_t *i, *j, *k; + saidx_t s; + saint_t c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + *j = ~s; + c0 = T[--s]; + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); + *k-- = s; + } else { + assert(((s == 0) && (T[s] == c1)) || (s < 0)); + *j = ~s; + } + } + } + } + + /* Construct the suffix array by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1); + /* Scan the suffix array from left to right. */ + for(i = SA, j = SA + n; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + if((s == 0) || (T[s - 1] < c0)) { s = ~s; } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else { + assert(s < 0); + *i = ~s; + } + } +} + +/* Constructs the burrows-wheeler transformed string directly + by using the sorted order of type B* suffixes. */ +static +saidx_t +construct_BWT(const sauchar_t *T, saidx_t *SA, + saidx_t *bucket_A, saidx_t *bucket_B, + saidx_t n, saidx_t m) { + saidx_t *i, *j, *k, *orig; + saidx_t s; + saint_t c0, c1, c2; + + if(0 < m) { + /* Construct the sorted order of type B suffixes by using + the sorted order of type B* suffixes. */ + for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) { + /* Scan the suffix array from right to left. */ + for(i = SA + BUCKET_BSTAR(c1, c1 + 1), + j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1; + i <= j; + --j) { + if(0 < (s = *j)) { + assert(T[s] == c1); + assert(((s + 1) < n) && (T[s] <= T[s + 1])); + assert(T[s - 1] <= T[s]); + c0 = T[--s]; + *j = ~((saidx_t)c0); + if((0 < s) && (T[s - 1] > c0)) { s = ~s; } + if(c0 != c2) { + if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; } + k = SA + BUCKET_B(c2 = c0, c1); + } + assert(k < j); + *k-- = s; + } else if(s != 0) { + *j = ~s; +#ifndef NDEBUG + } else { + assert(T[s] == c1); +#endif + } + } + } + } + + /* Construct the BWTed string by using + the sorted order of type B suffixes. */ + k = SA + BUCKET_A(c2 = T[n - 1]); + *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1); + /* Scan the suffix array from left to right. */ + for(i = SA, j = SA + n, orig = SA; i < j; ++i) { + if(0 < (s = *i)) { + assert(T[s - 1] >= T[s]); + c0 = T[--s]; + *i = c0; + if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); } + if(c0 != c2) { + BUCKET_A(c2) = k - SA; + k = SA + BUCKET_A(c2 = c0); + } + assert(i < k); + *k++ = s; + } else if(s != 0) { + *i = ~s; + } else { + orig = i; + } + } + + return orig - SA; +} + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +saint_t +divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) { + saidx_t *bucket_A, *bucket_B; + saidx_t m; + saint_t err = 0; + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; } + else if(n == 0) { return 0; } + else if(n == 1) { SA[0] = 0; return 0; } + else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; } + + bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); + bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); + + /* Suffixsort. */ + if((bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, SA, bucket_A, bucket_B, n); + construct_SA(T, SA, bucket_A, bucket_B, n, m); + } else { + err = -2; + } + + free(bucket_B); + free(bucket_A); + + return err; +} + +saidx_t +divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) { + saidx_t *B; + saidx_t *bucket_A, *bucket_B; + saidx_t m, pidx, i; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0)) { return -1; } + else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; } + + if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); } + bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t)); + bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t)); + + /* Burrows-Wheeler Transform. */ + if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) { + m = sort_typeBstar(T, B, bucket_A, bucket_B, n); + pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m); + + /* Copy to output string. */ + U[0] = T[n - 1]; + for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; } + for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; } + pidx += 1; + } else { + pidx = -2; + } + + free(bucket_B); + free(bucket_A); + if(A == NULL) { free(B); } + + return pidx; +} + +const char * +divsufsort_version(void) { + return PROJECT_VERSION_FULL; +} diff --git a/dictBuilder/divsufsort.h b/dictBuilder/divsufsort.h new file mode 100644 index 00000000..6d3e6487 --- /dev/null +++ b/dictBuilder/divsufsort.h @@ -0,0 +1,180 @@ +/* + * divsufsort.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_H +#define _DIVSUFSORT_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include + +#ifndef DIVSUFSORT_API +# ifdef DIVSUFSORT_BUILD_DLL +# define DIVSUFSORT_API +# else +# define DIVSUFSORT_API +# endif +#endif + +/*- Datatypes -*/ +#ifndef SAUCHAR_T +#define SAUCHAR_T +typedef uint8_t sauchar_t; +#endif /* SAUCHAR_T */ +#ifndef SAINT_T +#define SAINT_T +typedef int32_t saint_t; +#endif /* SAINT_T */ +#ifndef SAIDX_T +#define SAIDX_T +typedef int32_t saidx_t; +#endif /* SAIDX_T */ +#ifndef PRIdSAINT_T +#define PRIdSAINT_T PRId32 +#endif /* PRIdSAINT_T */ +#ifndef PRIdSAIDX_T +#define PRIdSAIDX_T PRId32 +#endif /* PRIdSAIDX_T */ + + +/*- Prototypes -*/ + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n); + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saidx_t +divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n); + +/** + * Returns the version of the divsufsort library. + * @return The version number string. + */ +DIVSUFSORT_API +const char * +divsufsort_version(void); + + +/** + * Constructs the burrows-wheeler transformed string of a given string and suffix array. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param SA[0..n-1] The suffix array. (can be NULL) + * @param n The length of the given string. + * @param idx The output primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *SA /* can NULL */, + saidx_t n, saidx_t *idx); + +/** + * Inverse BW-transforms a given BWTed string. + * @param T[0..n-1] The input string. + * @param U[0..n-1] The output string. (can be T) + * @param A[0..n-1] The temporary array. (can be NULL) + * @param n The length of the given string. + * @param idx The primary index. + * @return 0 if no error occurred, -1 or -2 otherwise. + */ +DIVSUFSORT_API +saint_t +inverse_bw_transform(const sauchar_t *T, sauchar_t *U, + saidx_t *A /* can NULL */, + saidx_t n, saidx_t idx); + +/** + * Checks the correctness of a given suffix array. + * @param T[0..n-1] The input string. + * @param SA[0..n-1] The input suffix array. + * @param n The length of the given string. + * @param verbose The verbose mode. + * @return 0 if no error occurred. + */ +DIVSUFSORT_API +saint_t +sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose); + +/** + * Search for the pattern P in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param P[0..Psize-1] The input pattern string. + * @param Psize The length of the given pattern string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param idx The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx_t +sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *left); + +/** + * Search for the character c in the string T. + * @param T[0..Tsize-1] The input string. + * @param Tsize The length of the given string. + * @param SA[0..SAsize-1] The input suffix array. + * @param SAsize The length of the given suffix array. + * @param c The input character. + * @param idx The output index. + * @return The count of matches if no error occurred, -1 otherwise. + */ +DIVSUFSORT_API +saidx_t +sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *left); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_H */ diff --git a/dictBuilder/divsufsort_private.h b/dictBuilder/divsufsort_private.h new file mode 100644 index 00000000..dc35063f --- /dev/null +++ b/dictBuilder/divsufsort_private.h @@ -0,0 +1,208 @@ +/* + * divsufsort_private.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DIVSUFSORT_PRIVATE_H +#define _DIVSUFSORT_PRIVATE_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/* ************************* +* Includes +***************************/ +#include +#include /* unconditional */ +#include +#include "config.h" /* unconditional */ + + +#if HAVE_STRING_H +# include +#endif +#if HAVE_MEMORY_H +# include +#endif +#if HAVE_STDDEF_H +# include +#endif +#if HAVE_STRINGS_H +# include +#endif +#if HAVE_INTTYPES_H +# include +#else +# if HAVE_STDINT_H +# include +# endif +#endif +#if defined(BUILD_DIVSUFSORT64) +# include "divsufsort64.h" +# ifndef SAIDX_T +# define SAIDX_T +# define saidx_t saidx64_t +# endif /* SAIDX_T */ +# ifndef PRIdSAIDX_T +# define PRIdSAIDX_T PRIdSAIDX64_T +# endif /* PRIdSAIDX_T */ +# define divsufsort divsufsort64 +# define divbwt divbwt64 +# define divsufsort_version divsufsort64_version +# define bw_transform bw_transform64 +# define inverse_bw_transform inverse_bw_transform64 +# define sufcheck sufcheck64 +# define sa_search sa_search64 +# define sa_simplesearch sa_simplesearch64 +# define sssort sssort64 +# define trsort trsort64 +#else +# include "divsufsort.h" +#endif + + +/*- Constants -*/ +#if !defined(UINT8_MAX) +# define UINT8_MAX (255) +#endif /* UINT8_MAX */ +#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1) +# undef ALPHABET_SIZE +#endif +#if !defined(ALPHABET_SIZE) +# define ALPHABET_SIZE (UINT8_MAX + 1) +#endif +/* for divsufsort.c */ +#define BUCKET_A_SIZE (ALPHABET_SIZE) +#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE) +/* for sssort.c */ +#if defined(SS_INSERTIONSORT_THRESHOLD) +# if SS_INSERTIONSORT_THRESHOLD < 1 +# undef SS_INSERTIONSORT_THRESHOLD +# define SS_INSERTIONSORT_THRESHOLD (1) +# endif +#else +# define SS_INSERTIONSORT_THRESHOLD (8) +#endif +#if defined(SS_BLOCKSIZE) +# if SS_BLOCKSIZE < 0 +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (0) +# elif 32768 <= SS_BLOCKSIZE +# undef SS_BLOCKSIZE +# define SS_BLOCKSIZE (32767) +# endif +#else +# define SS_BLOCKSIZE (1024) +#endif +/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */ +#if SS_BLOCKSIZE == 0 +# if defined(BUILD_DIVSUFSORT64) +# define SS_MISORT_STACKSIZE (96) +# else +# define SS_MISORT_STACKSIZE (64) +# endif +#elif SS_BLOCKSIZE <= 4096 +# define SS_MISORT_STACKSIZE (16) +#else +# define SS_MISORT_STACKSIZE (24) +#endif +#if defined(BUILD_DIVSUFSORT64) +# define SS_SMERGE_STACKSIZE (64) +#else +# define SS_SMERGE_STACKSIZE (32) +#endif +/* for trsort.c */ +#define TR_INSERTIONSORT_THRESHOLD (8) +#if defined(BUILD_DIVSUFSORT64) +# define TR_STACKSIZE (96) +#else +# define TR_STACKSIZE (64) +#endif + + +/*- Macros -*/ +#ifndef SWAP +# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0) +#endif /* SWAP */ +#ifndef MIN +# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b)) +#endif /* MIN */ +#ifndef MAX +# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b)) +#endif /* MAX */ +#define STACK_PUSH(_a, _b, _c, _d)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize++].d = (_d);\ + } while(0) +#define STACK_PUSH5(_a, _b, _c, _d, _e)\ + do {\ + assert(ssize < STACK_SIZE);\ + stack[ssize].a = (_a), stack[ssize].b = (_b),\ + stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\ + } while(0) +#define STACK_POP(_a, _b, _c, _d)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d;\ + } while(0) +#define STACK_POP5(_a, _b, _c, _d, _e)\ + do {\ + assert(0 <= ssize);\ + if(ssize == 0) { return; }\ + (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\ + (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\ + } while(0) +/* for divsufsort.c */ +#define BUCKET_A(_c0) bucket_A[(_c0)] +#if ALPHABET_SIZE == 256 +#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)]) +#else +#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)]) +#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)]) +#endif + + +/*- Private Prototypes -*/ +/* sssort.c */ +void +sssort(const sauchar_t *Td, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t *buf, saidx_t bufsize, + saidx_t depth, saidx_t n, saint_t lastsuffix); +/* trsort.c */ +void +trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _DIVSUFSORT_PRIVATE_H */ diff --git a/dictBuilder/lfs.h b/dictBuilder/lfs.h new file mode 100644 index 00000000..7ef88f0b --- /dev/null +++ b/dictBuilder/lfs.h @@ -0,0 +1,56 @@ +/* + * lfs.h for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LFS_H +#define _LFS_H 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef __STRICT_ANSI__ +# define LFS_OFF_T off_t +# define LFS_FOPEN fopen +# define LFS_FTELL ftello +# define LFS_FSEEK fseeko +# define LFS_PRId PRIdMAX +#else +# define LFS_OFF_T long +# define LFS_FOPEN fopen +# define LFS_FTELL ftell +# define LFS_FSEEK fseek +# define LFS_PRId "ld" +#endif +#ifndef PRIdOFF_T +# define PRIdOFF_T LFS_PRId +#endif + + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* _LFS_H */ diff --git a/dictBuilder/sssort.c b/dictBuilder/sssort.c new file mode 100644 index 00000000..4a18fd2a --- /dev/null +++ b/dictBuilder/sssort.c @@ -0,0 +1,815 @@ +/* + * sssort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Functions -*/ + +static const saint_t lg_table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +saint_t +ss_ilg(saidx_t n) { +#if SS_BLOCKSIZE == 0 +# if defined(BUILD_DIVSUFSORT64) + return (n >> 32) ? + ((n >> 48) ? + ((n >> 56) ? + 56 + lg_table[(n >> 56) & 0xff] : + 48 + lg_table[(n >> 48) & 0xff]) : + ((n >> 40) ? + 40 + lg_table[(n >> 40) & 0xff] : + 32 + lg_table[(n >> 32) & 0xff])) : + ((n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff])); +# else + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +# endif +#elif SS_BLOCKSIZE < 256 + return lg_table[n]; +#else + return (n & 0xff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]; +#endif +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + +#if SS_BLOCKSIZE != 0 + +static const saint_t sqq_table[256] = { + 0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57, 59, 61, + 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83, 84, 86, 87, 89, + 90, 91, 93, 94, 96, 97, 98, 99, 101, 102, 103, 104, 106, 107, 108, 109, +110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, +128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, +143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, +156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168, +169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180, +181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191, +192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201, +202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211, +212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221, +221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, +230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, +239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, +247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255 +}; + +static INLINE +saidx_t +ss_isqrt(saidx_t x) { + saidx_t y, e; + + if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; } + e = (x & 0xffff0000) ? + ((x & 0xff000000) ? + 24 + lg_table[(x >> 24) & 0xff] : + 16 + lg_table[(x >> 16) & 0xff]) : + ((x & 0x0000ff00) ? + 8 + lg_table[(x >> 8) & 0xff] : + 0 + lg_table[(x >> 0) & 0xff]); + + if(e >= 16) { + y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7); + if(e >= 24) { y = (y + 1 + x / y) >> 1; } + y = (y + 1 + x / y) >> 1; + } else if(e >= 8) { + y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1; + } else { + return sqq_table[x] >> 4; + } + + return (x < (y * y)) ? y - 1 : y; +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/* Compares two suffixes. */ +static INLINE +saint_t +ss_compare(const sauchar_t *T, + const saidx_t *p1, const saidx_t *p2, + saidx_t depth) { + const sauchar_t *U1, *U2, *U1n, *U2n; + + for(U1 = T + depth + *p1, + U2 = T + depth + *p2, + U1n = T + *(p1 + 1) + 2, + U2n = T + *(p2 + 1) + 2; + (U1 < U1n) && (U2 < U2n) && (*U1 == *U2); + ++U1, ++U2) { + } + + return U1 < U1n ? + (U2 < U2n ? *U1 - *U2 : 1) : + (U2 < U2n ? -1 : 0); +} + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) + +/* Insertionsort for small size groups */ +static +void +ss_insertionsort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, saidx_t depth) { + saidx_t *i, *j; + saidx_t t; + saint_t r; + + for(i = last - 2; first <= i; --i) { + for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) { + do { *(j - 1) = *j; } while((++j < last) && (*j < 0)); + if(last <= j) { break; } + } + if(r == 0) { *j = ~*j; } + *(j - 1) = t; + } +} + +#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */ + + +/*---------------------------------------------------------------------------*/ + +#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) + +static INLINE +void +ss_fixdown(const sauchar_t *Td, const saidx_t *PA, + saidx_t *SA, saidx_t i, saidx_t size) { + saidx_t j, k; + saidx_t v; + saint_t c, d, e; + + for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = Td[PA[SA[k = j++]]]; + if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +ss_heapsort(const sauchar_t *Td, const saidx_t *PA, saidx_t *SA, saidx_t size) { + saidx_t i, m; + saidx_t t; + + m = size; + if((size % 2) == 0) { + m--; + if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + ss_fixdown(Td, PA, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static INLINE +saidx_t * +ss_median3(const sauchar_t *Td, const saidx_t *PA, + saidx_t *v1, saidx_t *v2, saidx_t *v3) { + saidx_t *t; + if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); } + if(Td[PA[*v2]] > Td[PA[*v3]]) { + if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static INLINE +saidx_t * +ss_median5(const sauchar_t *Td, const saidx_t *PA, + saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) { + saidx_t *t; + if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); } + if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); } + if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); } + if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); } + if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); } + if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +saidx_t * +ss_pivot(const sauchar_t *Td, const saidx_t *PA, saidx_t *first, saidx_t *last) { + saidx_t *middle; + saidx_t t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return ss_median3(Td, PA, first, middle, last - 1); + } else { + t >>= 2; + return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = ss_median3(Td, PA, first, first + t, first + (t << 1)); + middle = ss_median3(Td, PA, middle - t, middle, middle + t); + last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1); + return ss_median3(Td, PA, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +/* Binary partition for substrings. */ +static INLINE +saidx_t * +ss_partition(const saidx_t *PA, + saidx_t *first, saidx_t *last, saidx_t depth) { + saidx_t *a, *b; + saidx_t t; + for(a = first - 1, b = last;;) { + for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; } + for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { } + if(b <= a) { break; } + t = ~*b; + *b = *a; + *a = t; + } + if(first < a) { *first = ~*first; } + return a; +} + +/* Multikey introsort for medium size groups. */ +static +void +ss_mintrosort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t depth) { +#define STACK_SIZE SS_MISORT_STACKSIZE + struct { saidx_t *a, *b, c; saint_t d; } stack[STACK_SIZE]; + const sauchar_t *Td; + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t s, t; + saint_t ssize; + saint_t limit; + saint_t v, x = 0; + + for(ssize = 0, limit = ss_ilg(last - first);;) { + + if((last - first) <= SS_INSERTIONSORT_THRESHOLD) { +#if 1 < SS_INSERTIONSORT_THRESHOLD + if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); } +#endif + STACK_POP(first, last, depth, limit); + continue; + } + + Td = T + depth; + if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); } + if(limit < 0) { + for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) { + if((x = Td[PA[*a]]) != v) { + if(1 < (a - first)) { break; } + v = x; + first = a; + } + } + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, a, depth); + } + if((a - first) <= (last - a)) { + if(1 < (a - first)) { + STACK_PUSH(a, last, depth, -1); + last = a, depth += 1, limit = ss_ilg(a - first); + } else { + first = a, limit = -1; + } + } else { + if(1 < (last - a)) { + STACK_PUSH(first, a, depth + 1, ss_ilg(a - first)); + first = a, limit = -1; + } else { + last = a, depth += 1, limit = ss_ilg(a - first); + } + } + continue; + } + + /* choose pivot */ + a = ss_pivot(Td, PA, first, last); + v = Td[PA[*a]]; + SWAP(*first, *a); + + /* partition */ + for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + + a = first + (b - a), c = last - (d - c); + b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth); + + if((a - first) <= (last - c)) { + if((last - c) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(c, last, depth, limit); + last = a; + } else if((a - first) <= (c - b)) { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + last = a; + } else { + STACK_PUSH(c, last, depth, limit); + STACK_PUSH(first, a, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } else { + if((a - first) <= (c - b)) { + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + STACK_PUSH(first, a, depth, limit); + first = c; + } else if((last - c) <= (c - b)) { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(b, c, depth + 1, ss_ilg(c - b)); + first = c; + } else { + STACK_PUSH(first, a, depth, limit); + STACK_PUSH(c, last, depth, limit); + first = b, last = c, depth += 1, limit = ss_ilg(c - b); + } + } + } else { + limit += 1; + if(Td[PA[*first] - 1] < v) { + first = ss_partition(PA, first, last, depth); + limit = ss_ilg(last - first); + } + depth += 1; + } + } +#undef STACK_SIZE +} + +#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */ + + +/*---------------------------------------------------------------------------*/ + +#if SS_BLOCKSIZE != 0 + +static INLINE +void +ss_blockswap(saidx_t *a, saidx_t *b, saidx_t n) { + saidx_t t; + for(; 0 < n; --n, ++a, ++b) { + t = *a, *a = *b, *b = t; + } +} + +static INLINE +void +ss_rotate(saidx_t *first, saidx_t *middle, saidx_t *last) { + saidx_t *a, *b, t; + saidx_t l, r; + l = middle - first, r = last - middle; + for(; (0 < l) && (0 < r);) { + if(l == r) { ss_blockswap(first, middle, l); break; } + if(l < r) { + a = last - 1, b = middle - 1; + t = *a; + do { + *a-- = *b, *b-- = *a; + if(b < first) { + *a = t; + last = a; + if((r -= l + 1) <= l) { break; } + a -= 1, b = middle - 1; + t = *a; + } + } while(1); + } else { + a = first, b = middle; + t = *a; + do { + *a++ = *b, *b++ = *a; + if(last <= b) { + *a = t; + first = a + 1; + if((l -= r + 1) <= r) { break; } + a += 1, b = middle; + t = *a; + } + } while(1); + } + } +} + + +/*---------------------------------------------------------------------------*/ + +static +void +ss_inplacemerge(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t depth) { + const saidx_t *p; + saidx_t *a, *b; + saidx_t len, half; + saint_t q, r; + saint_t x; + + for(;;) { + if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); } + else { x = 0; p = PA + *(last - 1); } + for(a = first, len = middle - first, half = len >> 1, r = -1; + 0 < len; + len = half, half >>= 1) { + b = a + half; + q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth); + if(q < 0) { + a = b + 1; + half -= (len & 1) ^ 1; + } else { + r = q; + } + } + if(a < middle) { + if(r == 0) { *a = ~*a; } + ss_rotate(a, middle, last); + last -= middle - a; + middle = a; + if(first == middle) { break; } + } + --last; + if(x != 0) { while(*--last < 0) { } } + if(middle == last) { break; } + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Merge-forward with internal buffer. */ +static +void +ss_mergeforward(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t *buf, saidx_t depth) { + saidx_t *a, *b, *c, *bufend; + saidx_t t; + saint_t r; + + bufend = buf + (middle - first) - 1; + ss_blockswap(buf, first, middle - first); + + for(t = *(a = first), b = buf, c = middle;;) { + r = ss_compare(T, PA + *b, PA + *c, depth); + if(r < 0) { + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + } else if(r > 0) { + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } else { + *c = ~*c; + do { + *a++ = *b; + if(bufend <= b) { *bufend = t; return; } + *b++ = *a; + } while(*b < 0); + + do { + *a++ = *c, *c++ = *a; + if(last <= c) { + while(b < bufend) { *a++ = *b, *b++ = *a; } + *a = *b, *b = t; + return; + } + } while(*c < 0); + } + } +} + +/* Merge-backward with internal buffer. */ +static +void +ss_mergebackward(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t *buf, saidx_t depth) { + const saidx_t *p1, *p2; + saidx_t *a, *b, *c, *bufend; + saidx_t t; + saint_t r; + saint_t x; + + bufend = buf + (last - middle) - 1; + ss_blockswap(buf, middle, last - middle); + + x = 0; + if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; } + else { p1 = PA + *bufend; } + if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; } + else { p2 = PA + *(middle - 1); } + for(t = *(a = last - 1), b = bufend, c = middle - 1;;) { + r = ss_compare(T, p1, p2, depth); + if(0 < r) { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = *b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + } else if(r < 0) { + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + else { p2 = PA + *c; } + } else { + if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; } + *a-- = ~*b; + if(b <= buf) { *buf = t; break; } + *b-- = *a; + if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; } + *a-- = *c, *c-- = *a; + if(c < first) { + while(buf < b) { *a-- = *b, *b-- = *a; } + *a = *b, *b = t; + break; + } + if(*b < 0) { p1 = PA + ~*b; x |= 1; } + else { p1 = PA + *b; } + if(*c < 0) { p2 = PA + ~*c; x |= 2; } + else { p2 = PA + *c; } + } + } +} + +/* D&C based merge. */ +static +void +ss_swapmerge(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t *buf, saidx_t bufsize, saidx_t depth) { +#define STACK_SIZE SS_SMERGE_STACKSIZE +#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a))) +#define MERGE_CHECK(a, b, c)\ + do {\ + if(((c) & 1) ||\ + (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\ + *(a) = ~*(a);\ + }\ + if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\ + *(b) = ~*(b);\ + }\ + } while(0) + struct { saidx_t *a, *b, *c; saint_t d; } stack[STACK_SIZE]; + saidx_t *l, *r, *lm, *rm; + saidx_t m, len, half; + saint_t ssize; + saint_t check, next; + + for(check = 0, ssize = 0;;) { + if((last - middle) <= bufsize) { + if((first < middle) && (middle < last)) { + ss_mergebackward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + if((middle - first) <= bufsize) { + if(first < middle) { + ss_mergeforward(T, PA, first, middle, last, buf, depth); + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + continue; + } + + for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1; + 0 < len; + len = half, half >>= 1) { + if(ss_compare(T, PA + GETIDX(*(middle + m + half)), + PA + GETIDX(*(middle - m - half - 1)), depth) < 0) { + m += half + 1; + half -= (len & 1) ^ 1; + } + } + + if(0 < m) { + lm = middle - m, rm = middle + m; + ss_blockswap(lm, middle, m); + l = r = middle, next = 0; + if(rm < last) { + if(*rm < 0) { + *rm = ~*rm; + if(first < lm) { for(; *--l < 0;) { } next |= 4; } + next |= 1; + } else if(first < lm) { + for(; *r < 0; ++r) { } + next |= 2; + } + } + + if((l - first) <= (last - r)) { + STACK_PUSH(r, rm, last, (next & 3) | (check & 4)); + middle = lm, last = l, check = (check & 3) | (next & 4); + } else { + if((next & 2) && (r == middle)) { next ^= 6; } + STACK_PUSH(first, lm, l, (check & 3) | (next & 4)); + first = r, middle = rm, check = (next & 3) | (check & 4); + } + } else { + if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) { + *middle = ~*middle; + } + MERGE_CHECK(first, last, check); + STACK_POP(first, middle, last, check); + } + } +#undef STACK_SIZE +} + +#endif /* SS_BLOCKSIZE != 0 */ + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +/* Substring sort */ +void +sssort(const sauchar_t *T, const saidx_t *PA, + saidx_t *first, saidx_t *last, + saidx_t *buf, saidx_t bufsize, + saidx_t depth, saidx_t n, saint_t lastsuffix) { + saidx_t *a; +#if SS_BLOCKSIZE != 0 + saidx_t *b, *middle, *curbuf; + saidx_t j, k, curbufsize, limit; +#endif + saidx_t i; + + if(lastsuffix != 0) { ++first; } + +#if SS_BLOCKSIZE == 0 + ss_mintrosort(T, PA, first, last, depth); +#else + if((bufsize < SS_BLOCKSIZE) && + (bufsize < (last - first)) && + (bufsize < (limit = ss_isqrt(last - first)))) { + if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; } + buf = middle = last - limit, bufsize = limit; + } else { + middle = last, limit = 0; + } + for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth); +#endif + curbufsize = last - (a + SS_BLOCKSIZE); + curbuf = a + SS_BLOCKSIZE; + if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; } + for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) { + ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth); + } + } +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, a, middle, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, a, middle, depth); +#endif + for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) { + if(i & 1) { + ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth); + a -= k; + } + } + if(limit != 0) { +#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE + ss_mintrosort(T, PA, middle, last, depth); +#elif 1 < SS_BLOCKSIZE + ss_insertionsort(T, PA, middle, last, depth); +#endif + ss_inplacemerge(T, PA, first, middle, last, depth); + } +#endif + + if(lastsuffix != 0) { + /* Insert last type B* suffix. */ + saidx_t PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2; + for(a = first, i = *(first - 1); + (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth))); + ++a) { + *(a - 1) = *a; + } + *(a - 1) = i; + } +} diff --git a/dictBuilder/trsort.c b/dictBuilder/trsort.c new file mode 100644 index 00000000..6fe3e67b --- /dev/null +++ b/dictBuilder/trsort.c @@ -0,0 +1,586 @@ +/* + * trsort.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Functions -*/ + +static const saint_t lg_table[256]= { + -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 +}; + +static INLINE +saint_t +tr_ilg(saidx_t n) { +#if defined(BUILD_DIVSUFSORT64) + return (n >> 32) ? + ((n >> 48) ? + ((n >> 56) ? + 56 + lg_table[(n >> 56) & 0xff] : + 48 + lg_table[(n >> 48) & 0xff]) : + ((n >> 40) ? + 40 + lg_table[(n >> 40) & 0xff] : + 32 + lg_table[(n >> 32) & 0xff])) : + ((n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff])); +#else + return (n & 0xffff0000) ? + ((n & 0xff000000) ? + 24 + lg_table[(n >> 24) & 0xff] : + 16 + lg_table[(n >> 16) & 0xff]) : + ((n & 0x0000ff00) ? + 8 + lg_table[(n >> 8) & 0xff] : + 0 + lg_table[(n >> 0) & 0xff]); +#endif +} + + +/*---------------------------------------------------------------------------*/ + +/* Simple insertionsort for small size groups. */ +static +void +tr_insertionsort(const saidx_t *ISAd, saidx_t *first, saidx_t *last) { + saidx_t *a, *b; + saidx_t t, r; + + for(a = first + 1; a < last; ++a) { + for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) { + do { *(b + 1) = *b; } while((first <= --b) && (*b < 0)); + if(b < first) { break; } + } + if(r == 0) { *b = ~*b; } + *(b + 1) = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_fixdown(const saidx_t *ISAd, saidx_t *SA, saidx_t i, saidx_t size) { + saidx_t j, k; + saidx_t v; + saidx_t c, d, e; + + for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) { + d = ISAd[SA[k = j++]]; + if(d < (e = ISAd[SA[j]])) { k = j; d = e; } + if(d <= c) { break; } + } + SA[i] = v; +} + +/* Simple top-down heapsort. */ +static +void +tr_heapsort(const saidx_t *ISAd, saidx_t *SA, saidx_t size) { + saidx_t i, m; + saidx_t t; + + m = size; + if((size % 2) == 0) { + m--; + if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); } + } + + for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); } + if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); } + for(i = m - 1; 0 < i; --i) { + t = SA[0], SA[0] = SA[i]; + tr_fixdown(ISAd, SA, 0, i); + SA[i] = t; + } +} + + +/*---------------------------------------------------------------------------*/ + +/* Returns the median of three elements. */ +static INLINE +saidx_t * +tr_median3(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2, saidx_t *v3) { + saidx_t *t; + if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); } + if(ISAd[*v2] > ISAd[*v3]) { + if(ISAd[*v1] > ISAd[*v3]) { return v1; } + else { return v3; } + } + return v2; +} + +/* Returns the median of five elements. */ +static INLINE +saidx_t * +tr_median5(const saidx_t *ISAd, + saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) { + saidx_t *t; + if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); } + if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); } + if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); } + if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); } + if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); } + if(ISAd[*v3] > ISAd[*v4]) { return v4; } + return v3; +} + +/* Returns the pivot element. */ +static INLINE +saidx_t * +tr_pivot(const saidx_t *ISAd, saidx_t *first, saidx_t *last) { + saidx_t *middle; + saidx_t t; + + t = last - first; + middle = first + t / 2; + + if(t <= 512) { + if(t <= 32) { + return tr_median3(ISAd, first, middle, last - 1); + } else { + t >>= 2; + return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1); + } + } + t >>= 3; + first = tr_median3(ISAd, first, first + t, first + (t << 1)); + middle = tr_median3(ISAd, middle - t, middle, middle + t); + last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1); + return tr_median3(ISAd, first, middle, last); +} + + +/*---------------------------------------------------------------------------*/ + +typedef struct _trbudget_t trbudget_t; +struct _trbudget_t { + saidx_t chance; + saidx_t remain; + saidx_t incval; + saidx_t count; +}; + +static INLINE +void +trbudget_init(trbudget_t *budget, saidx_t chance, saidx_t incval) { + budget->chance = chance; + budget->remain = budget->incval = incval; +} + +static INLINE +saint_t +trbudget_check(trbudget_t *budget, saidx_t size) { + if(size <= budget->remain) { budget->remain -= size; return 1; } + if(budget->chance == 0) { budget->count += size; return 0; } + budget->remain += budget->incval - size; + budget->chance -= 1; + return 1; +} + + +/*---------------------------------------------------------------------------*/ + +static INLINE +void +tr_partition(const saidx_t *ISAd, + saidx_t *first, saidx_t *middle, saidx_t *last, + saidx_t **pa, saidx_t **pb, saidx_t v) { + saidx_t *a, *b, *c, *d, *e, *f; + saidx_t t, s; + saidx_t x = 0; + + for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { } + if(((a = b) < last) && (x < v)) { + for(; (++b < last) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + } + for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { } + if((b < (d = c)) && (x > v)) { + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + for(; b < c;) { + SWAP(*b, *c); + for(; (++b < c) && ((x = ISAd[*b]) <= v);) { + if(x == v) { SWAP(*b, *a); ++a; } + } + for(; (b < --c) && ((x = ISAd[*c]) >= v);) { + if(x == v) { SWAP(*c, *d); --d; } + } + } + + if(a <= d) { + c = b - 1; + if((s = a - first) > (t = b - a)) { s = t; } + for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + if((s = d - c) > (t = last - d - 1)) { s = t; } + for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); } + first += (b - a), last -= (d - c); + } + *pa = first, *pb = last; +} + +static +void +tr_copy(saidx_t *ISA, const saidx_t *SA, + saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last, + saidx_t depth) { + /* sort suffixes of middle partition + by using sorted order of suffixes of left and right partition. */ + saidx_t *c, *d, *e; + saidx_t s, v; + + v = b - SA - 1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + ISA[s] = d - SA; + } + } + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + ISA[s] = d - SA; + } + } +} + +static +void +tr_partialcopy(saidx_t *ISA, const saidx_t *SA, + saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last, + saidx_t depth) { + saidx_t *c, *d, *e; + saidx_t s, v; + saidx_t rank, lastrank, newrank = -1; + + v = b - SA - 1; + lastrank = -1; + for(c = first, d = a - 1; c <= d; ++c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *++d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } + + lastrank = -1; + for(e = d; first <= e; --e) { + rank = ISA[*e]; + if(lastrank != rank) { lastrank = rank; newrank = e - SA; } + if(newrank != rank) { ISA[*e] = newrank; } + } + + lastrank = -1; + for(c = last - 1, e = d + 1, d = b; e < d; --c) { + if((0 <= (s = *c - depth)) && (ISA[s] == v)) { + *--d = s; + rank = ISA[s + depth]; + if(lastrank != rank) { lastrank = rank; newrank = d - SA; } + ISA[s] = newrank; + } + } +} + +static +void +tr_introsort(saidx_t *ISA, const saidx_t *ISAd, + saidx_t *SA, saidx_t *first, saidx_t *last, + trbudget_t *budget) { +#define STACK_SIZE TR_STACKSIZE + struct { const saidx_t *a; saidx_t *b, *c; saint_t d, e; }stack[STACK_SIZE]; + saidx_t *a, *b, *c; + saidx_t t; + saidx_t v, x = 0; + saidx_t incr = ISAd - ISA; + saint_t limit, next; + saint_t ssize, trlink = -1; + + for(ssize = 0, limit = tr_ilg(last - first);;) { + + if(limit < 0) { + if(limit == -1) { + /* tandem repeat partition */ + tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1); + + /* update ranks */ + if(a < last) { + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + } + if(b < last) { + for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } + } + + /* push */ + if(1 < (b - a)) { + STACK_PUSH5(NULL, a, b, 0, 0); + STACK_PUSH5(ISAd - incr, first, last, -2, trlink); + trlink = ssize - 2; + } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink); + last = a, limit = tr_ilg(a - first); + } else if(1 < (last - b)) { + first = b, limit = tr_ilg(last - b); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink); + first = b, limit = tr_ilg(last - b); + } else if(1 < (a - first)) { + last = a, limit = tr_ilg(a - first); + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else if(limit == -2) { + /* tandem repeat copy */ + a = stack[--ssize].b, b = stack[ssize].c; + if(stack[ssize].d == 0) { + tr_copy(ISA, SA, first, a, b, last, ISAd - ISA); + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA); + } + STACK_POP5(ISAd, first, last, limit, trlink); + } else { + /* sorted partition */ + if(0 <= *first) { + a = first; + do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a)); + first = a; + } + if(first < last) { + a = first; do { *a = ~*a; } while(*++a < 0); + next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1; + if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } } + + /* push */ + if(trbudget_check(budget, a - first)) { + if((a - first) <= (last - a)) { + STACK_PUSH5(ISAd, a, last, -3, trlink); + ISAd += incr, last = a, limit = next; + } else { + if(1 < (last - a)) { + STACK_PUSH5(ISAd + incr, first, a, next, trlink); + first = a, limit = -3; + } else { + ISAd += incr, last = a, limit = next; + } + } + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + if(1 < (last - a)) { + first = a, limit = -3; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + continue; + } + + if((last - first) <= TR_INSERTIONSORT_THRESHOLD) { + tr_insertionsort(ISAd, first, last); + limit = -3; + continue; + } + + if(limit-- == 0) { + tr_heapsort(ISAd, first, last - first); + for(a = last - 1; first < a; a = b) { + for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; } + } + limit = -3; + continue; + } + + /* choose pivot */ + a = tr_pivot(ISAd, first, last); + SWAP(*first, *a); + v = ISAd[*first]; + + /* partition */ + tr_partition(ISAd, first, first + 1, last, &a, &b, v); + if((last - first) != (b - a)) { + next = (ISA[*a] != v) ? tr_ilg(b - a) : -1; + + /* update ranks */ + for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; } + if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } } + + /* push */ + if((1 < (b - a)) && (trbudget_check(budget, b - a))) { + if((a - first) <= (last - b)) { + if((last - b) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } else if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((a - first) <= (b - a)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, b, last, limit, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + if((a - first) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + last = a; + } else { + ISAd += incr, first = a, last = b, limit = next; + } + } else if((last - b) <= (b - a)) { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd + incr, a, b, next, trlink); + first = b; + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } else { + STACK_PUSH5(ISAd, first, a, limit, trlink); + STACK_PUSH5(ISAd, b, last, limit, trlink); + ISAd += incr, first = a, last = b, limit = next; + } + } + } else { + if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; } + if((a - first) <= (last - b)) { + if(1 < (a - first)) { + STACK_PUSH5(ISAd, b, last, limit, trlink); + last = a; + } else if(1 < (last - b)) { + first = b; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } else { + if(1 < (last - b)) { + STACK_PUSH5(ISAd, first, a, limit, trlink); + first = b; + } else if(1 < (a - first)) { + last = a; + } else { + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } + } else { + if(trbudget_check(budget, last - first)) { + limit = tr_ilg(last - first), ISAd += incr; + } else { + if(0 <= trlink) { stack[trlink].d = -1; } + STACK_POP5(ISAd, first, last, limit, trlink); + } + } + } +#undef STACK_SIZE +} + + + +/*---------------------------------------------------------------------------*/ + +/*- Function -*/ + +/* Tandem repeat sort */ +void +trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) { + saidx_t *ISAd; + saidx_t *first, *last; + trbudget_t budget; + saidx_t t, skip, unsorted; + + trbudget_init(&budget, tr_ilg(n) * 2 / 3, n); +/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */ + for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) { + first = SA; + skip = 0; + unsorted = 0; + do { + if((t = *first) < 0) { first -= t; skip += t; } + else { + if(skip != 0) { *(first + skip) = skip; skip = 0; } + last = SA + ISA[t] + 1; + if(1 < (last - first)) { + budget.count = 0; + tr_introsort(ISA, ISAd, SA, first, last, &budget); + if(budget.count != 0) { unsorted += budget.count; } + else { skip = first - last; } + } else if((last - first) == 1) { + skip = -1; + } + first = last; + } + } while(first < (SA + n)); + if(skip != 0) { *(first + skip) = skip; } + if(unsorted == 0) { break; } + } +} diff --git a/dictBuilder/utils.c b/dictBuilder/utils.c new file mode 100644 index 00000000..90fb23ef --- /dev/null +++ b/dictBuilder/utils.c @@ -0,0 +1,381 @@ +/* + * utils.c for libdivsufsort + * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "divsufsort_private.h" + + +/*- Private Function -*/ + +/* Binary search for inverse bwt. */ +static +saidx_t +binarysearch_lower(const saidx_t *A, saidx_t size, saidx_t value) { + saidx_t half, i; + for(i = 0, half = size >> 1; + 0 < size; + size = half, half >>= 1) { + if(A[i + half] < value) { + i += half + 1; + half -= (size & 1) ^ 1; + } + } + return i; +} + + +/*- Functions -*/ + +/* Burrows-Wheeler transform. */ +saint_t +bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *SA, + saidx_t n, saidx_t *idx) { + saidx_t *A, i, j, p, t; + saint_t c; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0) || (idx == NULL)) { return -1; } + if(n <= 1) { + if(n == 1) { U[0] = T[0]; } + *idx = n; + return 0; + } + + if((A = SA) == NULL) { + i = divbwt(T, U, NULL, n); + if(0 <= i) { *idx = i; i = 0; } + return (saint_t)i; + } + + /* BW transform. */ + if(T == U) { + t = n; + for(i = 0, j = 0; i < n; ++i) { + p = t - 1; + t = A[i]; + if(0 <= p) { + c = T[j]; + U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; + A[j] = c; + j++; + } else { + *idx = i; + } + } + p = t - 1; + if(0 <= p) { + c = T[j]; + U[j] = (j <= p) ? T[p] : (sauchar_t)A[p]; + A[j] = c; + } else { + *idx = i; + } + } else { + U[0] = T[n - 1]; + for(i = 0; A[i] != 0; ++i) { U[i + 1] = T[A[i] - 1]; } + *idx = i + 1; + for(++i; i < n; ++i) { U[i] = T[A[i] - 1]; } + } + + if(SA == NULL) { + /* Deallocate memory. */ + free(A); + } + + return 0; +} + +/* Inverse Burrows-Wheeler transform. */ +saint_t +inverse_bw_transform(const sauchar_t *T, sauchar_t *U, saidx_t *A, + saidx_t n, saidx_t idx) { + saidx_t C[ALPHABET_SIZE]; + sauchar_t D[ALPHABET_SIZE]; + saidx_t *B; + saidx_t i, p; + saint_t c, d; + + /* Check arguments. */ + if((T == NULL) || (U == NULL) || (n < 0) || (idx < 0) || + (n < idx) || ((0 < n) && (idx == 0))) { + return -1; + } + if(n <= 1) { return 0; } + + if((B = A) == NULL) { + /* Allocate n*sizeof(saidx_t) bytes of memory. */ + if((B = (saidx_t *)malloc((size_t)n * sizeof(saidx_t))) == NULL) { return -2; } + } + + /* Inverse BW transform. */ + for(c = 0; c < ALPHABET_SIZE; ++c) { C[c] = 0; } + for(i = 0; i < n; ++i) { ++C[T[i]]; } + for(c = 0, d = 0, i = 0; c < ALPHABET_SIZE; ++c) { + p = C[c]; + if(0 < p) { + C[c] = i; + D[d++] = (sauchar_t)c; + i += p; + } + } + for(i = 0; i < idx; ++i) { B[C[T[i]]++] = i; } + for( ; i < n; ++i) { B[C[T[i]]++] = i + 1; } + for(c = 0; c < d; ++c) { C[c] = C[D[c]]; } + for(i = 0, p = idx; i < n; ++i) { + U[i] = D[binarysearch_lower(C, d, p)]; + p = B[p - 1]; + } + + if(A == NULL) { + /* Deallocate memory. */ + free(B); + } + + return 0; +} + +/* Checks the suffix array SA of the string T. */ +saint_t +sufcheck(const sauchar_t *T, const saidx_t *SA, + saidx_t n, saint_t verbose) { + saidx_t C[ALPHABET_SIZE]; + saidx_t i, p, q, t; + saint_t c; + + if(verbose) { fprintf(stderr, "sufcheck: "); } + + /* Check arguments. */ + if((T == NULL) || (SA == NULL) || (n < 0)) { + if(verbose) { fprintf(stderr, "Invalid arguments.\n"); } + return -1; + } + if(n == 0) { + if(verbose) { fprintf(stderr, "Done.\n"); } + return 0; + } + + /* check range: [0..n-1] */ + for(i = 0; i < n; ++i) { + if((SA[i] < 0) || (n <= SA[i])) { + if(verbose) { + fprintf(stderr, "Out of the range [0,%" PRIdSAIDX_T "].\n" + " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", + n - 1, i, SA[i]); + } + return -2; + } + } + + /* check first characters. */ + for(i = 1; i < n; ++i) { + if(T[SA[i - 1]] > T[SA[i]]) { + if(verbose) { + fprintf(stderr, "Suffixes in wrong order.\n" + " T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d" + " > T[SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "]=%d\n", + i - 1, SA[i - 1], T[SA[i - 1]], i, SA[i], T[SA[i]]); + } + return -3; + } + } + + /* check suffixes. */ + for(i = 0; i < ALPHABET_SIZE; ++i) { C[i] = 0; } + for(i = 0; i < n; ++i) { ++C[T[i]]; } + for(i = 0, p = 0; i < ALPHABET_SIZE; ++i) { + t = C[i]; + C[i] = p; + p += t; + } + + q = C[T[n - 1]]; + C[T[n - 1]] += 1; + for(i = 0; i < n; ++i) { + p = SA[i]; + if(0 < p) { + c = T[--p]; + t = C[c]; + } else { + c = T[p = n - 1]; + t = q; + } + if((t < 0) || (p != SA[t])) { + if(verbose) { + fprintf(stderr, "Suffix in wrong position.\n" + " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T " or\n" + " SA[%" PRIdSAIDX_T "]=%" PRIdSAIDX_T "\n", + t, (0 <= t) ? SA[t] : -1, i, SA[i]); + } + return -4; + } + if(t != q) { + ++C[c]; + if((n <= C[c]) || (T[SA[C[c]]] != c)) { C[c] = -1; } + } + } + + if(1 <= verbose) { fprintf(stderr, "Done.\n"); } + return 0; +} + + +static +int +_compare(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + saidx_t suf, saidx_t *match) { + saidx_t i, j; + saint_t r; + for(i = suf + *match, j = *match, r = 0; + (i < Tsize) && (j < Psize) && ((r = T[i] - P[j]) == 0); ++i, ++j) { } + *match = j; + return (r == 0) ? -(j != Psize) : r; +} + +/* Search for the pattern P in the string T. */ +saidx_t +sa_search(const sauchar_t *T, saidx_t Tsize, + const sauchar_t *P, saidx_t Psize, + const saidx_t *SA, saidx_t SAsize, + saidx_t *idx) { + saidx_t size, lsize, rsize, half; + saidx_t match, lmatch, rmatch; + saidx_t llmatch, lrmatch, rlmatch, rrmatch; + saidx_t i, j, k; + saint_t r; + + if(idx != NULL) { *idx = -1; } + if((T == NULL) || (P == NULL) || (SA == NULL) || + (Tsize < 0) || (Psize < 0) || (SAsize < 0)) { return -1; } + if((Tsize == 0) || (SAsize == 0)) { return 0; } + if(Psize == 0) { if(idx != NULL) { *idx = 0; } return SAsize; } + + for(i = j = k = 0, lmatch = rmatch = 0, size = SAsize, half = size >> 1; + 0 < size; + size = half, half >>= 1) { + match = MIN(lmatch, rmatch); + r = _compare(T, Tsize, P, Psize, SA[i + half], &match); + if(r < 0) { + i += half + 1; + half -= (size & 1) ^ 1; + lmatch = match; + } else if(r > 0) { + rmatch = match; + } else { + lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; + + /* left part */ + for(llmatch = lmatch, lrmatch = match, half = lsize >> 1; + 0 < lsize; + lsize = half, half >>= 1) { + lmatch = MIN(llmatch, lrmatch); + r = _compare(T, Tsize, P, Psize, SA[j + half], &lmatch); + if(r < 0) { + j += half + 1; + half -= (lsize & 1) ^ 1; + llmatch = lmatch; + } else { + lrmatch = lmatch; + } + } + + /* right part */ + for(rlmatch = match, rrmatch = rmatch, half = rsize >> 1; + 0 < rsize; + rsize = half, half >>= 1) { + rmatch = MIN(rlmatch, rrmatch); + r = _compare(T, Tsize, P, Psize, SA[k + half], &rmatch); + if(r <= 0) { + k += half + 1; + half -= (rsize & 1) ^ 1; + rlmatch = rmatch; + } else { + rrmatch = rmatch; + } + } + + break; + } + } + + if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } + return k - j; +} + +/* Search for the character c in the string T. */ +saidx_t +sa_simplesearch(const sauchar_t *T, saidx_t Tsize, + const saidx_t *SA, saidx_t SAsize, + saint_t c, saidx_t *idx) { + saidx_t size, lsize, rsize, half; + saidx_t i, j, k, p; + saint_t r; + + if(idx != NULL) { *idx = -1; } + if((T == NULL) || (SA == NULL) || (Tsize < 0) || (SAsize < 0)) { return -1; } + if((Tsize == 0) || (SAsize == 0)) { return 0; } + + for(i = j = k = 0, size = SAsize, half = size >> 1; + 0 < size; + size = half, half >>= 1) { + p = SA[i + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r < 0) { + i += half + 1; + half -= (size & 1) ^ 1; + } else if(r == 0) { + lsize = half, j = i, rsize = size - half - 1, k = i + half + 1; + + /* left part */ + for(half = lsize >> 1; + 0 < lsize; + lsize = half, half >>= 1) { + p = SA[j + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r < 0) { + j += half + 1; + half -= (lsize & 1) ^ 1; + } + } + + /* right part */ + for(half = rsize >> 1; + 0 < rsize; + rsize = half, half >>= 1) { + p = SA[k + half]; + r = (p < Tsize) ? T[p] - c : -1; + if(r <= 0) { + k += half + 1; + half -= (rsize & 1) ^ 1; + } + } + + break; + } + } + + if(idx != NULL) { *idx = (0 < (k - j)) ? j : i; } + return k - j; +}