/****************************************************************************** * Copyright 2010 Duane Merrill * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * * * * AUTHORS' REQUEST: * * If you use|reference|benchmark this code, please cite our Technical * Report (http://www.cs.virginia.edu/~dgm4d/papers/RadixSortTR.pdf): * * @TechReport{ Merrill:Sorting:2010, * author = "Duane Merrill and Andrew Grimshaw", * title = "Revisiting Sorting for GPGPU Stream Architectures", * year = "2010", * institution = "University of Virginia, Department of Computer Science", * address = "Charlottesville, VA, USA", * number = "CS2010-03" * } * * For more information, see our Google Code project site: * http://code.google.com/p/back40computing/ * * Thanks! ******************************************************************************/ /****************************************************************************** * Simple test driver program for *large-problem* radix sorting. 
* * Useful for demonstrating how to integrate radix sorting into * your application ******************************************************************************/ /****************************************************************************** * Converted from CUDA to OpenCL/DirectCompute by Erwin Coumans ******************************************************************************/ #ifdef _WIN32 #pragma warning (disable:4996) #endif #include #include #include #include #include #include #include //#include #include /********************** * */ #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "../btgui/Timing/b3Clock.h" cl_context g_cxMainContext; cl_device_id g_device; cl_command_queue g_cqCommandQueue; /*********************** * */ bool g_verbose; ///Preferred OpenCL device/platform. When < 0 then no preference is used. ///Note that b3OpenCLUtils might still use the preference of using a platform vendor that matches the SDK vendor used to build the application. ///Preferred device/platform take priority over this platform-vendor match int gPreferredDeviceId = -1; int gPreferredPlatformId = -1; /****************************************************************************** * Routines ******************************************************************************/ /** * Keys-only sorting. Uses the GPU to sort the specified vector of elements for the given * number of iterations, displaying runtime information. 
* * @param[in] num_elements * Size in elements of the vector to sort * @param[in] h_keys * Vector of keys to sort * @param[in] iterations * Number of times to invoke the GPU sorting primitive * @param[in] cfg * Config */ /* NOTE(review): keys-only TimedSort. As written it prints a header line, uploads hostData with copyFromHost, runs untimed warm-up calls of sorter.execute(), then times `iterations` calls of sorter.execute() (each preceded by a fresh copyFromHost and clFinish, and followed by clFinish), printing each duration plus the average time and throughput. The text of this routine looks truncated: the template parameter list is absent, `sorter` is used without a visible declaration, both `for (int i=0;i` loop headers are cut off, and `@param[in] cfg` above names no actual parameter. Restore the missing spans before building. */ template void TimedSort( unsigned int num_elements, K *h_keys, unsigned int iterations) { printf("Keys only, %d iterations, %d elements\n", iterations, num_elements); int max_elements = num_elements; b3AlignedObjectArray hostData; hostData.resize(num_elements); for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); gpuData.copyFromHost(hostData); //sorter.executeHost(gpuData); sorter.execute(gpuData); b3AlignedObjectArray hostDataSorted; gpuData.copyToHost(hostDataSorted); clFinish(g_cqCommandQueue); { //printf("Key-values, %d iterations, %d elements", iterations, num_elements); // Create sorting enactor // Perform the timed number of sorting iterations double elapsed = 0; float duration = 0; b3Clock watch; //warm-start gpuData.copyFromHost(hostData); clFinish(g_cqCommandQueue); sorter.execute(gpuData); watch.reset(); for (int i = 0; i < iterations; i++) { // Move a fresh copy of the problem into device storage gpuData.copyFromHost(hostData); clFinish(g_cqCommandQueue); // Start GPU timing record double startMs = watch.getTimeMicroseconds()/1e3; // Call the sorting API routine sorter.execute(gpuData); clFinish(g_cqCommandQueue); double stopMs = watch.getTimeMicroseconds()/1e3; duration = stopMs - startMs; // End GPU timing record elapsed += (double) duration; printf("duration = %f\n", duration); } // Display timing information double avg_runtime = elapsed / iterations; // double throughput = ((double) num_elements) / avg_runtime / 1000.0 / 1000.0; // printf(", %f GPU ms, %f x10^9 elts/sec\n", avg_runtime, throughput); double throughput = ((double) num_elements) / avg_runtime / 1000.0 ; printf(", %f GPU ms, %f x10^6 elts/sec\n", avg_runtime, throughput); gpuData.copyToHost(hostData); for (int i=0;i void TimedSort( unsigned int num_elements, K 
*h_keys, V *h_values, unsigned int iterations) { printf("Key-values, %d iterations, %d elements\n", iterations, num_elements); int max_elements = num_elements; b3AlignedObjectArray hostData; hostData.resize(num_elements); for (int i=0;i gpuData(g_cxMainContext,g_cqCommandQueue); gpuData.copyFromHost(hostData); //sorter.executeHost(gpuData); sorter.execute(gpuData); b3AlignedObjectArray hostDataSorted; gpuData.copyToHost(hostDataSorted); #if 0 for (int i=0;i
/* NOTE(review): the key-value TimedSort above stops in the middle of a loop
 * header and is never closed in this file; the remainder of that routine
 * appears to be missing and has to be restored before this file can build. */

/**
 * Fills `key` with pseudo-random bits drawn from rand().
 *
 * Every byte of the key is the bitwise AND of (entropy_reduction + 1) random
 * bytes, so a larger entropy_reduction biases the key towards zero bits.
 * When lower_key_bits is smaller than the bit width of K, the key bytes are
 * copied into an unsigned long long and only its lower_key_bits low-order
 * bits are kept. Generation is repeated while the result does not compare
 * equal to itself, which rejects floating point NaN patterns.
 *
 * @param[out] key
 *   Receives the generated bits
 * @param[in] entropy_reduction
 *   Number of extra random bytes ANDed into each key byte
 * @param[in] lower_key_bits
 *   Number of low-order bits that may be non-zero (defaults to all bits of K)
 */
template <typename K>
void RandomBits(K &key, int entropy_reduction = 0, int lower_key_bits = sizeof(K) * 8)
{
    const unsigned int NUM_UCHARS = (sizeof(K) + sizeof(unsigned char) - 1) / sizeof(unsigned char);
    unsigned char key_bits[NUM_UCHARS];

    // A negative bit count never requested masking in the original comparison
    // (it converted to a huge unsigned value); keep that behaviour explicit.
    const bool mask_high_bits =
        (lower_key_bits >= 0) && ((size_t) lower_key_bits < sizeof(K) * 8);

    do {
        for (unsigned int j = 0; j < NUM_UCHARS; j++) {
            unsigned char quarterword = 0xff;
            for (int i = 0; i <= entropy_reduction; i++) {
                quarterword &= (rand() >> 7);
            }
            key_bits[j] = quarterword;
        }

        if (mask_high_bits) {
            unsigned long long base = 0;
            memcpy(&base, key_bits, sizeof(K));
            // Shift a 64-bit one: shifting a plain int is undefined for
            // 31 bits or more and cannot build masks wider than an int.
            base &= (1ull << lower_key_bits) - 1;
            memcpy(key_bits, &base, sizeof(K));
        }

        memcpy(&key, key_bits, sizeof(K));

    } while (key != key);   // avoids NaNs when generating random floating point numbers
}


/******************************************************************************
 * Templated routines for printing keys/values to the console
 ******************************************************************************/

/** Generic fallback: prints the value with the "%d" conversion. */
template <typename T>
void PrintValue(T val) {
    printf("%d", val);
}

template<>
void PrintValue(float val) {
    printf("%f", val);
}

template<>
void PrintValue(double val) {
    printf("%f", val);
}

template<>
void PrintValue(unsigned char val) {
    printf("%u", val);
}

template<>
void PrintValue(unsigned short val) {
    printf("%u", val);
}

template<>
void PrintValue(unsigned int val) {
    printf("%u", val);
}

template<>
void PrintValue(long val) {
    printf("%ld", val);
}

template<>
void PrintValue(unsigned long val) {
    printf("%lu", val);
}

template<>
void PrintValue(long long val) {
    printf("%lld", val);
}

template<>
void 
PrintValue(unsigned long long val) {
    printf("%llu", val);
}


/**
 * Compares two arrays element by element and reports the first difference.
 *
 * Prints "CORRECT" when all `len` elements match. On the first mismatch it
 * prints the index and both values; when `verbose` is set it also prints,
 * for both arrays, the elements from up to five positions before the
 * mismatch through four positions after it (clipped to the array bounds).
 *
 * @param[in] computed
 *   Array produced by the code under test
 * @param[in] reference
 *   Array holding the expected values
 * @param[in] len
 *   Number of elements to compare
 * @param[in] verbose
 *   Also print the neighbourhood of the first mismatch
 * @return 0 if the arrays are equal, 1 otherwise
 */
template <typename T, typename SizeT>
int CompareResults(T* computed, T* reference, SizeT len, bool verbose = true)
{
    printf("\n");
    for (SizeT i = 0; i < len; i++) {

        if (computed[i] == reference[i]) {
            continue;
        }

        printf("INCORRECT: [%lu]: ", (unsigned long) i);
        PrintValue(computed[i]);
        printf(" != ");
        PrintValue(reference[i]);

        if (verbose) {
            // Both dumps cover the same window around the mismatch
            const size_t window_begin = (i >= 5) ? i - 5 : 0;

            printf("\nresult[...");
            for (size_t j = window_begin; (j < i + 5) && (j < len); j++) {
                PrintValue(computed[j]);
                printf(", ");
            }
            printf("...]");

            printf("\nreference[...");
            for (size_t j = window_begin; (j < i + 5) && (j < len); j++) {
                PrintValue(reference[j]);
                printf(", ");
            }
            printf("...]");
        }

        return 1;
    }

    printf("CORRECT\n");
    return 0;
}


/**
 * Creates an example sorting problem whose keys is a vector of the specified
 * number of K elements, values of V elements, and then dispatches the problem
 * to the GPU for the given number of iterations, displaying runtime information.
* * @param[in] iterations * Number of times to invoke the GPU sorting primitive * @param[in] num_elements * Size in elements of the vector to sort * @param[in] cfg * Config */ template void TestSort( unsigned int iterations, int num_elements, bool keys_only) { // Allocate the sorting problem on the host and fill the keys with random bytes K *h_keys = NULL; K *h_reference_keys = NULL; V *h_values = NULL; h_keys = (K*) malloc(num_elements * sizeof(K)); h_reference_keys = (K*) malloc(num_elements * sizeof(K)); if (!keys_only) h_values = (V*) malloc(num_elements * sizeof(V)); // Use random bits for (unsigned int i = 0; i < num_elements; ++i) { RandomBits(h_keys[i], 0); //h_keys[i] = num_elements-i; //h_keys[i] = 0xffffffffu-i; if (!keys_only) h_values[i] = h_keys[i];//0xffffffffu-i; h_reference_keys[i] = h_keys[i]; } // Run the timing test if (keys_only) { TimedSort(num_elements, h_keys, iterations); } else { TimedSort(num_elements, h_keys, h_values, iterations); } // cudaThreadSynchronize(); // Display sorted key data if (g_verbose) { printf("\n\nKeys:\n"); for (int i = 0; i < num_elements; i++) { PrintValue(h_keys[i]); printf(", "); } printf("\n\n"); } // Verify solution std::sort(h_reference_keys, h_reference_keys + num_elements); CompareResults(h_keys, h_reference_keys, num_elements, true); printf("\n"); fflush(stdout); // Free our allocated host memory if (h_keys != NULL) free(h_keys); if (h_values != NULL) free(h_values); } /** * Displays the commandline usage for this tool */ void Usage() { printf("\ntest_large_problem_sorting [--device=] [--v] [--i=] [--n=] [--key-values] [--deviceId=] [--platformId=]\n"); printf("\n"); printf("\t--v\tDisplays sorted results to the console.\n"); printf("\n"); printf("\t--i\tPerforms the sorting operation times\n"); printf("\t\t\ton the device. Re-copies original input each time. 
Default = 1\n"); printf("\n"); printf("\t--n\tThe number of elements to comprise the sample problem\n"); printf("\t\t\tDefault = 512\n"); printf("\n"); printf("\t--key-values\tSpecifies that keys are accommodated by value pairings\n"); printf("\n"); } /****************************************************************************** * Command-line parsing ******************************************************************************/ #include #include #include class b3CommandLineArgs { protected: std::map pairs; public: // Constructor b3CommandLineArgs(int argc, char **argv) { using namespace std; for (int i = 1; i < argc; i++) { string arg = argv[i]; if ((arg[0] != '-') || (arg[1] != '-')) { continue; } string::size_type pos; string key, val; if ((pos = arg.find( '=')) == string::npos) { key = string(arg, 2, arg.length() - 2); val = ""; } else { key = string(arg, 2, pos - 2); val = string(arg, pos + 1, arg.length() - 1); } pairs[key] = val; } } bool CheckCmdLineFlag(const char* arg_name) { using namespace std; map::iterator itr; if ((itr = pairs.find(arg_name)) != pairs.end()) { return true; } return false; } template void GetCmdLineArgument(const char *arg_name, T &val); int ParsedArgc() { return pairs.size(); } }; template void b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val) { using namespace std; map::iterator itr; if ((itr = pairs.find(arg_name)) != pairs.end()) { istringstream strstream(itr->second); strstream >> val; } } template <> void b3CommandLineArgs::GetCmdLineArgument(const char* arg_name, char* &val) { using namespace std; map::iterator itr; if ((itr = pairs.find(arg_name)) != pairs.end()) { string s = itr->second; val = (char*) malloc(sizeof(char) * (s.length() + 1)); strcpy(val, s.c_str()); } else { val = NULL; } } /****************************************************************************** * Main ******************************************************************************/ extern bool gDebugSkipLoadingBinary; void 
myprintf(const char* msg) { (void*) msg; } int main( int argc, char** argv) { //gDebugSkipLoadingBinary = true; // b3SetCustomPrintfFunc(myprintf); cl_int ciErrNum; b3CommandLineArgs args(argc,argv); args.GetCmdLineArgument("deviceId", gPreferredDeviceId); args.GetCmdLineArgument("platformId", gPreferredPlatformId); b3Printf("Initialize OpenCL using b3OpenCLUtils_createContextFromType\n"); cl_platform_id platformId; // g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); //g_cxMainContext = b3OpenCLUtils_createContextFromType(CL_DEVICE_TYPE_CPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId); oclCHECKERROR(ciErrNum, CL_SUCCESS); int numDev = b3OpenCLUtils_getNumDevices(g_cxMainContext); if (!numDev) { b3Error("error: no OpenCL devices\n"); exit(0); } int devId = 0; g_device = b3OpenCLUtils_getDevice(g_cxMainContext,devId); b3OpenCLUtils_printDeviceInfo(g_device); // create a command-queue g_cqCommandQueue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); //srand(time(NULL)); srand(0); // presently deterministic unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288; unsigned int iterations = 10; bool keys_only = true; // // Check command line arguments // if (args.CheckCmdLineFlag("help")) { Usage(); return 0; } args.GetCmdLineArgument("i", iterations); args.GetCmdLineArgument("n", num_elements); keys_only = !args.CheckCmdLineFlag("key-values"); g_verbose = args.CheckCmdLineFlag("v"); TestSort( iterations, num_elements, keys_only); }