2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// Copyright 2013 Pixar
|
2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// Licensed under the Apache License, Version 2.0 (the "Apache License")
|
|
|
|
// with the following modification; you may not use this file except in
|
|
|
|
// compliance with the Apache License and the following modification to it:
|
|
|
|
// Section 6. Trademarks. is deleted and replaced with:
|
2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// 6. Trademarks. This License does not grant permission to use the trade
|
|
|
|
// names, trademarks, service marks, or product names of the Licensor
|
|
|
|
// and its affiliates, except as required to comply with Section 4(c) of
|
|
|
|
// the License and to reproduce the content of the NOTICE file.
|
2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// You may obtain a copy of the Apache License at
|
2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
2013-07-18 21:19:50 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the Apache License with the above modification is
|
|
|
|
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
// KIND, either express or implied. See the Apache License for the specific
|
|
|
|
// language governing permissions and limitations under the Apache License.
|
2012-06-11 18:53:35 +00:00
|
|
|
//
|
2013-09-26 19:04:57 +00:00
|
|
|
|
2012-06-11 18:53:35 +00:00
|
|
|
#ifndef OSD_CUDA_INIT_H
|
|
|
|
#define OSD_CUDA_INIT_H
|
|
|
|
|
2012-12-11 01:15:13 +00:00
|
|
|
#include <algorithm>
|
2013-01-19 00:03:17 +00:00
|
|
|
#include <cstdio>
|
2012-12-11 01:15:13 +00:00
|
|
|
|
2012-06-11 18:53:35 +00:00
|
|
|
// From "NVIDIA GPU Computing SDK 4.2/C/common/inc/cutil_inline_runtime.h":
|
|
|
|
|
|
|
|
// Beginning of GPU Architecture definitions
|
|
|
|
// Maps a CUDA compute capability (SM version major.minor) to the number of
// CUDA cores per streaming multiprocessor (SM).
//
//   major, minor : the two components of the SM version, e.g. (3, 5) for SM 3.5.
//
// Returns the cores-per-SM value for a known SM version. For a version that is
// not present in the table a diagnostic is printed to stdout and -1 is
// returned, so callers must be prepared for a negative result.
inline int _ConvertSMVer2Cores_local(int major, int minor)
{
    // One table row: the SM version encoded as 0xMm (hexadecimal notation,
    // M = SM major version, m = SM minor version) and its cores per SM.
    typedef struct {
        int SM;
        int Cores;
    } sSMtoCores;

    // Static so the table is built once rather than on every call.
    // Values follow the table in NVIDIA's CUDA samples (helper_cuda.h).
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 }, // Tesla Generation   (SM 1.0) G80 class
      { 0x11,   8 }, // Tesla Generation   (SM 1.1) G8x class
      { 0x12,   8 }, // Tesla Generation   (SM 1.2) G9x class
      { 0x13,   8 }, // Tesla Generation   (SM 1.3) GT200 class
      { 0x20,  32 }, // Fermi Generation   (SM 2.0) GF100 class
      { 0x21,  48 }, // Fermi Generation   (SM 2.1) GF10x class
      { 0x30, 192 }, // Kepler Generation  (SM 3.0) GK10x class
      { 0x32, 192 }, // Kepler Generation  (SM 3.2)
      { 0x35, 192 }, // Kepler Generation  (SM 3.5) GK11x class
      { 0x37, 192 }, // Kepler Generation  (SM 3.7) GK21x class
      { 0x50, 128 }, // Maxwell Generation (SM 5.0) GM10x class
      { 0x52, 128 }, // Maxwell Generation (SM 5.2) GM20x class
      { 0x53, 128 }, // Maxwell Generation (SM 5.3) GM20x class
      { 0x60,  64 }, // Pascal Generation  (SM 6.0) GP100 class
      { 0x61, 128 }, // Pascal Generation  (SM 6.1) GP10x class
      { 0x62, 128 }, // Pascal Generation  (SM 6.2) GP10x class
      { 0x70,  64 }, // Volta Generation   (SM 7.0) GV100 class
      { 0x72,  64 }, // Volta Generation   (SM 7.2)
      { 0x75,  64 }, // Turing Generation  (SM 7.5)
      { 0x80,  64 }, // Ampere Generation  (SM 8.0)
      { 0x86, 128 }, // Ampere Generation  (SM 8.6)
      { 0x87, 128 }, // Ampere Generation  (SM 8.7)
      { 0x89, 128 }, // Ada Generation     (SM 8.9)
      { 0x90, 128 }, // Hopper Generation  (SM 9.0)
      {   -1,  -1 }  // end-of-table sentinel
    };

    // Encode the requested version the same way the table does (0xMm).
    const int smVersion = (major << 4) + minor;

    for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; ++index) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
|
|
|
|
// end of GPU Architecture definitions
|
|
|
|
|
|
|
|
// This function returns the best GPU (with maximum GFLOPS)
|
|
|
|
inline int cutGetMaxGflopsDeviceId()
|
|
|
|
{
|
|
|
|
int current_device = 0, sm_per_multiproc = 0;
|
|
|
|
int max_compute_perf = 0, max_perf_device = 0;
|
|
|
|
int device_count = 0, best_SM_arch = 0;
|
|
|
|
cudaDeviceProp deviceProp;
|
|
|
|
|
|
|
|
cudaGetDeviceCount( &device_count );
|
|
|
|
// Find the best major SM Architecture GPU device
|
|
|
|
while ( current_device < device_count ) {
|
|
|
|
cudaGetDeviceProperties( &deviceProp, current_device );
|
|
|
|
if (deviceProp.major > 0 && deviceProp.major < 9999) {
|
|
|
|
best_SM_arch = std::max(best_SM_arch, deviceProp.major);
|
|
|
|
}
|
|
|
|
current_device++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find the best CUDA capable GPU device
|
|
|
|
current_device = 0;
|
|
|
|
while( current_device < device_count ) {
|
|
|
|
cudaGetDeviceProperties( &deviceProp, current_device );
|
|
|
|
if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
|
|
|
|
sm_per_multiproc = 1;
|
|
|
|
} else {
|
|
|
|
sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
|
|
|
|
}
|
|
|
|
int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
|
|
|
|
if( compute_perf > max_compute_perf ) {
|
|
|
|
// If we find GPU with SM major > 2, search only these
|
|
|
|
if ( best_SM_arch > 2 ) {
|
|
|
|
// If our device==dest_SM_arch, choose this, or else pass
|
2012-08-04 02:51:27 +00:00
|
|
|
if (deviceProp.major == best_SM_arch) {
|
2012-06-11 18:53:35 +00:00
|
|
|
max_compute_perf = compute_perf;
|
|
|
|
max_perf_device = current_device;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
max_compute_perf = compute_perf;
|
|
|
|
max_perf_device = current_device;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++current_device;
|
|
|
|
}
|
|
|
|
return max_perf_device;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif //OSD_CUDA_INIT_H
|