Subversion

gpucv

[/] [experimental/] [trunk/] [gpucv/] [src/] [plugin/] [cxcoregcu/] [oper_array/] [statistics/] [sum.filter.cu] - Rev 567

Compare with Previous - Blame


//CVG_LicenseBegin==============================================================
//
//	Copyright@ Institut TELECOM 2005
//		http://www.institut-telecom.fr/en_accueil.html
//	
//	This software is a GPU accelerated library for computer-vision. It 
//	supports an OPENCV-like extensible interface for easily porting OPENCV 
//	applications.
//	
//	Contacts :
//		patrick.horain@it-sudparis.eu
//		gpucv-developers@picoforge.int-evry.fr
//	
//	Project's Home Page :
//		https://picoforge.int-evry.fr/cgi-bin/twiki/view/Gpucv/Web/WebHome
//	
//	This software is governed by the CeCILL-B license under French law and
//	abiding by the rules of distribution of free software.  You can  use, 
//	modify and/ or redistribute the software under the terms of the CeCILL-B
//	license as circulated by CEA, CNRS and INRIA at the following URL
//	"http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html". 
//	
//================================================================CVG_LicenseEnd
/** 	\brief Contains GpuCV-CUDA correspondance of cxcore->array.
*	\author Yannick Allusse
*/
#include <cxcoregcu/config.h>
#if _GPUCV_COMPILE_CUDA

#include <GPUCVCuda/gpucv_wrapper_c.h>
#include <cxcoregcu/cxcoregcu_array_arithm.kernel.h>
#include <cxcoregcu/cxcoregcu_statistics.kernel.h>

void ReduceGetNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
        if (whichKernel < 3)
        {
                threads = (n < maxThreads) ? n : maxThreads;
                blocks = n / threads;
        }
        else
        {
                if (n == 1) 
                        threads = 1;
                else
                        threads = (n < maxThreads*2) ? n / 2 : maxThreads;
                blocks = n / (threads * 2);

                if (whichKernel == 6)
                        blocks = min(maxBlocks, blocks);
        }
}
//=============================================================
#define PROF_REDUCE(FCT)

/**
*\bug Don't know why yet, but the value of the ponter is changing when arriving into the function...???
*/
_GPUCV_CXCOREGCU_EXPORT_CU 
void gcuSumArr(CvArr* _arr)
{
#if 0
        //gcudaDeviceInit();
        //prepare settings
        unsigned int width		= gcuGetWidth(_arr);
        unsigned int height		= gcuGetHeight(_arr);
        unsigned int depth		= gcuGetGLDepth(_arr);
        unsigned int channels	= gcuGetnChannels(_arr);
        //const unsigned int DataSize = DataN * sizeof(unsigned char);
        int maxThreads = 128;  // number of threads per block
        int whichKernel = 6;
        int maxBlocks = 64;


        //Check inputs is done in the cv_cu.cpp file, to manage exceptions
        PROF_REDUCE(CUT_SAFE_CALL(cutCreateTimer(&hTimer)););	

        //=====================
        //prepare source
        unsigned char * d_src = (unsigned char *)gcuPreProcess(_arr, GCU_INPUT, CU_MEMORYTYPE_DEVICE);
        //=====================

        int numBlocks = 0;
        int numThreads = 0;
        int maxNumBlocks = DataN / maxThreads;
        ReduceGetNumBlocksAndThreads(whichKernel, DataN, maxBlocks, maxThreads, numBlocks, numThreads);
        //if (numBlocks == 1) cpuFinalThreshold = 1;

        //prepare ouput========
        int* d_odata = NULL;
        int* h_odata = (int*) malloc(maxNumBlocks*sizeof(int));
        GCU_CUDA_SAFE_CALL( cudaMalloc((void**) &d_odata, numBlocks*sizeof(int)) );
        //=====================
        
        gcudaThreadSynchronize();

        dim3 dimBlock(numThreads, 1, 1);
        dim3 dimGrid(numBlocks, 1, 1);
        int smemSize = numThreads * sizeof(int);
        switch (numThreads)
        {
        case 512:
                CudaReduceSum<512><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case 256:
                CudaReduceSum<256><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case 128:
                CudaReduceSum<128><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case 64:
                CudaReduceSum< 64><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case 32:
                CudaReduceSum< 32><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case 16:
                CudaReduceSum< 16><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case  8:
                CudaReduceSum<  8><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case  4:
                CudaReduceSum<  4><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case  2:
                CudaReduceSum<  2><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        case  1:
                CudaReduceSum<  1><<< dimGrid, dimBlock, smemSize >>>(d_src, d_odata, DataN); break;
        }
        // sum partial sums from each block on CPU        
        // copy result from device to host
        GCU_CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_odata, numBlocks*sizeof(int), cudaMemcpyDeviceToHost) );

        gcuScalar gpu_result;
        for(int i=0; i<numBlocks; i++) 
        {
                gpu_result.val[0] += h_odata[i];
        }

        //GCU_CUDA_SAFE_CALL (cudaUnBindTexture(texSobel));

        //check results

        //clean output
        GCU_CUDA_SAFE_CALL( cudaFree(d_odata));
        //=====================

        //=====================
        //clean source
        gcuPostProcess(_arr);	

        //close operator
        PROF_REDUCE(CUT_SAFE_CALL(cutDeleteTimer(hTimer)););
#endif
}
//==========================================================
#endif//CUDA

Powered by WebSVN v1.61