#include #include #include #include #define LROW 1002 void initval( float *p[], float *rhs[], int nrows, int ncols, float pborder, float pinside, float rhsborder, float rhsinside ) { int i, j; /***/ for ( i = 0; i < nrows; i++ ) { for ( j = 0; j < ncols; j++ ) { if ( (i == 0) || (j == 0) || (i == (nrows-1)) || (j == (ncols-1)) ) { p[i][j] = pborder; rhs[i][j] = rhsborder; } else { p[i][j] = pinside; rhs[i][j] = rhsinside; } } } } void allocate( int nrows, int ncols, float **cop, float **corhs ) { cudaError_t rc; /***/ cudaSetDevice( 0 ); rc = cudaMalloc( (void**)cop, nrows*ncols*sizeof(**cop) ); if ( rc ) { printf( "Failed to allocate\n" ); return; } rc = cudaMalloc( (void**)(cop+1), nrows*ncols*sizeof(**cop) ); if ( rc ) { printf( "Failed to allocate\n" ); return; } rc = cudaMalloc( (void**)corhs, nrows*ncols*sizeof(**corhs) ); if ( rc ) { printf( "Failed to allocate\n" ); return; } if ( ncols > LROW ) { printf( "Too long rows: %d while maximum is %d\n", ncols, LROW ); return; } } void upload( float *p[], float *rhs[], int nrows, int ncols, float *cop, float *corhs ) { int i; /***/ for ( i = 0; i < nrows; i++ ) { cudaMemcpy( &(cop[i*ncols]), p[i], ncols*sizeof(*cop), cudaMemcpyHostToDevice ); if ( corhs ) cudaMemcpy( &(corhs[i*ncols]), rhs[i], ncols*sizeof(*corhs), cudaMemcpyHostToDevice ); } } void download( float *p[], float *rhs[], int nrows, int ncols, float *cop, float *corhs ) { int i; /***/ for ( i = 0; i < nrows; i++ ) { cudaMemcpy( p[i], &(cop[i*ncols]), ncols*sizeof(*p[0]), cudaMemcpyDeviceToHost ); } } __global__ void iteration( float *f, float *fn, float *rhs, int nrows, int ncols, float delx, float dely, float addcoeff[] ) { int i, n; float *pf, *pfn, *prhs; __shared__ float middle[LROW]; float rdx2, rdy2, beta2; float upper, lower; /***/ rdx2 = 1./delx/delx; rdy2 = 1./dely/dely; beta2 = 1.0/(2.0*(rdx2+rdy2)); /* calculate this grid start & size: */ n = (nrows+gridDim.x-1)/gridDim.x; pf = f + n*ncols*blockIdx.x; pfn = fn + n*ncols*blockIdx.x; prhs = rhs + n*ncols*blockIdx.x; if ( blockIdx.x == (gridDim.x-1) ) n = nrows - n*(gridDim.x-1); /* make the shadow edges: */ if ( blockIdx.x == 0 ) { n++; } else { pf -= ncols; pfn -= ncols; prhs -= ncols; if ( blockIdx.x == (gridDim.x-1) ) n++; else n += 2; } /* pipeline the grid portion through the shared memory "vector registers": */ upper = pf[threadIdx.x]; pf += ncols; pfn += ncols; middle[threadIdx.x] = pf[threadIdx.x]; prhs += ncols; for ( i = 2; i < n; i++ ) { lower = pf[threadIdx.x+ncols]; __syncthreads(); if ( (threadIdx.x > 0) && (threadIdx.x < (ncols-1)) ) { pfn[threadIdx.x] = beta2*( rdx2*(upper+lower) + rdy2*(middle[threadIdx.x-1]+middle[threadIdx.x+1]) - prhs[threadIdx.x]); } prhs += ncols; pf += ncols; pfn += ncols; __syncthreads(); upper = middle[threadIdx.x]; middle[threadIdx.x] = lower; } } void poissn( float *p[], float *rhs[], float **pcop, int ip, float *corhs, int nrows, int ncols, float delx, float dely, float addcoeff[] ) { iteration<<<32,ncols>>>( pcop[ip], pcop[1-ip], corhs, nrows, ncols, delx, dely, addcoeff ); }