#include #include #include #include #define GRIDSIZE 32 #define BLOCKSIZE 1024 extern __shared__ double pmiddle[]; __global__ void iteration( double *f, double *fn, double *rhs, int nrows, int ncols, double rdx2, double rdy2, double beta2 ) { int i, n; double *pf, *pfn, *prhs; double *middle = pmiddle; double upper, lower; /***/ /* calculate this grid start & size: */ n = nrows/gridDim.x; pf = f + n*ncols*blockIdx.x; pfn = fn + n*ncols*blockIdx.x; prhs = rhs + n*ncols*blockIdx.x; if ( blockIdx.x == (gridDim.x-1) ) n = nrows - n*(gridDim.x-1); /* make the shadow edges: */ if ( blockIdx.x == 0 ) { n++; } else { pf -= ncols; pfn -= ncols; prhs -= ncols; if ( blockIdx.x == (gridDim.x-1) ) n++; else n += 2; } /* pipeline the grid portion through the shared memory "vector registers": */ upper = pf[threadIdx.x]; pf += ncols; pfn += ncols; middle[threadIdx.x] = pf[threadIdx.x]; prhs += ncols; for ( i = 2; i < n; i++ ) { lower = pf[threadIdx.x+ncols]; __syncthreads(); if ( (threadIdx.x > 0) && (threadIdx.x < (blockDim.x-1)) ) { pfn[threadIdx.x] = beta2*( rdx2*(upper+lower) + rdy2*(middle[threadIdx.x-1]+middle[threadIdx.x+1]) - prhs[threadIdx.x]); } prhs += ncols; pf += ncols; pfn += ncols; __syncthreads(); upper = middle[threadIdx.x]; middle[threadIdx.x] = lower; } } void itstep( int nrows, int ncols, void *f, void *fn, void *r, double rdx, double rdy, double beta ) { int n, nl; /***/ n = nrows/GRIDSIZE; nl = nrows - n*(GRIDSIZE-1); if ( (n < 1) || (nl < 1) ) { fprintf( stderr, "Too many grid blocks (%d) for the number of rows per device (%d), exiting\n", GRIDSIZE, nrows ); exit( -1 ); } n = ncols; while ( n > 2 ) { nl = n; if ( nl > BLOCKSIZE ) nl = BLOCKSIZE; iteration<<>>( (double*)f, (double*)fn, (double*)r, nrows, ncols, rdx, rdy, beta ); f = ((double*)f)+(nl-2); fn = ((double*)fn)+(nl-2); r = ((double*)r)+(nl-2); n -= (nl-2); } }