#include #include #include #include "shmemtime.h" #include #ifdef FORFERMI #include "/common/cuda/include/cuda_runtime.h" #endif #include "dimension.h" #define FILENAME "a_fermi.dat" // Change here the number of steps, the cell geometry, etc #define NITER 5000 #define STEPITER 1000 #define delx 0.5 #define dely 0.25 // end change here. /***/ extern void itstep( int mx, int my, void *f, void *newf, void *r, double rdx2, double rdy2, double beta ); int main( int argc, char **argv ) { int i, j, n, mx1, mx2, my1, my_number, n_of_nodes, totalmx, partmx, leftmx, mx, my; FILE *fp; double howlong; double rdx2, rdy2, beta; /***/ shmem_init(); my_number = shmem_my_pe(); n_of_nodes = shmem_n_pes(); if ( argc != 3 ) { if ( !my_number ) { fprintf( stderr, "Usage: %s \n", argv[0] ); } return (-1); } totalmx = mx = (int)atol( argv[1] ); my = (int)atol( argv[2] ); if ( my < 1 ) { if ( !my_number ) { fprintf( stderr, "Number of columns (%d) should be positive\n", my ); } return (-1); } /* Compute the number of rows per node: */ mx = (totalmx+n_of_nodes-1)/n_of_nodes; /* This is the number of rows for all but the last: */ partmx = mx; /* This is the number of rows for the last: */ /* It cannot be greater than partmx, but it can be non-positive: */ leftmx = totalmx - partmx*(n_of_nodes-1); if ( leftmx < 1 ) { if ( !my_number ) { fprintf( stderr, "Cannot distribute rows, too many processors\n" ); } return (-1); } if ( my_number == (n_of_nodes-1) ) mx = leftmx; /* End rows distribution. */ partmx += 2; mx += 2; my += 2; /* Here we know the array sizes, so make the arrays themselves: */ { DIM2( double, f, my ); DIM2( double, newf, my ); DIM2( double, r, my ); typeof( f ) pf[2]; int curf; #ifdef FORFERMI double *devf; double *devnewf; double *devr; double *devpf[2]; cudaError_t rc1, rc2, rc3; #endif /***/ f = (typeof(f))shmalloc( 2*partmx*sizeof(*f) ); r = (typeof(r))malloc( mx*sizeof(*r) ); if ( (!f) || (!r) ) { fprintf( stderr, "Cannot allocate, exiting\n" ); exit( -1 ); } curf = 0; pf[0] = f; pf[1] = f + partmx; newf = pf[1]; #ifdef FORFERMI cudaSetDevice( my_number % 3 ); rc1 = cudaMalloc( (void**)(&devf), mx*my*sizeof(*devf) ); rc2 = cudaMalloc( (void**)(&devnewf), mx*my*sizeof(*devnewf) ); rc3 = cudaMalloc( (void**)(&devr), mx*my*sizeof(*devr) ); if ( (rc1 != cudaSuccess) || (rc2 != cudaSuccess) || (rc3 != cudaSuccess) ) { fprintf( stderr, "Cannot allocate device memory, exiting\n" ); exit( -1 ); } else { devpf[0] = devf; devpf[1] = devnewf; } #endif rdx2 = 1./delx/delx; rdy2 = 1./dely/dely; beta = 1.0/(2.0*(rdx2+rdy2)); if ( !my_number ) { printf( "Solving heat conduction task on %d by %d grid by %d processors\n", totalmx, my-2, n_of_nodes ); fflush( stdout ); } for ( i = 0; i < mx; i++ ) { for ( j = 0; j < my; j++ ) { if ( ((i==0)&&(my_number==0)) || (j==0) || ((i==(mx-1))&&(my_number==(n_of_nodes-1))) || (j==(my-1)) ) newf[i][j] = f[i][j] = 1.0; else newf[i][j] = f[i][j] = 0.0; r[i][j] = 0.0; } } #ifdef FORFERMI cudaMemcpy( devf, f, mx*my*sizeof(*devf), cudaMemcpyHostToDevice ); cudaMemcpy( devnewf, newf, mx*my*sizeof(*devnewf), cudaMemcpyHostToDevice ); cudaMemcpy( devr, r, mx*my*sizeof(*devr), cudaMemcpyHostToDevice ); #endif mx1 = mx - 1; my1 = my - 1; shmem_barrier_all(); /* Iteration loop: */ howlong = shmem_ttt(); for ( n = 0; n < NITER; n++ ) { if ( !my_number ) { if ( !(n%STEPITER) ) printf( "Iteration %d\n", n ); } /* Step of calculation starts here: */ f = pf[curf]; newf = pf[1-curf]; #ifdef FORFERMI devf = devpf[curf]; devnewf = devpf[1-curf]; itstep( mx, my, devf, devnewf, devr, rdx2, rdy2, beta ); if ( my_number > 0 ) cudaMemcpy( &(newf[1][1]), devnewf+my+1, (my-2)*sizeof(*devnewf), cudaMemcpyDeviceToHost ); if ( my_number < (n_of_nodes-1) ) cudaMemcpy( &(newf[mx-2][1]), devnewf+my*(mx-2)+1, (my-2)*sizeof(*devnewf), cudaMemcpyDeviceToHost ); #else itstep( mx, my, f, newf, r, rdx2, rdy2, beta ); #endif /* Do all the transfers: */ shmem_barrier_all(); if ( my_number > 0 ) shmem_double_put( &(newf[partmx-1][1]), &(newf[1][1]), my-2, my_number-1 ); if ( my_number < (n_of_nodes-1) ) shmem_double_put( &(newf[0][1]), &(newf[mx-2][1]), my-2, my_number+1 ); shmem_barrier_all(); #ifdef FORFERMI if ( my_number > 0 ) cudaMemcpy( devnewf+1, &(newf[0][1]), (my-2)*sizeof(*devnewf), cudaMemcpyHostToDevice ); if ( my_number < (n_of_nodes-1) ) cudaMemcpy( devnewf+my*(mx-1)+1, &(newf[mx-1][1]), (my-2)*sizeof(*devnewf), cudaMemcpyHostToDevice ); #endif /* swap the halves: */ curf = 1 - curf; } #ifdef FORFERMI cudaMemcpy( newf, devnewf, mx*my*sizeof(*devnewf), cudaMemcpyDeviceToHost ); #endif if ( !my_number ) { printf( "Elapsed time: %f sec\n", (float)(shmem_ttt()-howlong) ); fp = fopen( FILENAME, "w" ); fclose( fp ); } for ( j = 0; j < n_of_nodes; j++ ) { shmem_barrier_all(); if ( j == my_number ) { fp = fopen( FILENAME, "a" ); for ( i = 1; i < (mx-1); i++ ) { fwrite( &(newf[i][1]), my-2, sizeof(newf[0][0]), fp ); } fclose( fp ); } } } shmem_finalize(); return 0; }