The C version of parallelDiffusion is as follows:
/* WARNING: There is a bug in this program! */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
const double pi = 3.1415926535897932385;
int main( int argc, char *argv[] ){
/* Arguments required for executing argv[0]: */
/* 0 -> serialDiffusion */
/* 1 -> nni */
/* 2 -> nnj */
/* 3 -> numTimeSteps */
double **uo; /* solution at previous time step */
double **u; /* solution at current time step */
double mu, nu, c; /* parameters of the FDE */
double ell1,ell2; /* x,y grid dimensions, 0 <= x <= ell1, etc. */
double dx,dy; /* x,y grid spacing, or spacial step size */
int nni,nnj; /* x,y number of interior grid points; excl. bdy. pts.*/
int numTimeSteps; /* when to end simulation */
double dt; /* time step size */
double dcoeff; /* diffusion coefficient */
double u_0; /* uniform initial conditions */
double actual; /* the true solution */
int i,j,k,m,n,timeStep;
int myStarti,myEndi,mynni; /* my start, end, and number of grid points */
/* in x-direction */
int leftProc,rightProc; /* my neighboring processes, left and right */
int myrank,numProcs;
/* Declare an MPI status variable. */
MPI_Status status;
/* Initialize the MPI API: */
MPI_Init(&argc, &argv);
/* Request my ID number: */
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
/* I'll also ask how many other processors are out there: */
MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
/* Okay. The preparations have been made. */
/* Set the diffusion coefficient, grid dimensions, */
/* and the uniform initial condition. */
dcoeff = 1.0;
ell1 = 7.0;
ell2 = 1.0;
u_0 = 1.0;
/* Get the number of interior grid points and */
/* number of time steps from the command line. */
nni = atoi( argv[1] );
nnj = atoi( argv[2] );
numTimeSteps = atoi( argv[3] );
/* Compute number of x-direction grid points allocated to me, mynni. */
/* Compute my starting and ending x-direction grid points, myStarti and */
/* myEndi. */
mynni = nni / numProcs;
if ( myrank < (nni % numProcs) ) {
mynni++;
myStarti = myrank*mynni + 1;
}
else {
myStarti = myrank*mynni + 1 + (nni % numProcs);
}
myEndi = myStarti + mynni - 1;
/* Allocate memory for my portion of the solution arrays. */
/* Include boundaries. */
/* As declared above, u is a pointer to a pointer to a double. */
/* Allocate my memory so u points to the first element of a */
/* (mynni+2)-long array of pointers to doubles. Likewise for uo. */
u = (double **) malloc( (size_t) ( (mynni+2) * sizeof(double*) ) );
uo = (double **) malloc( (size_t) ( (mynni+2) * sizeof(double*) ) );
/* u[0] is a pointer to a double. */
/* Allocate my memory so u[0] points to the first element of a */
/* (mynni+2)*(nnj+2)-long array of doubles. Likewise for uo[0]. */
u[0] = (double *) malloc( (size_t) ( ((mynni+2)*(nnj+2)) * sizeof(double) ) );
uo[0] = (double *) malloc( (size_t) ( ((mynni+2)*(nnj+2)) * sizeof(double) ) );
/* Now set pointer u[1] to point to *(u[0] + (nnj+2)). */
/* And set pointer u[2] to point to *(u[0] + 2*(nnj+2)) */
/* or *(u[1] + (nnj+2)). */
/* And so forth. Likewise for uo[1], uo[2], etc. */
for ( i = 1; i < mynni+2; i++ ) {
u[i] = u[i-1] + (nnj+2);
uo[i] = uo[i-1] + (nnj+2);
}
/* We have now dynamically allocated my memory for the arrays */
/* u[mynni+2][nnj+2] and uo[mynni+2][nnj+2] which are large */
/* enough to include the boundary points. */
/* Set the ranks of the processors to the left and right of me. */
/* These are the processors I will communicate with. */
rightProc = myrank + 1;
if(rightProc == numProcs) rightProc = MPI_PROC_NULL;
leftProc = myrank - 1;
if(leftProc == -1) leftProc = MPI_PROC_NULL;
/* Compute dx and dy, and set dt so that the Courant condition */
/* is satisfied. */
dx = ell1 / ( nni + 1 );
dy = ell2 / ( nnj + 1 );
dt = 0.49 / ( dcoeff*( 1.0/(dx*dx) + 1.0/(dy*dy) ) );
/* Set uniform initial conditions on my portion of the grid. */
for ( i = 1; i <= mynni; i++ ) {
for ( j = 1; j <= nnj; j++ ) {
u[i][j] = u_0;
}
}
/* Set boundary conditions for my portion of the grid. */
/* Assume Dirichlet boundary conditions equal to 0. */
/* First the y = 0 boundary. */
for ( i = 0; i <= mynni+1; i++ ) {
u[i][0] = 0.0;
}
/* Second the y = ell2 boundary. */
for ( i = 0; i <= mynni+1; i++ ) {
u[i][nnj+1] = 0.0;
}
/* Third the x = 0 boundary. */
if ( myrank == 0 ) {
for ( j = 1; j <= nnj; j++ ) {
u[0][j] = 0.0;
}
}
/* Finally the x = ell1 boundary. */
if ( myrank == numProcs-1 ) {
for ( j = 1; j <= nnj; j++ ) {
u[mynni+1][j] = 0.0;
}
}
/* Solve the 2-D diffusion equation by the explicit FTCS method. */
mu = dcoeff*dt / (dx*dx);
nu = dcoeff*dt / (dy*dy);
c = 1.0 - 2.0*( mu + nu );
for ( timeStep = 1; timeStep <= numTimeSteps; timeStep++ ) {
/* Update previous time step data. */
for ( i = 0; i <= mynni+1; i++ ) {
for ( j = 0; j <= nnj+1; j++ ) {
uo[i][j] = u[i][j];
}
}
/* Now compute the stencil for each ij pair that I own. */
for ( i = 1; i <= mynni; i++ ) {
for ( j = 1; j <= nnj; j++ ) {
u[i][j] = mu*(uo[i+1][j]+uo[i-1][j]) + c*uo[i][j]
+ nu*(uo[i][j+1]+uo[i][j-1]);
}
}
/* Now send/receive ghost points to/from my left and right neighbors. */
/* Be careful that no deadlock situations arise! */
/* To avoid deadlocks, we let all odd-ranked processors send first */
/* and receive second. */
if ( (myrank % 2) == 1 ) {
MPI_Send( u[1], nnj+2, MPI_DOUBLE, leftProc, 0, MPI_COMM_WORLD );
MPI_Recv( u[0], nnj+2, MPI_DOUBLE, leftProc, 0, MPI_COMM_WORLD,
&status );
MPI_Send( u[mynni], nnj+2, MPI_DOUBLE, rightProc, 0,
MPI_COMM_WORLD );
MPI_Recv( u[mynni+1], nnj+2, MPI_DOUBLE, rightProc, 0,
MPI_COMM_WORLD, &status );
}
/* And we let all even-ranked processors receive first and */
/* and send second. */
else {
MPI_Recv( u[mynni+1], nnj+2, MPI_DOUBLE, rightProc, 0,
MPI_COMM_WORLD, &status );
MPI_Send( u[mynni], nnj+2, MPI_DOUBLE, rightProc, 0,
MPI_COMM_WORLD );
MPI_Recv( u[0], nnj+2, MPI_DOUBLE, leftProc, 0, MPI_COMM_WORLD,
&status );
MPI_Send( u[1], nnj+2, MPI_DOUBLE, leftProc, 0, MPI_COMM_WORLD );
}
/* Now proceed to the next time step. */
}
/* Now that I am done we should consolidate our results somewhere. */
/* But this is just a test program, so let's not bother. Rather, */
/* let's just compare our results at a representative point with */
/* the actual solution. */
/* Let's look now at a specific grid point. */
i = (int) nni/(numProcs*2);
j = 10;
/* If I own this grid point, then let me do the work. */
if ( (i>=myStarti) && (i<=myEndi) ) {
/* Compute the actual temperature at this point. */
actual = 0.0;
for ( m = 1; m < 1000; m += 2 ) {
for ( n = 1; n < 1000; n += 2 ) {
actual += 1.0/(m*n) * exp( - dcoeff * pi*pi *
( m*m/(ell1*ell1) + n*n/(ell2*ell2) )*
numTimeSteps*dt )
* sin( m*pi*i*dx/ell1 ) * sin( n*pi*j*dy/ell2 );
}
}
actual *= 16.0*u_0 / (pi*pi);
/* Compare the actual with the numerical result at this point. */
printf( "This is process: %d After %d time steps some results are:\n", myrank, numTimeSteps );
printf( "actual u[%d][%d] = %f\n", i, j, actual );
printf( "computed u[%d][%d] = %f\n", i, j, u[i-myStarti+1][j] );
}
/* Let's look now at another specific grid point. Why is this not modular?? :-) */
//i = (int) ell1 / (4*dx);
i = (int) nni/numProcs;
/* If I own this grid point, then let me do the work. */
if ( (i>=myStarti) && (i<=myEndi) ) {
/* Compute the actual temperature at this point. */
actual = 0.0;
for ( m = 1; m < 1000; m += 2 ) {
for ( n = 1; n < 1000; n += 2 ) {
actual += 1.0/(m*n) * exp( - dcoeff * pi*pi *
( m*m/(ell1*ell1) + n*n/(ell2*ell2) )*
numTimeSteps*dt )
* sin( m*pi*i*dx/ell1 ) * sin( n*pi*j*dy/ell2 );
}
}
actual *= 16.0*u_0 / (pi*pi);
/* Compare the actual with the numerical result at this point. */
printf( "This is process: %d After %d time steps some results are:\n", myrank, numTimeSteps );
printf( "actual u[%d][%d] = %f\n", i, j, actual );
printf( "computed u[%d][%d] = %f\n", i, j, u[i-myStarti+1][j] );
}
/* Close the MIP API: */
MPI_Finalize();
return 0;
}
Note: As was the case before, once again, if you did not copy example programs from HPC's account, then copy the above source code as parallelDiffusion.c to your ParallelDiffusion directory.
Notice that we do not consolidate results in a single output file nor on a single processor as would be done in a practical application.
Exercise: Do this!
To compile the parallelDiffusion program, on the head node (bhX) type:
[agopu@bh2 agopu]$ cd ~/MPI_Tutorial/ParallelDiffusion/ [agopu@bh2 ParallelDiffusion]$ makeThe ls -l command will show your new executable parallelDiffusion .
To run this program on interactive nodes you got through qsub -I (bcXX), do:
[agopu@bc81 agopu]$ cd ~/MPI_Tutorial/ParallelDiffusion/ [agopu@bc81 ParallelDiffusion]$ lamboot $PBS_NODEFILE [agopu@bc81 ParallelDiffusion]$ mpirun C parallelDiffusion 8000 1000 100 [agopu@bc81 ParallelDiffusion]$ lamhalt
This is process: 0 After 100 time steps some results are: actual u[1000][10] = 0.874806 computed u[1000][10] = 0.873978 This is process: 0 After 100 time steps some results are: actual u[2000][10] = 0.874775 computed u[2000][10] = 0.848010 [0] Intel Trace Collector INFO: Writing tracefile parallelDiffusion.stf in /N/u/agopu/MPI_Tutorial/ParallelDiffusion
Non-interactive job submission: To submit a PBS job to run this program, use the submit_parallelDiffusion.sh shell script within your ParallelDiffusion directory.
Note: Again, as before, if you did not copy examples programs from the HPC account then, modify the qsub job submission script so that mpirun will execute the parallelDiffusion program with parameters 8000 1000 100 or other values you may wish to have.
. . .
mpirun C parallelDiffusion 8000 1000 100
. . .
Once you've done that, save the script as submit_parallelDiffusion.sh and then do the job submission.
[agopu@bh2 ParallelDiffusion]$ pwd /N/u/agopu/MPI_Tutorial/ParallelDiffusion/ [agopu@bh2 ParallelDiffusion]$ qsub submit_parallelDiffusion.sh
By now, hopefully you know how to monitor PBS jobs! If you have trouble, then take at a look at Working with Portable Batch Scheduler (PBS) section; PBS related details are explained in it.
If you submitted the job to PBS non-intereactively, then check the output and error files for messages your program or PBS may have given out.
[agopu@bh2 ParallelDiffusion] pwd
/N/u/agopu/MPI_Tutorial/ParallelDiffusion/
[agopu@bh2 ParallelDiffusion]$ ls -latr submit_*.o* submit_*.e*
-rw------- 1 agopu hpc 184 Dec 20 15:50 submit_parallel.o286298
[agopu@bh2 ParallelDiffusion]$ more submit_parallel.o286298
After 100 time steps some results are:
actual u[400][10] = 0.874737
. . .
computed u[1000][10] = 0.873978
[0] Intel Trace Collector INFO: Writing tracefile parallelDiffusion.stf in /N/u/agopu/MPI_Tutorial/ParallelDiffusion
| Previous: Parallelizing the Numerical Solution... | Up: Table of Contents | Next: Simple MPI I/O and Error Handling |
|---|