%
% This file was automatically produced at Feb 27 2003, 22:51:56 by
% c2latex -c mpithreads_both.c
%
\documentclass[11pt,a4paper]{article}
\setlength{\textwidth}{15cm}
\setlength{\textheight}{22.5cm}
\setlength{\hoffset}{-2cm}
\setlength{\voffset}{-2cm}
\begin{document}
\expandafter\ifx\csname indentation\endcsname\relax%
\newlength{\indentation}\fi
\setlength{\indentation}{0.5em}
\begin{flushleft}
{$/\ast$\it{}$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$\mbox{}\\
$\ast$ FILE: mpithreads\_both.c\mbox{}\\
$\ast$ DESCRIPTION:\mbox{}\\
$\ast$ This program illustrates the simultaneous use of MPI and Pthreads. \mbox{}\\
$\ast$ It is essentially a simple combination of a code that implements a dot \mbox{}\\
$\ast$ product using threads and a code that uses MPI for the same purpose. \mbox{}\\
$\ast$ It is the last of four codes used to show the progression from a serial \mbox{}\\
$\ast$ program to a hybrid MPI/Pthreads program. The other relevant codes are:\mbox{}\\
$\ast$ $-$ mpithreads\_serial.c $-$ The serial version\mbox{}\\
$\ast$ $-$ mpithreads\_threads.c $-$ A shared memory programming model using\mbox{}\\
$\ast$ Pthreads\mbox{}\\
$\ast$ $-$ mpithreads\_mpi.c $-$ A distributed memory programming model with MPI\mbox{}\\
$\ast$\mbox{}\\
$\ast$ All the internode MPI communication is done by the main thread on each \mbox{}\\
$\ast$ node $-$ the other threads within that node need not even be aware that \mbox{}\\
$\ast$ internode communication is being performed. The SPMD model was chosen \mbox{}\\
$\ast$ for MPI out of convenience, with the main data replicated on all nodes. \mbox{}\\
$\ast$ A more memory-efficient implementation would be advisable for larger \mbox{}\\
$\ast$ data sets. This is the simplest model for mixed MPI/Pthreads programming. \mbox{}\\
$\ast$\mbox{}\\
$\ast$ SOURCE: Vijay Sonnad, IBM\mbox{}\\
$\ast$ LAST REVISED: 10/8/99 Blaise Barney\mbox{}\\
$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast$$\ast/$}\mbox{}\\
\mbox{}\\
{\tt \#include} $<${\tt{}pthread.h}$>$\mbox{}\\
{\tt \#include} "{\tt{}mpi.h}" \mbox{}\\
{\tt \#include} $<${\tt{}stdio.h}$>$\mbox{}\\
{\tt \#include} $<${\tt{}stdlib.h}$>$\mbox{}\\
\mbox{}\\
{$/\ast$\it{} \mbox{}\\
This structure has been changed slightly from the previous cases\mbox{}\\
to include the number of threads per node. 
\mbox{}\\
$\ast/$}\mbox{}\\
\mbox{}\\
{\bf typedef} {\bf struct} \mbox{}\\
\hspace*{1\indentation}\{\mbox{}\\
\hspace*{3\indentation}{\bf double} $\ast$a;\mbox{}\\
\hspace*{3\indentation}{\bf double} $\ast$b;\mbox{}\\
\hspace*{3\indentation}{\bf double} sum; \mbox{}\\
\hspace*{3\indentation}{\bf int} veclen; \mbox{}\\
\hspace*{3\indentation}{\bf int} numthrds;\mbox{}\\
\hspace*{1\indentation}\} DOTDATA;\mbox{}\\
\mbox{}\\
{$/\ast$\it{} Define globally accessible variables and a mutex $\ast/$}\mbox{}\\
\mbox{}\\
{\tt \#define} MAXTHRDS 8\mbox{}\\
{\tt \#define} VECLEN 100\mbox{}\\
DOTDATA dotstr; \mbox{}\\
pthread\_t callThd[MAXTHRDS];\mbox{}\\
pthread\_mutex\_t mutexsum;\mbox{}\\
\mbox{}\\
{$/\ast$\it{}\mbox{}\\
The function dotprod requires only minor changes from the versions \mbox{}\\
that used threads alone or MPI alone. \mbox{}\\
$\ast/$}\mbox{}\\
\mbox{}\\
{\bf void} $\ast$dotprod({\bf void} $\ast$arg)\mbox{}\\
\{\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Define and use local variables for convenience $\ast/$}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{\bf int} i, start, end, mythrd, len, numthrds, myid;\mbox{}\\
\hspace*{3\indentation}{\bf double} mysum, $\ast$x, $\ast$y;\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{}\mbox{}\\
\hspace*{3\indentation}The number of threads and nodes determines the beginning \mbox{}\\
\hspace*{3\indentation}and ending indices for the dot product; each thread works \mbox{}\\
\hspace*{3\indentation}on a vector section of length VECLEN.\mbox{}\\
\hspace*{3\indentation}$\ast/$}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}mythrd = ({\bf int})({\bf long})arg;\mbox{}\\
\hspace*{3\indentation}MPI\_Comm\_rank (MPI\_COMM\_WORLD, \&myid);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}numthrds = dotstr.numthrds;\mbox{}\\
\hspace*{3\indentation}len = dotstr.veclen;\mbox{}\\
\hspace*{3\indentation}start = myid$\ast$numthrds$\ast$len + mythrd$\ast$len;\mbox{}\\
\hspace*{3\indentation}end = start + len;\mbox{}\\
\hspace*{3\indentation}x = dotstr.a;\mbox{}\\
\hspace*{3\indentation}y = dotstr.b;\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{}\mbox{}\\
\hspace*{3\indentation}Perform the dot product and assign the result\mbox{}\\
\hspace*{3\indentation}to the appropriate variable in the structure. \mbox{}\\
\hspace*{3\indentation}$\ast/$}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}mysum = 0;\mbox{}\\
\hspace*{3\indentation}{\bf for} (i=start; i$<$end ; i++) \mbox{}\\
\hspace*{4\indentation}\{\mbox{}\\
\hspace*{6\indentation}mysum += (x[i] $\ast$ y[i]);\mbox{}\\
\hspace*{4\indentation}\}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{}\mbox{}\\
\hspace*{3\indentation}Lock a mutex before updating the shared sum in the structure, \mbox{}\\
\hspace*{3\indentation}and unlock it after the update.\mbox{}\\
\hspace*{3\indentation}$\ast/$}\mbox{}\\
\hspace*{3\indentation}pthread\_mutex\_lock (\&mutexsum);\mbox{}\\
\hspace*{3\indentation}printf({\tt"Task \%d thread \%d adding partial sum of \%f to node sum of \%f$\backslash$n"},\mbox{}\\
\hspace*{11\indentation}myid, mythrd, mysum, dotstr.sum);\mbox{}\\
\hspace*{3\indentation}dotstr.sum += mysum;\mbox{}\\
\hspace*{3\indentation}pthread\_mutex\_unlock (\&mutexsum);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}pthread\_exit(({\bf void}$\ast$)0);\mbox{}\\
\}\mbox{}\\
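\mbox{}\\
Because the worker threads in dotprod call MPI\_Comm\_rank, the MPI library is entered from more than one thread. The program below initializes MPI with a plain MPI\_Init call; a minimal sketch of the MPI-2 initialization that instead requests an explicit thread-support level (the level actually granted is returned in {\tt provided} and depends on the MPI implementation) would be:\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{\bf int} provided;\mbox{}\\
\hspace*{3\indentation}MPI\_Init\_thread (\&argc, \&argv, MPI\_THREAD\_MULTIPLE, \&provided);\mbox{}\\
\hspace*{3\indentation}{\bf if} (provided $<$ MPI\_THREAD\_MULTIPLE)\mbox{}\\
\hspace*{6\indentation}printf ({\tt"Warning: full thread support is not available$\backslash$n"});\mbox{}\\
\mbox{}\\
Requesting MPI\_THREAD\_MULTIPLE covers the case where several threads make MPI calls concurrently; if only the main thread communicated, MPI\_THREAD\_FUNNELED would suffice.\mbox{}\\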
\mbox{}\\
{$/\ast$\it{} \mbox{}\\
As before, the main program does very little computation. It creates\mbox{}\\
threads on each node, and the main thread does all the MPI calls. 
\mbox{}\\
$\ast/$}\mbox{}\\
\mbox{}\\
{\bf int} main({\bf int} argc, {\bf char}$\ast$ argv[])\mbox{}\\
\{\mbox{}\\
\hspace*{3\indentation}{\bf int} i, len=VECLEN;\mbox{}\\
\hspace*{3\indentation}{\bf int} myid, numprocs; \mbox{}\\
\hspace*{3\indentation}{\bf int} numthrds;\mbox{}\\
\hspace*{3\indentation}{\bf double} $\ast$a, $\ast$b;\mbox{}\\
\hspace*{3\indentation}{\bf double} nodesum, allsum;\mbox{}\\
\hspace*{3\indentation}{\bf void} $\ast$status;\mbox{}\\
\hspace*{3\indentation}pthread\_attr\_t attr;\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} MPI initialization $\ast/$}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}MPI\_Init (\&argc, \&argv);\mbox{}\\
\hspace*{3\indentation}MPI\_Comm\_size (MPI\_COMM\_WORLD, \&numprocs);\mbox{}\\
\hspace*{3\indentation}MPI\_Comm\_rank (MPI\_COMM\_WORLD, \&myid);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Assign storage and initialize values $\ast/$}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}numthrds = MAXTHRDS;\mbox{}\\
\hspace*{3\indentation}a = ({\bf double}$\ast$) malloc (numprocs$\ast$numthrds$\ast$len$\ast${\bf sizeof}({\bf double}));\mbox{}\\
\hspace*{3\indentation}b = ({\bf double}$\ast$) malloc (numprocs$\ast$numthrds$\ast$len$\ast${\bf sizeof}({\bf double}));\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{\bf for} (i=0; i$<$len$\ast$numprocs$\ast$numthrds; i++)\mbox{}\\
\hspace*{4\indentation}\{\mbox{}\\
\hspace*{5\indentation}a[i]=1;\mbox{}\\
\hspace*{5\indentation}b[i]=a[i];\mbox{}\\
\hspace*{4\indentation}\}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}dotstr.veclen = len; \mbox{}\\
\hspace*{3\indentation}dotstr.a = a; \mbox{}\\
\hspace*{3\indentation}dotstr.b = b; \mbox{}\\
\hspace*{3\indentation}dotstr.sum = 0;\mbox{}\\
\hspace*{3\indentation}dotstr.numthrds = MAXTHRDS;\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} \mbox{}\\
\hspace*{3\indentation}Create a thread attribute specifying that the main thread will\mbox{}\\
\hspace*{3\indentation}join the threads it creates.\mbox{}\\
\hspace*{3\indentation}$\ast/$}\mbox{}\\
\hspace*{3\indentation}pthread\_attr\_init(\&attr);\mbox{}\\
\hspace*{3\indentation}pthread\_attr\_setdetachstate(\&attr, PTHREAD\_CREATE\_JOINABLE);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Create a mutex $\ast/$}\mbox{}\\
\hspace*{3\indentation}pthread\_mutex\_init (\&mutexsum, NULL);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Create threads within this node to perform the dot product $\ast/$}\mbox{}\\
\hspace*{3\indentation}{\bf for} (i=0; i$<$numthrds; i++)\mbox{}\\
\hspace*{4\indentation}\{\mbox{}\\
\hspace*{6\indentation}pthread\_create(\&callThd[i], \&attr, dotprod, ({\bf void} $\ast$)({\bf long})i); \mbox{}\\
\hspace*{4\indentation}\}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Release the thread attribute handle as it is no longer needed $\ast/$}\mbox{}\\
\hspace*{3\indentation}pthread\_attr\_destroy(\&attr);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} Wait for the other threads within this node $\ast/$}\mbox{}\\
\hspace*{3\indentation}{\bf for} (i=0; i$<$numthrds; i++)\mbox{}\\
\hspace*{4\indentation}\{\mbox{}\\
\hspace*{6\indentation}pthread\_join(callThd[i], \&status);\mbox{}\\
\hspace*{4\indentation}\}\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}nodesum = dotstr.sum;\mbox{}\\
\hspace*{3\indentation}printf({\tt"Task \%d node sum is \%f$\backslash$n"}, myid, nodesum);\mbox{}\\
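\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} \mbox{}\\
\hspace*{3\indentation}Expected values: since every element of a and b is 1.0, each thread \mbox{}\\
\hspace*{3\indentation}contributes a partial sum of VECLEN = 100, each node sum is \mbox{}\\
\hspace*{3\indentation}MAXTHRDS$\ast$100 = 800, and the reduced total below should be numprocs$\ast$800. \mbox{}\\
\hspace*{3\indentation}$\ast/$}\mbox{}\\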
\mbox{}\\
\hspace*{3\indentation}{$/\ast$\it{} After the dot product, sum the node results across all nodes $\ast/$}\mbox{}\\
\hspace*{3\indentation}MPI\_Reduce (\&nodesum, \&allsum, 1, MPI\_DOUBLE, MPI\_SUM, 0, MPI\_COMM\_WORLD);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}{\bf if} (myid == 0) \mbox{}\\
\hspace*{6\indentation}printf ({\tt"Done. MPI with threads version: sum $=$ \%f $\backslash$n"}, allsum);\mbox{}\\
\mbox{}\\
\hspace*{3\indentation}MPI\_Finalize();\mbox{}\\
\hspace*{3\indentation}free (a);\mbox{}\\
\hspace*{3\indentation}free (b);\mbox{}\\
\hspace*{3\indentation}pthread\_mutex\_destroy(\&mutexsum);\mbox{}\\
\hspace*{3\indentation}exit (0);\mbox{}\\
\} \mbox{}\\
\end{flushleft}
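With most MPI installations the program can be built with the MPI compiler wrapper and the Pthreads flag, e.g.\ {\tt mpicc -pthread mpithreads\_both.c -o mpithreads\_both}, and launched with a command such as {\tt mpirun -np 2 ./mpithreads\_both}; the exact wrapper and launcher names depend on the MPI implementation. Each task prints its node sum, and task 0 prints the final dot product after the reduction.
\end{document}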