      PROGRAM POLY3
C
      INCLUDE 'mpif.h'
      INCLUDE 'poly3.inc'
C
C     =================================================
C     ===                                           ===
C     ===   Program:  Communication Bottleneck      ===
C     ===   Version:  MPI + Fortran 77              ===
C     ===                                           ===
C     =================================================
C
C     /* Program name */
      CHARACTER*10  NAME /'poly3_mpi'/
C
C     /* Timer routine */
C
      EXTERNAL DWALLTIME00
      DOUBLE PRECISION DWALLTIME00
C
C     Variables:
C
      INTEGER I, ierr, my_rank, status(MPI_STATUS_SIZE)
      CHARACTER*6 GROUP /'poly3 '/
C
C     /* maximum length of arrays */
C
      INTEGER NNMAX
      PARAMETER(NNMAX=10000)
C
      COMMON /SHDATA/  A(NNMAX),B(NNMAX)
      DOUBLE PRECISION A,B
C
      COMMON /COEFFS/  S0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11
      DOUBLE PRECISION S0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11
C
      INTEGER CT,JT,KFSEL,MREF,N,NFLOP,NFLP,NMAX,NP,
     &        NPOLRP,NPOLYS,NSEL,NSLAVE,NTIM,NTIMES
      DOUBLE PRECISION FHALF,FRAT(50),PER,RHAT,RINF,RLAST(50),RMFLPS,
     &                 S12,ST0,STIME,T0,T1,T2,T10,T11,TN,TOTIME,TS,X,
     &                 XN,XN12,XNLAST(50),Y
C
C     /* the unit number of output files */
C
      INTEGER IOUNIT, NW1
      PARAMETER(IOUNIT=10)
      PARAMETER(NW1=11)
C
C     Initialize MPI.
C
      CALL MPI_INIT( ierr )
C     
      IF( ierr.NE.MPI_SUCCESS )
     &    PRINT *,'ERROR: MPI_INIT returned ', ierr
C
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, my_rank, ierr)
      CALL MPI_COMM_SIZE( MPI_COMM_WORLD, NP, ierr)
C
      IF( my_rank.EQ.0 )THEN
C
C        Get parameters from input file
C
         PRINT *
         PRINT *,'      POLY3:  Communication Bottleneck Test'
         PRINT *,'      -------------------------------------'
         PRINT *,'A vector of data is sent from a master process to a'
         PRINT *,'slave process which performs an evaluation of a'
         PRINT *,'polynomial for each element of the vector, and then'
         PRINT *,'returns the vector of results to the master.'
         PRINT *,'The slave may be selected as any of the processes.'
         PRINT *,'      -------------------------------------'
         PRINT *
         OPEN( UNIT=IOUNIT,FILE='poly3.dat',STATUS='OLD' )
         READ( IOUNIT,FMT = * ) 
         PRINT *,'Number of processes =',NP
C
         PRINT *
         PRINT *,'Master process, 0, exchanges with a slave process'
         PRINT *,'with a number in the range 1 to ', NP-1,'.'
         PRINT *,'      -------------------------------------'
         PRINT *
         READ( IOUNIT,FMT = * ) NSLAVE
C
         IF( NP.EQ.2 )THEN
	    NSLAVE = 1
         ELSE
            IF( NSLAVE.LT.1 .OR. NSLAVE.GT.NP-1 )THEN
               PRINT *,'The slave process value', NSLAVE, ' is invalid.'
               PRINT *,'Try again:  1 <= slave number <=', NP-1
               PRINT *
               CALL MPI_FINALIZE(ierr)
               STOP
            ENDIF
         ENDIF
         PRINT *,'Slave number =',NSLAVE
         CLOSE(IOUNIT)
C
C        /* Send all process ids to each process */
C        /* Send NSLAVE to processes */
C
C       /* Broadcast NSLAVE to each process */
C
        CALL MPI_BCAST( NSLAVE, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
C	
      ELSE
C        
C       /* All slaves receive NSLAVE from master */
C
        CALL MPI_BCAST( NSLAVE, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)	

      ENDIF
C
      IF( my_rank.EQ.0 )THEN
         OPEN(NW1,FILE='poly3.res')
C
C       /* print header */
C
        CALL HEADER(NW1,NAME)
C
      ENDIF
C
C     /* Global synchronisation */
C
      CALL MPI_BARRIER( MPI_COMM_WORLD, ierr)
      IF( ierr.NE.MPI_SUCCESS )
     &    PRINT *,'ERROR: MPI_BARRIER returned ', ierr

C
C     /* Initialise variables here */
C
      NTIMES=1000*NITER
      NMAX=NNMAX
C
C     /* Initialise coeffs of polynomial */
C
      S0=3.12
      S1=2.13
      S2=1.65
      S3=8.32
      S4=1.76
      S5=3.09
      S6=1.82
      S7=2.41
      S8=3.69
      S9=4.11
      S10=8.51
      S11=1.66
C
      IF( my_rank.EQ.0 )THEN
C
C        /* INITIALIZE TIMER */
C
         T10 = DWALLTIME00()
C
C        /* INITIALIZE ARRAYS : */
C
         DO 30 I=1,NMAX
            A(I)=5.842*I
            B(I)=3.9675*I
   30    CONTINUE
C
         WRITE(NW1,997)NP
  997    FORMAT(/,
     2  1X,'       -------------------------------',/,
     3  1X,'       POLY3: Communication Bottleneck',/,
     4  1X,'       -------------------------------',/,
     5  1X,'                 Result Summary       ',/,
     6  1X,'                 --------------       ',//,
     7  1X,'       Number of processes = ',I6,//,
     8  1X,'       -------------------------------',/)
C
      ENDIF
C
C     /* Number of polynomials */
C
      NPOLYS=20
      CT = 0
      DO 21 KFSEL=1,NPOLYS,1
C
C       /* Select polynomial order = NFLP*NPOLRP/2 */
C
        IF( KFSEL.EQ.1 )NFLP=2
        IF( KFSEL.EQ.2 )NFLP=4
        IF( KFSEL.EQ.3 )NFLP=6
        IF( KFSEL.EQ.4 )NFLP=8
        IF( KFSEL.EQ.5 )NFLP=10
        IF( KFSEL.EQ.6 )NFLP=12
        IF( KFSEL.EQ.7 )NFLP=14
        IF( KFSEL.EQ.8 )NFLP=16
        IF( KFSEL.EQ.9 )NFLP=18
        IF( KFSEL.GE.10 )NFLP=20
C
C       /* Number of repeats of 10th degree polynomial */
C
        NPOLRP=1
        IF( KFSEL.GE.11 .AND. KFSEL.LE.15 ) NPOLRP=2*(KFSEL-10)
        IF( KFSEL.GE.16 .AND. KFSEL.LE.20 ) NPOLRP=20*(KFSEL-15)
        NFLOP=NPOLRP*NFLP
C
        IF( my_rank.EQ.0 )THEN
           MREF=2
           FRAT(KFSEL)=NFLOP/MREF
C
           WRITE(NW1,345)
  345      FORMAT(1X,//)
C
           WRITE(NW1,344) FRAT(KFSEL)  
  344      FORMAT(1X,'COMPUTATIONAL INTENSITY = flop per mem ref',
     2    F10.3,/,1X,'------------------------------------------')
C
           WRITE(NW1,346) NFLOP,MREF
  346      FORMAT(1X,/,'Floating operations per iteration =',I4,//,
     1              'Memory references   per iteration =',I4)
C
C         /* INITIALIZE LEAST SQUARES */
C
           CALL LSTSQ(0,XN,TN,RINF,XN12,PER)
C
           WRITE(NW1,668)
  668      FORMAT(//,2X,'LOOP',7X,'LOOP',8X,'RINF',8X,
     1     'N1/2',7X,'S1/2',
     1     5X,'ERROR',4X,'R-AVERAGE',6X,'T0',6X,'NTIM',
     2     /,1X,'LENGTH',6X,'SECS',7X,'MFLOP/S',4X,'VEC.LEN',6X,'FLOP',
     3     7X,'%',7X,'MFLOP/S',6X,'SECS',4X,'repeat')
        ENDIF
C
        DO 20 NSEL=1,12,1
C
C         /* SELECT VECTOR LENGTH */
C
          IF( NSEL.EQ.1 )N=10
          IF( NSEL.EQ.2 )N=50
          IF( NSEL.EQ.3 )N=100
          IF( NSEL.EQ.4 )N=200
          IF( NSEL.EQ.5 )N=500
          IF( NSEL.EQ.6 )N=1000
          IF( NSEL.EQ.7 )N=2000
          IF( NSEL.EQ.8 )N=3000
          IF( NSEL.EQ.9 )N=4000
          IF( NSEL.EQ.10 )N=6000
          IF( NSEL.EQ.11 )N=8000
          IF( NSEL.EQ.12 )N=10000
          IF( N.GT.NMAX )GOTO 20
C
C         /* SELECT REPEAT */
C
          NTIM=NTIMES/(NFLOP*N)
          IF (NTIM .LT. 1) NTIM=1
C
          IF( my_rank .EQ. 0 )THEN
C
C            /* Measure overhead */
C
             T1 = DWALLTIME00()
             DO 86 JT=1,NTIM
               CALL DUMMY(JT)
   86        CONTINUE
             T2 = DWALLTIME00()
             T0=T2-T1
C
C            /* TIME REMOTE POLYNOMIAL EVALUATION */
C
             T1 = DWALLTIME00()
C
             DO 16 JT=1,NTIM
               CALL DUMMY(JT)
C
C              /*  SEND DATA TO SLAVE FOR COMPUTATION */
C
               CALL MPI_SEND(B, N, MPI_DOUBLE_PRECISION, NSLAVE,
     &                       NSLAVE, MPI_COMM_WORLD, ierr)

C              /* WAIT FOR RESULTS FROM SLAVE  */
C              /* RECEIVE  RESULTS FROM SLAVE  */
C
              CALL MPI_RECV(A, N, MPI_DOUBLE_PRECISION, NSLAVE,
     &                      MPI_ANY_TAG, MPI_COMM_WORLD, status, ierr)
C
   16        CONTINUE
          T2 = DWALLTIME00()
C
C         Output iteration info to user
C         Also, this statement appears to make this program work;
C         without it, some optimization causes a hang.
C         This statement is done 12 * NPOLYS = 240 times.
C
          CT = CT +1
          PRINT *, 'Count check = ', CT
C
          ELSEIF( my_rank.EQ.NSLAVE )THEN
C
             DO 116 JT=1,NTIM
C
C              /* RECEIVE DATA FROM MASTER */
C
               CALL MPI_RECV(B, N, MPI_DOUBLE_PRECISION, 0, MPI_ANY_TAG,
     &                       MPI_COMM_WORLD, status, ierr)
C
C              /* CALL DOALL TO COMPUTE POLYNOMIAL */
C
               CALL DOALL(N,NFLP,NPOLRP)
C
C              /* SEND RESULTS TO MASTER */
C
               CALL MPI_SEND(A, N, MPI_DOUBLE_PRECISION, 0, 
     &                       0, MPI_COMM_WORLD, ierr)
C
C              /*    SLAVE CODE  */
C
  116        CONTINUE
C
           ENDIF
C
           IF( my_rank.EQ.0 )THEN
C
C             /* Average loop time, TS  */
C
              TS=((T2-T1)-T0)/NTIM
C
C             /* Convert to time per vector operation */
C
              TN=TS/NFLOP
C
C             /* ESCAPE IF VECTOR TOO LONG */
C
              IF( N.EQ.0 ) GOTO 20
C
C             /* UPDATE LEAST SQUARES */
C
              XN=N
              CALL LSTSQ(1,XN,TN,RINF,XN12,PER)
C
C             /* CONVERT TO MFLOP/S */
C
              RINF=RINF*1.0E-6
C
C             /* AVERAGE MFLOP/S */
C
              RMFLPS= N/TN*1.0E-6
              S12   = NFLOP*XN12
              STIME = NFLOP*TN
              ST0   = 0.0
              IF( RINF.NE.0.0 )ST0 = (S12/RINF)*1.0E-06
              IF( NSEL.EQ.1 )ST0 = STIME
C
              WRITE(NW1,100) N,STIME,RINF,XN12,S12,PER		
     2                  ,RMFLPS,ST0,NTIM
  100         FORMAT(1X,I6,1PE12.3,0PF12.6,F12.3,1PE12.3,F7.2,
     1         0PF12.6,1PE12.3,I8)
C
C             /* End master code */
C
           ENDIF
C
C       /* END OF NSEL LOOP */
C
   20   CONTINUE
C
        IF( my_rank.EQ.0 )THEN
           RLAST(KFSEL)=RINF
           XNLAST(KFSEL)=XN12
        ENDIF
C
C     /* END OF KFSEL LOOP */
C
   21 CONTINUE
C
      IF( my_rank.EQ.0 )THEN
C  
C        /* INITIALIZE LEAST SQUARES */
C
         CALL LSTSQ(0,X,Y,RHAT,FHALF,PER)
C
         WRITE(NW1,401)
  401    FORMAT(//,'                     CALCULATION OF FHALF',/,
     1              '                 --------------------',//,
     1    'A straight line is being fitted to y as function of x.',/,
     1    'Inverse slope is RHAT and negative intercept is FHALF.',/,
     1    'If y-values are obviously too noisy, try larger value',/
     1    'of parameter NITER in poly3.inc',//,
     2    2X,' F=x ',7X,' y  ',10X,'RINF',8X,' RHAT  ',2X,'  FHALF',
     2    4X,'ERROR',/,3X,'f/m',6X,'us/mref',7X,'Mflop/s',
     3    6X,'Mflop/s',3X,'flop/mref',4X,'%')
C
         DO 41 KFSEL=1,NPOLYS
           Y=FRAT(KFSEL)/RLAST(KFSEL)
           X=FRAT(KFSEL)
C          /* UPDATE LEAST SQUARES */
           CALL LSTSQ(1,X,Y,RHAT,FHALF,PER)
           WRITE(NW1,400) X,Y,RLAST(KFSEL),RHAT,FHALF,PER        
  400      FORMAT(1X,F6.1,3(1X,0PF12.6),1X,F9.4,1X,F7.2)
   41    CONTINUE
C
         WRITE(NW1,402) RHAT,FHALF
  402    FORMAT(///10X,'LAST VALUES :    RHAT =',0PF12.6,' Mflop/s',//,
     1         10X,'                FHALF =',F12.4,' flop/mref')
C 
         T11 = DWALLTIME00()
C
         TOTIME=T11-T10
         WRITE(NW1,25) TOTIME
   25    FORMAT(//,5X,'TOTAL EXECUTION TIME IS ',1PE20.10,' SECONDS.')
C
         CLOSE(NW1)
C
         PRINT *,'Benchmark completed.'
C 
      ENDIF
C
C     /* Global synchronisation */
C
      CALL MPI_BARRIER( MPI_COMM_WORLD, ierr )
      IF( ierr.NE.MPI_SUCCESS )
     &    PRINT *,'ERROR: MPI_BARRIER returned ', ierr
		 
C
C     Program finished. Leave group and MPI before exiting
C
      CALL MPI_FINALIZE(ierr)
C
      STOP
C
      END
