C#######################################################################
C PSTSWM Version 4.0 (12/1/94)                                         #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
C include precision declaration definitions                            #
#include "precision.i"
C#######################################################################
      SUBROUTINE RINGSUM(COMMOPT, PROTOPT, RINGSIZE,
     &                   ME, LEFT, RIGHT, MYINDEX, BASE1, BASE2,
     &                   LTH, WS, SUM)
C
C This subroutine calls routines that calculate a vector sum over a 
C specified subset of processors using a ring-shift algorithm.
C The results are duplicated across all processors in the subset.
C
C Communication options (COMMOPT) for RINGSUM include:
C  IF (COMMOPT .EQ. 0) simple shift: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered shift: [send/recv]|[recv/send]/sum
C  IF (COMMOPT .EQ. 2) simple shift with recv-ahead
C  IF (COMMOPT .EQ. 3) ordered shift with recv-ahead
C Communication protocol options (PROTOPT) for RINGSUM include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: FLTSUM
C calls: RING1, RING2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER RINGSIZE
C my processor id
      INTEGER ME
C processor ids for neighbors in logical ring
      INTEGER LEFT, RIGHT
C index of "me" in logical ring
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH)
C
C     Input/Output
C
C on entry, contains (local) data, on exit contains vector sum
      REAL SUM(LTH)
C
C---- Executable Statements --------------------------------------------
C
      IF (RINGSIZE .GT. 1) THEN
C       Calculate vector sum.
C
        IF (COMMOPT .LE. 1) THEN
C         no recv-ahead algorithms
          CALL RING1(COMMOPT, PROTOPT, RINGSIZE, ME, LEFT,
     &               RIGHT, MYINDEX, BASE1, BASE2, LTH, WS, SUM)
C
        ELSEIF (COMMOPT .LE. 3) THEN
C         recv-ahead algorithms
          CALL RING2(COMMOPT-2, PROTOPT, RINGSIZE, ME, LEFT,
     &               RIGHT, MYINDEX, BASE1, BASE2, LTH, WS, SUM)
C
        ELSE
C         illegal communication option specified
          WRITE(0,100) ME, COMMOPT
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE RINGSUM ',/,
     &            ' ILLEGAL COMMUNICATION OPTION SPECIFIED',/,
     &            ' PROCID = ',I4,' COMMOPT = ',I4)
          STOP
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE RING1(COMMOPT, PROTOPT, RINGSIZE,
     &                 ME, LEFT, RIGHT, MYINDEX, BASE1, BASE2,
     &                 LTH, WS, SUM)
C
C This subroutine calculates a vector sum over a specified subset
C of processors using a ring-shift algorithm without recv-ahead.
C The results are duplicated across all processors in the subset.
C
C Communication options (COMMOPT) for RINGSUM include:
C  IF (COMMOPT .EQ. 0) simple shift: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered shift: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for RINGSUM include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: RINGSUM
C calls: RINGINIT, SENDRECV
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER RINGSIZE
C my processor id
      INTEGER ME
C processor ids for neighbors in logical ring
      INTEGER LEFT, RIGHT
C index of "me" in logical ring
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH)
C
C     Input/Output
C
C On entry, SUM contains (local) data. On exit, SUM contains the vector
C sum.  
      REAL SUM(LTH)
C
C---- Local Variables --------------------------------------------------
C
C order of send/recv in ordered sendrecv algorithm
      INTEGER ORDER
C arrays indicating the size of the message being sent/received 
C during a given sendrecv, and the corresponding index offset
      INTEGER SIZE(NPROCSX)
      INTEGER OFFSET(NPROCSX)
C work space used to calculate size and index
      INTEGER TEMP(NPROCSX)
C temporaries for current message sizes and offsets
      INTEGER SNDLTH, RCVLTH, SNDDEX, RCVDEX
C loop indices
      INTEGER I, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Precalculate message order, sizes, and offsets.
      CALL RINGINIT(LTH, ALIGN, RINGSIZE, MYINDEX, TEMP, ORDER, SIZE, 
     &              OFFSET)
C
C     Calculate vector sum (leaving result distributed).
      DO STEP=RINGSIZE-1,1,-1
C
        SNDLTH = SIZE(STEP+1)
        SNDDEX = OFFSET(STEP+1)
        RCVLTH = SIZE(STEP)
        RCVDEX = OFFSET(STEP)
C
C       Shift vector segment.
        CALL SENDRECV(COMMOPT, PROTOPT, ORDER, ME, 
     &                STEP+BASE1, RIGHT, RBYTES*SNDLTH, SUM(SNDDEX), 
     &                LEFT, RBYTES*RCVLTH, WS(RCVDEX))
C
C       Update vector segment.
        DO I = RCVDEX, RCVDEX+(RCVLTH-1)
          SUM(I) = SUM(I) + WS(I)
        ENDDO
C
      ENDDO
C
C     Broadcast distributed vector by reversing ring-shift algorithm. 
      DO STEP=1,RINGSIZE-1
C
        RCVLTH = SIZE(STEP+1)
        RCVDEX = OFFSET(STEP+1)
        SNDLTH = SIZE(STEP)
        SNDDEX = OFFSET(STEP)
C
C       Shift vector segment.
        CALL SENDRECV(COMMOPT, PROTOPT, ORDER, ME, 
     &                STEP+BASE2, LEFT, RBYTES*SNDLTH, SUM(SNDDEX), 
     &                RIGHT, RBYTES*RCVLTH, SUM(RCVDEX))
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE RING2(COMMOPT, PROTOPT, RINGSIZE,
     &                 ME, LEFT, RIGHT, MYINDEX, BASE1, BASE2,
     &                 LTH, WS, SUM)
C
C This subroutine calculates a vector sum over a specified subset
C of processors using a ring-shift algorithm with recv-ahead.
C The results are duplicated across all processors in the subset.
C
C Communication options (COMMOPT) for RINGSUM include:
C  IF (COMMOPT .EQ. 0) simple shift: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered shift: [send/recv]|[recv/send]/sum 
C Communication protocol options (PROTOPT) for RINGSUM include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: RINGSUM
C calls: RINGINIT, SR1, SR2, SR3
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
#     include "params.i"
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C number of processors in subset
      INTEGER RINGSIZE
C my processor id
      INTEGER ME
C processor ids for neighbors in logical ring
      INTEGER LEFT, RIGHT
C index of "me" in logical ring
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C length of vectors to be summed
      INTEGER LTH
C
C     Work Space
C
C message buffers
      REAL WS(LTH)
C
C     Input/output
C
C On entry, SUM contains (local) data. On exit, SUM contains the vector
C sum. 
      REAL SUM(LTH)
C
C---- Local Variables --------------------------------------------------
C
C order of send/recv in ordered sendrecv algorithm
      INTEGER ORDER
C arrays indicating the size of the message being sent/received 
C during a given sendrecv, and the corresponding index offset
      INTEGER SIZE(NPROCSX)
      INTEGER OFFSET(NPROCSX)
C work space used to calculate size and index
      INTEGER TEMP(NPROCSX)
C temporaries for current message sizes and offsets
      INTEGER SNDLTH, RCVLTH, SNDDEX, RCVDEX
C loop indices
      INTEGER I, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Precalculate message order, sizes, and offsets.
      CALL RINGINIT(LTH, ALIGN, RINGSIZE, MYINDEX, TEMP, ORDER, SIZE, 
     &              OFFSET)
C
C     Post all receive requests for vector sum algorithm
C     (disabling safe forcetype protocol - only needs to be used
C     once).
      DO I=RINGSIZE-1,1,-1
        RCVLTH = SIZE(I)
        RCVDEX = OFFSET(I)
        CALL SR1(PROTOPT, .FALSE., ME, I+BASE1, LEFT, 
     &           RBYTES*RCVLTH, WS(RCVDEX))
      ENDDO
C
C     Notify source processor that first set of receive requests
C     have been posted. This trick preserves the order of 
C     SENDRECV requests, which can be more efficient on some machines.
      CALL SRSAFE(PROTOPT, ME, RINGSIZE-1+BASE1, LEFT)
C
C     Post receive requests for broadcast algorithm
C     (since know that requests will be posted before messages
C     sent, can disable safe forcetype protocol).
      DO I=1,RINGSIZE-1
        RCVLTH = SIZE(I+1)
        RCVDEX = OFFSET(I+1)
        CALL SR1(PROTOPT, .FALSE., ME, I+BASE2, RIGHT, 
     &           RBYTES*RCVLTH, SUM(RCVDEX))
      ENDDO
C
C     Calculate vector sum using ring-shift algorithm (leaving result 
C     distributed):
C     1) (STEP .EQ. RINGSIZE-1) case: use safe forcetype protocol.
      SNDLTH = SIZE(RINGSIZE)
      SNDDEX = OFFSET(RINGSIZE)
      RCVLTH = SIZE(RINGSIZE-1)
      RCVDEX = OFFSET(RINGSIZE-1)
C
      CALL SR2(COMMOPT, PROTOPT, .TRUE., ORDER, ME, 
     &         RINGSIZE-1+BASE1, RIGHT, RBYTES*SNDLTH, SUM(SNDDEX), 
     &         LEFT, RBYTES*RCVLTH, WS(RCVDEX))
C
C     update vector segment
      DO I = RCVDEX, RCVDEX+(RCVLTH-1)
        SUM(I) = SUM(I) + WS(I)
      ENDDO
C
C     2) (STEP .LT. RINGSIZE-1) case: no safe forcetype protocol.
      DO STEP=RINGSIZE-2,1,-1
C
        SNDLTH = SIZE(STEP+1)
        SNDDEX = OFFSET(STEP+1)
        RCVLTH = SIZE(STEP)
        RCVDEX = OFFSET(STEP)
C
        CALL SR2(COMMOPT, PROTOPT, .FALSE., ORDER, ME, 
     &           STEP+BASE1, RIGHT, RBYTES*SNDLTH, SUM(SNDDEX),
     &           LEFT, RBYTES*RCVLTH, WS(RCVDEX))
C
C       Update vector segment.
        DO I = RCVDEX, RCVDEX+(RCVLTH-1)
          SUM(I) = SUM(I) + WS(I)
        ENDDO
C
      ENDDO
C
C     Broadcast distributed vector by reversing ring-shift algorithm.
      DO STEP=1,RINGSIZE-1
C
        RCVLTH = SIZE(STEP+1)
        RCVDEX = OFFSET(STEP+1)
        SNDLTH = SIZE(STEP)
        SNDDEX = OFFSET(STEP)
C
C       Next, shift vector segment back the other way.
        CALL SR2(COMMOPT, PROTOPT, .FALSE., ORDER, ME, 
     &           STEP+BASE2, LEFT, RBYTES*SNDLTH, SUM(SNDDEX), 
     &           RIGHT, RBYTES*RCVLTH, SUM(RCVDEX))
C
      ENDDO
C
C     Clean up outstanding sends. Note that send buffers are never in
C     danger of being overwritten. During the summation stage, 
C     send buffer space is never reused. The broadcast stage does
C     not begin until all summation sends have completed (since the 
C     broadcast data comes from the destination of the summation sends).
      DO STEP=RINGSIZE-1,1,-1
        RCVLTH = SIZE(STEP)
        RCVDEX = OFFSET(STEP)
        CALL SR3(COMMOPT, PROTOPT, ME, STEP+BASE1, RIGHT,
     &           LEFT, RBYTES*RCVLTH, WS(RCVDEX))
      ENDDO
      DO STEP=1,RINGSIZE-1
        RCVLTH = SIZE(STEP+1)
        RCVDEX = OFFSET(STEP+1)
        CALL SR3(COMMOPT, PROTOPT, ME, STEP+BASE2, LEFT,
     &           RIGHT, RBYTES*RCVLTH, SUM(RCVDEX))
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE RINGINIT(LTH, ALIGN, RINGSIZE, MYINDEX, WS, ORDER, 
     &                    SIZE, OFFSET) 
C
C This routine calculates message sizes and offsets needed by the
C ring-shift vector sum and broadcast algorithms. 
C
C called by: RING1, RING2
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C length of vectors to be summed
      INTEGER LTH
C alignment requirement for send and receive buffers (in number of 
C reals)
      INTEGER ALIGN
C number of processors in subset
      INTEGER RINGSIZE
C index of "me" in MAP array
      INTEGER MYINDEX
C
C     Work Space
C
C work space used to rotate arrays
      INTEGER WS(RINGSIZE)
C
C     Output
C
C order of send/recv in ordered sendrecv algorithm
      INTEGER ORDER
C array indicating the size of the message being sent/received during a
C given sendrecv
      INTEGER SIZE(RINGSIZE)
C array indicating the index for the beginning of the message being 
C sent/received during a given sendrecv
      INTEGER OFFSET(RINGSIZE)
C
C---- Local Variables --------------------------------------------------
C
C loop indices
      INTEGER I
C temporaries for determining message lengths
      INTEGER TMPSIZE, SWITCH
C
C---- Executable Statements -------------------------------------------
C
C     Determine order for ordered algorithms.
      ORDER = MOD(MYINDEX,2)
C
C     Determine (aligned) segment sizes.
      TMPSIZE = (LTH/ALIGN)/RINGSIZE
      SWITCH  = MOD(LTH/ALIGN,RINGSIZE)
      DO I=SWITCH+1,RINGSIZE
        SIZE(I) = ALIGN*TMPSIZE
      ENDDO
      TMPSIZE = TMPSIZE + 1
      DO I=1,SWITCH
        SIZE(I) = ALIGN*TMPSIZE
      ENDDO
      SIZE(RINGSIZE) = SIZE(RINGSIZE) + MOD(LTH,ALIGN)
C
C     Determine segment offsets.
      OFFSET(1) = 1
      DO I=2,RINGSIZE
        OFFSET(I) = OFFSET(I-1) + SIZE(I-1)
      ENDDO
C
C     Rotate size array to reflect location in logical ring.
      DO I=1,RINGSIZE
        WS(I) = SIZE(I)
      ENDDO
      DO I=MYINDEX+1,RINGSIZE
        SIZE(I-MYINDEX) = WS(I) 
      ENDDO
      DO I=1,MYINDEX
        SIZE(I+(RINGSIZE-MYINDEX)) = WS(I) 
      ENDDO
C
C     Rotate index array to reflect location in logical ring.
      DO I=1,RINGSIZE
        WS(I) = OFFSET(I)
      ENDDO
      DO I=MYINDEX+1,RINGSIZE
        OFFSET(I-MYINDEX) = WS(I) 
      ENDDO
      DO I=1,MYINDEX
        OFFSET(I+(RINGSIZE-MYINDEX)) = WS(I) 
      ENDDO
C      
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
