C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
      SUBROUTINE EXCHSUM(COMMOPT, BUFFERS, PROTOPT, FORCETYPE, MAPSIZE,
     &                   MAP, MYINDEX, BASE, LTH, LTJUMP, NVEC, WS, SUM)
C
C This subroutine calls routines that calculate a vector sum over a 
C specified subset of processors using a bidirectional exchange 
C algorithm, generalized to handle nonpowers of two numbers of 
C processors. The MAP array defines the subset and the processor 
C ordering to use. The results are duplicated across all processors in 
C the subset.
C
C Communication options (COMMOPT) for EXCHSUM include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for EXCHSUM include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5) .AND. (BUFFERS .GT. 1)
C    recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: FLTSUM
C calls: EXCH1, EXCH2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C length of vectors to be summed
      INTEGER LTH
C declared vector length
      INTEGER LTJUMP
C number of vectors being calculated for
      INTEGER NVEC
C
C     Work Space
C
C message buffers
      REAL WS(LTJUMP,NVEC,BUFFERS)
C
C     Input/Output
C
C on entry, contains (local) data, on exit contains vector sum
      REAL SUM(LTJUMP,NVEC)
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Calculate vector sum.
C
        IF (BUFFERS .EQ. 1) THEN
C         single buffer algorithms
          CALL EXCH1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE,
     &               MAP, MYINDEX, BASE, LTH, LTJUMP, NVEC, WS, SUM)
C
        ELSEIF (BUFFERS .GE. 2) THEN
C         multiple buffer algorithms
          CALL EXCH2(COMMOPT, BUFFERS/2, PROTOPT, FORCETYPE, MAPSIZE,
     &               MAP, MYINDEX, BASE, LTH, LTJUMP, NVEC, WS, SUM)
C
        ELSE
C         illegal number of buffers specified
          WRITE(0,100) MAP(MYINDEX), BUFFERS
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE EXCHSUM ',/,
     &            ' ILLEGAL COMMUNICATION BUFFER SIZE SPECIFIED',/,
     &            ' PROCID = ',I4,' BUFFERS = ',I4)
          STOP
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE EXCH1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                 MYINDEX, BASE, LTH, LTJUMP, NVEC, WS, SUM)
C
C This subroutine calculates a vector sum over a specified subset
C of processors when one communication buffer is provided, using
C a bidirectional exchange algorithm generalized to handle nonpowers of 
C two numbers of processors. The MAP array defines the subset and the
C processor ordering to use. The results are duplicated across all 
C processors in the subset.
C
C Communication options (COMMOPT) for EXCH1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for EXCH1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: EXCHSUM
C calls: EXCHINIT, SWAP, SWAP_RECV, SWAP_SEND
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C length of vectors to be summed
      INTEGER LTH
C declared vector length
      INTEGER LTJUMP
C number of vectors being calculated for
      INTEGER NVEC
C
C     Work Space
C
C message buffers
      REAL WS(LTJUMP,NVEC)
C
C     Input/Output
C
C on entry, contains (local) data, on exit contains vector sum
      REAL SUM(LTJUMP,NVEC)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in bidirectional exchange algorithm
      INTEGER MAXSTEP
C size of message being swapped
      INTEGER MSGLTH
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C loop indices
      INTEGER I, J, STEP
C
C---- Executable Statements --------------------------------------------
C
C     Identify who i am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in bytes).
      MSGLTH = RBYTES*LTJUMP*NVEC
C
C     Precalculate swap partners and other information needed by vector
C     sum algorithm.
      CALL EXCHINIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, UPPER, TWIN,
     &              MAXSTEP, ORDER, SWAPNODE) 
C
      IF (UPPER) THEN
C       In upper half cube, so send data to TWIN and wait for results.
C       - ordered send/recv, with upper half cube processor going 
C         first
C       - safe to use same buffer for send and recv because using
C         ordered swap and going first
C
        CALL SWAP(1, PROTOPT, FORCETYPE, 1, ME, BASE, TWIN, MSGLTH, SUM,
     &            MSGLTH, SUM)
C
      ELSE
C       In lower half cube, so use usual power of two algorithm.
C
        IF (TWIN .NE. -1) THEN
C         Get data from twin in upper half cube using recv part of 
C         ordered swap.
          CALL SWAP_RECV(1, PROTOPT, FORCETYPE, -1, ME, BASE, TWIN, 
     &                   MSGLTH, WS)
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              SUM(I,J) = SUM(I,J) + WS(I,J)
            ENDDO
          ENDDO
        ENDIF
C
C       Calculate vector sum in lower half cube using exchange algorithm.
        DO STEP=1,MAXSTEP
C
C         Exchange vectors.
          CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, ORDER(STEP), ME, BASE,
     &              SWAPNODE(STEP), MSGLTH, SUM, MSGLTH, WS)
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              SUM(I,J) = SUM(I,J) + WS(I,J)
            ENDDO
          ENDDO
C
        ENDDO
C
        IF (TWIN .NE. -1) THEN
C
C         Send results to twin in upper half cube using send part of 
C         ordered swap.
          CALL SWAP_SEND(1, PROTOPT, FORCETYPE, -1, ME, BASE, TWIN,
     &                   MSGLTH, SUM, MSGLTH, SUM)
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE EXCH2(COMMOPT, BUFFERS, PROTOPT, FORCETYPE, MAPSIZE, 
     &                 MAP, MYINDEX, BASE, LTH, LTJUMP, NVEC, WS, SUM)
C
C This subroutine calculates a vector sum over a specified subset
C of processors when more than one communication buffers is provided, 
C using a bidirectional exchange algorithm generalized to handle 
C nonpowers of two number of processors. The MAP array defines the 
C subset and the processor ordering to use. The results are duplicated 
C across all processors in the subset.
C
C Communication options (COMMOPT) for EXCH2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for EXCH2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: EXCHSUM
C calls: EXCHINIT, SWAP, SWAP1, SWAP2, SWAP3, SWAP_RECVBEGIN, 
C        SWAP_RECVEND, SWAP_SEND
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers 
C (to use in nonblocking send and recv algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C length of vectors to be summed
      INTEGER LTH
C declared vector length
      INTEGER LTJUMP
C number of vectors being calculated for
      INTEGER NVEC
C
C     Work Space
C
C message buffers
C (messages sent from (*,*,1,*), messages received into (*,*,2,*))
      REAL WS(LTJUMP,NVEC,2,BUFFERS)
C
C     Input/Output
C
C On entry, contains (local) data, on exit contains vector sum.
      REAL SUM(LTJUMP,NVEC)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in bidirectional exchange algorithm
      INTEGER MAXSTEP
C size of message being swapped
      INTEGER MSGLTH
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C loop indices
      INTEGER I, J, STEP, ISSTART
C buffer pointers
      INTEGER CURBUF, MAXBUF, PREVBUF
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in bytes).
      MSGLTH = RBYTES*LTJUMP*NVEC
C
C     Precalculate swap partners and other information needed by vector
C     sum algorithm. 
      CALL EXCHINIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, UPPER, TWIN,
     &              MAXSTEP, ORDER, SWAPNODE) 
C
      IF (UPPER) THEN
C       In upper half cube, so send data to twin and wait for results.
C       - ordered send/recv, with upper half cube processor going 
C         first
C       - safe to use same buffer for send and recv because using
C         ordered swap and going first
C
        CALL SWAP(1, PROTOPT, FORCETYPE, 1, ME, BASE, TWIN, MSGLTH, SUM,
     &            MSGLTH, SUM)
C
      ELSE
C       In lower half cube, so use usual power of two algorithm.
C
C       Calculate number of recv-ahead buffers to use.
        MAXBUF = MIN(MAXSTEP, BUFFERS)
C
        IF (TWIN .NE. -1) THEN
C         Prepare for data from twin in upper half cube using recv part
C         of ordered swap.
          CALL SWAP_RECVBEGIN(1, PROTOPT, FORCETYPE, -1, ME, 
     &                        BASE, TWIN, MSGLTH, WS(1,1,1,MAXBUF))
        ENDIF
C
C       Post MAXBUF receive requests.
        DO I=1,MAXBUF
          CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(I), ME,
     &               BASE, SWAPNODE(I), MSGLTH, WS(1,1,2,I)) 
        ENDDO
C
C       Calculate global sum using generalized exchange algorithm.
        IF (TWIN .NE. -1) THEN
C
C         Get data from twin in upper half cube using recvend part of 
C         ordered swap. Using "send" buffer to receive TWIN data so as
C         not to impact recv-ahead requests. This works because the 
C         receive is completed before the send buffer is needed.
          CALL SWAP_RECVEND(1, PROTOPT, FORCETYPE, -1, ME, BASE, TWIN, 
     &                      MSGLTH, WS(1,1,1,MAXBUF))
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              SUM(I,J) = SUM(I,J) + WS(I,J,1,MAXBUF)
            ENDDO
          ENDDO
C
        ENDIF
C
        IF (MAXSTEP .GT. 1) THEN
C
C         case 1) (STEP .EQ. 1)
C         Initialize buffer pointers.
          PREVBUF = MAXBUF
          CURBUF  = 1
C
C         Receive message and initiate corresponding send.
          CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(1), ME,
     &               BASE, SWAPNODE(1), MSGLTH, SUM, MSGLTH,
     &               WS(1,1,2,CURBUF)) 
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              WS(I,J,1,CURBUF) = WS(I,J,2,CURBUF) + SUM(I,J)
            ENDDO
          ENDDO
C
C         case 2) (STEP = 2,...,MAXSTEP-1)
          DO STEP=2,MAXSTEP-1
C
C           Update buffer pointers.
            PREVBUF = CURBUF
            CURBUF  = MOD(STEP-1, MAXBUF) + 1
C
C           Post next recv-ahead receive request.
            IF (MAXBUF-1+STEP .LE. MAXSTEP) THEN
              CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE., 
     &                   ORDER(MAXBUF-1+STEP), ME, BASE,
     &                   SWAPNODE(MAXBUF-1+STEP), 
     &                   MSGLTH, WS(1,1,2,PREVBUF))
            ENDIF
C
C           Receive message and initiate corresponding send.
            CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(STEP),
     &                 ME, BASE, SWAPNODE(STEP), MSGLTH,  
     &                 WS(1,1,1,PREVBUF), MSGLTH, WS(1,1,2,CURBUF))
C
C           Wait until send of WS(1,1,1,CURBUF) is complete before 
C           overwriting contents.
            IF (STEP .GT. MAXBUF) THEN
              CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE,
     &                   SWAPNODE(STEP-MAXBUF))
            ENDIF
C
C           Sum vectors.
            DO J = 1,NVEC
              DO I = 1,LTH
                WS(I,J,1,CURBUF) = WS(I,J,2,CURBUF) + WS(I,J,1,PREVBUF)
              ENDDO
            ENDDO
C
          ENDDO
C
C         case 3) (STEP .EQ. MAXSTEP)
C         Update buffer pointers.
          PREVBUF = CURBUF
          CURBUF  = MOD(MAXSTEP-1, MAXBUF) + 1
C
C         Post one final recv-ahead receive request, if needed.
          IF (MAXBUF .EQ. 1) THEN
            CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE., 
     &                 ORDER(MAXSTEP), ME, BASE, SWAPNODE(MAXSTEP), 
     &                 MSGLTH, WS(1,1,2,PREVBUF))
          ENDIF
C
C         Receive message and initiate corresponding send.
          CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., 
     &               ORDER(MAXSTEP), ME, BASE, SWAPNODE(MAXSTEP), 
     &               MSGLTH, WS(1,1,1,PREVBUF), MSGLTH, 
     &               WS(1,1,2,CURBUF))
C
C         Make sure that send of SUM is complete before overwriting 
C         contents.
          IF ((MAXSTEP .EQ. MAXBUF) .OR. (MAXSTEP .EQ. 2)) THEN
            CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE,
     &                 SWAPNODE(1))
            ISSTART = 2
          ELSE
            ISSTART = MAXSTEP - MAXBUF
          ENDIF
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              SUM(I,J) = WS(I,J,2,CURBUF) + WS(I,J,1,PREVBUF)
            ENDDO
          ENDDO
C
C         Wait until outstanding send operations are complete.
          DO I=ISSTART,MAXSTEP
            CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE, 
     &                 SWAPNODE(I))
          ENDDO
C
        ELSE
C         (MAXSTEP .EQ. 1) case
C
C         Receive message and initiate corresponding send.
          CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., 
     &               ORDER(1), ME, BASE, SWAPNODE(1), 
     &               MSGLTH, SUM, MSGLTH, WS(1,1,2,1))
C
C         Wait until send of SUM is complete before overwriting contents.
          CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE, SWAPNODE(1))
C
C         Sum vectors.
          DO J = 1,NVEC
            DO I = 1,LTH
              SUM(I,J) = WS(I,J,2,1) + SUM(I,J)
            ENDDO
          ENDDO
C
        ENDIF
C
        IF (TWIN .NE. -1) THEN
C
C         Send results to twin in upper half cube using send part of 
C         ordered swap.
          CALL SWAP_SEND(1, PROTOPT, FORCETYPE, -1, ME, BASE, TWIN,
     &                   MSGLTH, SUM, MSGLTH, SUM)
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE EXCHINIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, UPPER, TWIN,
     &                    MAXSWAP, ORDER, SWAPNODE)
C
C This routine calculates swap partners and other information needed
C by the bidirectional exchange vector sum algorithms.
C
C called by: EXCH1, EXCH2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C declared length of the output arrays, 
C at least as long as LOG2(MAPSIZE)
      INTEGER LGPROCSX
C
C     Output
C
C flag indicating whether "me" is in the logical (partial) upper
C half cube of a non-power of two hypercube
      LOGICAL UPPER
C If MAPSIZE is not a power of two, then TWIN is the (true) processor 
C id for the neighboring processor in the other half cube of 
C the partial logical hypercube.
      INTEGER TWIN
C number of swaps in bidirectional exchange algorithm
      INTEGER MAXSWAP
C array indicating whether this processor sends or receives first
C during a swap at a given step (for synchronous communication)
      INTEGER ORDER(LGPROCSX)
C array indicating the destination of the message being sent during
C a given swap
      INTEGER SWAPNODE(LGPROCSX)
C
C---- Local Variables --------------------------------------------------
C
C temporary for determining swap information
      INTEGER I
C variables indicating sizes of upper partial half cube
C and lower complete half cube
      INTEGER EXTRA, HALF
C MYINDEX for permuted ordering when MAPSIZE not a power of two
      INTEGER LOCALDEX
C temporary for determining message destination
      INTEGER DEST
C
C---- Executable Statements -------------------------------------------
C                                                                              
C     Check for existence of a partial upper half cube.
      I = 1
      DO WHILE (I .LT. MAPSIZE)
        I = I*2
      ENDDO
C
      IF (MAPSIZE .EQ. I) THEN
        HALF = I
        EXTRA = 0
      ELSE
        HALF = I/2
        EXTRA = MAPSIZE - HALF
      ENDIF
C
C     Calculate swap partners and other associated information.
C     (For nonpowers of two, logical partial upper half cube processors
C      mapped to first "extra" even processors in logical ring. This
C      improves performance on a mesh architecture. Performance is worse
C      on a hypercube, but using a nonpower of two number of processors
C      on a hypercube is bogus anyway. Mapping courtesy of S. Seidel.) 
      UPPER = .FALSE.
      IF (EXTRA .GT. 0) THEN
C
C       First, identify TWIN and new MYINDEX with permuted mapping.
        IF (MYINDEX .LT. 2*EXTRA) THEN
          IF (MOD(MYINDEX, 2) .EQ. 1) THEN
            TWIN = MAP(MYINDEX-1)
            LOCALDEX = MYINDEX/2
          ELSE
            TWIN = MAP(MYINDEX+1)
            UPPER = .TRUE.
          ENDIF
        ELSE
          LOCALDEX = MYINDEX - EXTRA
          TWIN = -1
        ENDIF
C
      ELSE
C
C       Power of two, so no TWIN and no permutation.
        TWIN = -1
        LOCALDEX = MYINDEX
C
      ENDIF
C
C     Next, calculate swap partners. Swap order for (COMMOPT .EQ. 1) 
C     protocol chosen in such a way as to minimize collisions on a 
C     bidirectional grid.
      IF (.NOT. UPPER) THEN
C
        MAXSWAP = 0
        I = 1
C
        DO WHILE (I .LT. HALF)
C
          MAXSWAP = MAXSWAP + 1
C
          IF (MOD(LOCALDEX, 2*I) .LT. I) THEN
            DEST = LOCALDEX + I
            IF (MOD(LOCALDEX, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = 1
            ELSE
              ORDER(MAXSWAP) = -1
            ENDIF
          ELSE
            DEST = LOCALDEX - I
            IF (MOD(DEST, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = -1
            ELSE
              ORDER(MAXSWAP) = 1
            ENDIF
          ENDIF
C
          IF (DEST .LT. EXTRA) THEN
            SWAPNODE(MAXSWAP) = MAP(2*DEST+1)           
          ELSE
            SWAPNODE(MAXSWAP) = MAP(DEST+EXTRA)           
          ENDIF
C
          I = 2*I
C
        ENDDO
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
