C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
      SUBROUTINE SWAPTRANS(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                     MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                     MAX, A, WS, B) 
C
C This subroutine validates COMMOPT and calls the routine (SWPTRNS1 or
C SWPTRNS2) that computes B = transpose(A) using an O(P) "swap"
C transpose algorithm, where each step consists of swapping
C information between processors. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = (M/P or M/P+1) rows of A; 
C                   excess rows are allocated to lower-numbered nodes.
C  B(W,NL,H1,H2,M): Each processor has NL = (N/P or N/P+1) rows of B; 
C                   excess rows are allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MAX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MAX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MAX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWAPTRANS include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C  IF (COMMOPT .EQ. 2) simple swap with recv-ahead
C  IF (COMMOPT .EQ. 3) ordered swap with recv-ahead
C Any other value (including a negative one) is a fatal error: a
C message is written to unit 0 and the program stops.
C Communication protocol options (PROTOPT) for SWAPTRANS include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C When MAPSIZE is 1 or less this routine returns immediately and B is
C not written (COMMOPT is not checked either).
C NOTE(review): presumably the caller handles the single-processor
C case itself -- confirm against TRANSPOSE.
C
C called by: TRANSPOSE
C calls: SWPTRNS1, SWPTRNS2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option (0..3, see above)
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C local component of the array that is to be transposed
      REAL A(W,ML,H1,H2,N)
C
C     Work Space
C
C message buffers
      REAL WS(W,NL,H1,H2,M)
C
C     Output
C
C local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
C B is never indexed here; it is only passed through to SWPTRNS1 or
C SWPTRNS2.
      REAL B(1)
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Compute transpose.
C
        IF ((COMMOPT .LT. 0) .OR. (COMMOPT .GT. 3)) THEN
C         illegal communication option specified. The range is
C         checked first so that a negative COMMOPT cannot fall
C         through to the "COMMOPT .LE. 1" case below.
          WRITE(0,100) MAP(MYINDEX), COMMOPT
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE SWAPTRANS ',/,
     &            ' ILLEGAL COMMUNICATION OPTION SPECIFIED',/,
     &            ' PROCID = ',I4,' COMMOPT = ',I4)
          STOP
C
        ELSEIF (COMMOPT .LE. 1) THEN
C         no recv-ahead algorithms (COMMOPT is 0 or 1)
          CALL SWPTRNS1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                  MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                  MAX, A, WS, B) 
C
        ELSE
C         recv-ahead algorithms (COMMOPT is 2 or 3); SWPTRNS2 numbers
C         its options 0 and 1, hence the shift by 2.
          CALL SWPTRNS2(COMMOPT-2, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                  MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                  MAX, A, WS, B) 
C
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                    MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                    MAX, A, WS, B) 
C
C Swap transpose WITHOUT recv-ahead: computes B = transpose(A) with an
C O(P) algorithm in which every step is a pairwise exchange between
C this processor and one partner, followed immediately by a local
C reordering (TRANS) of the piece just received. The piece that stays
C on this processor is moved from A to B last, with no communication.
C
C Data layout:
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors. Locally,
C  A(W,ML,H1,H2,N): ML = (M/P or M/P+1) rows of A,
C  B(W,NL,H1,H2,M): NL = (N/P or N/P+1) rows of B,
C with excess rows allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C DIR selects the organization of B required by the caller of
C TRANSPOSE:
C  DIR=-1: B(W,MAX,NL,H1,H2)  (after real forward transpose)
C  DIR=+1: B(W,MAX,M,H1,H2)   (after real backward transpose)
C  DIR=-2: B(W,MAX,H1,M,H2)   (after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWPTRNS1:
C  COMMOPT .EQ. 0  simple swap: send/recv/sum
C  COMMOPT .EQ. 1  ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for SWPTRNS1:
C  PROTOPT .EQ. 1, 3, .OR. 5     nonblocking send
C  PROTOPT .EQ. 2, 3, 4, .OR. 5  nonblocking receive
C  PROTOPT .EQ. 4 .OR. 5         forcetype
C  PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1  synchronous
C
C called by: SWAPTRANS
C calls: SWPTRNS_INIT, SWAP, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
C supplies NPROCSX (bound for the schedule arrays) and RBYTES
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm, communication protocol, and forcetype
C message type offset
      INTEGER COMMOPT, PROTOPT, FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required organization
C of B (one of -3,-2,-1,+1,+2,+3; see above)
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C Local component of the array that is to be transposed, viewed as
C N columns of W*ML*H1*H2 reals (logically REAL (W,ML,H1,H2,N)).
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C Message buffer, viewed as M columns of W*NL*H1*H2 reals
C (logically REAL (W,NL,H1,H2,M)). Every receive in this routine
C lands at the start of WS.
      REAL WS(W*NL*H1*H2,M)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER MYPROC
C bytes in one column of A (unit of sending) and in one column of WS
C (unit of receiving)
      INTEGER COLBYTES, ROWBYTES
C byte counts of the outgoing and incoming message of the current swap
      INTEGER SNDBYTES, RCVBYTES
C first column of A to send, and number of columns to send, per step
C (entry 0 of SNDSTART refers to the locally retained piece)
      INTEGER SNDSTART(0:NPROCSX-1), SNDCOUNT(NPROCSX-1)
C starting index handed to TRANS, and number of rows received, per
C step (entry 0 of RCVSTART refers to the locally retained piece)
      INTEGER RCVSTART(0:NPROCSX-1), RCVCOUNT(NPROCSX-1)
C swap partner (true processor id) and send/receive ordering flag,
C per step
      INTEGER PARTNER(NPROCSX-1), SWPORDER(NPROCSX-1)
C swap step counter
      INTEGER ISTEP
C
C---- Executable Statements --------------------------------------------
C
C     Build the swap schedule: partners, message extents, ordering.
      CALL SWPTRNS_INIT(M, N, MAPSIZE, MAP, MYINDEX, SNDSTART,
     &                  SNDCOUNT, RCVSTART, RCVCOUNT, PARTNER,
     &                  SWPORDER)
C
C     Who am I, and how many bytes are in one sent column of A and in
C     one received column of WS?
      MYPROC   = MAP(MYINDEX)
      COLBYTES = RBYTES*W*ML*H1*H2
      ROWBYTES = RBYTES*W*NL*H1*H2
C
C     One exchange per partner; each received piece is reordered into
C     B before the next exchange reuses WS.
      DO ISTEP = 1, MAPSIZE-1
C
        SNDBYTES = SNDCOUNT(ISTEP)*COLBYTES
        RCVBYTES = RCVCOUNT(ISTEP)*ROWBYTES
        CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, SWPORDER(ISTEP),
     &            MYPROC, BASE, PARTNER(ISTEP), SNDBYTES,
     &            A(1,SNDSTART(ISTEP)), RCVBYTES, WS)
C
        CALL TRANS(DIR, W, M, H1, H2, RCVCOUNT(ISTEP), NL, MAX,
     &             RCVSTART(ISTEP), WS, B)
C
      END DO
C
C     The piece that never left this processor goes straight from A
C     to B.
      CALL TRANS(DIR, W, M, H1, H2, ML, NL, MAX, RCVSTART(0),
     &           A(1,SNDSTART(0)), B)
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS2(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                    MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                    MAX, A, WS, B) 
C
C Swap transpose WITH recv-ahead: computes B = transpose(A) with an
C O(P) algorithm in which every step is a pairwise exchange between
C this processor and one partner. The routine runs in four phases:
C  1. SWAP1 is called once per step to post the receive requests,
C     each targeting its own column range of WS;
C  2. SWAP2 is called once per step to receive a piece and initiate
C     the corresponding send, after which TRANS reorders the received
C     piece into B;
C  3. SWAP3 is called once per step to wait for outstanding sends;
C  4. the piece that stays on this processor is moved from A to B.
C
C Data layout:
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors. Locally,
C  A(W,ML,H1,H2,N): ML = (M/P or M/P+1) rows of A,
C  B(W,NL,H1,H2,M): NL = (N/P or N/P+1) rows of B,
C with excess rows allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C DIR selects the organization of B required by the caller of
C TRANSPOSE:
C  DIR=-1: B(W,MAX,NL,H1,H2)  (after real forward transpose)
C  DIR=+1: B(W,MAX,M,H1,H2)   (after real backward transpose)
C  DIR=-2: B(W,MAX,H1,M,H2)   (after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C Communication options (COMMOPT) for SWPTRNS2 (SWAPTRANS passes its
C own COMMOPT minus 2):
C  COMMOPT .EQ. 0  simple swap: send/recv/sum
C  COMMOPT .EQ. 1  ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for SWPTRNS2:
C  PROTOPT .EQ. 1, 3, .OR. 5     nonblocking send
C  PROTOPT .EQ. 2, 3, 4, .OR. 5  nonblocking receive and recv-ahead
C  PROTOPT .EQ. 4 .OR. 5         forcetype
C  PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1  synchronous
C
C called by: SWAPTRANS
C calls: SWPTRNS_INIT, SWAP1, SWAP2, SWAP3, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
C supplies NPROCSX (bound for the schedule arrays) and RBYTES
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm, communication protocol, and forcetype
C message type offset
      INTEGER COMMOPT, PROTOPT, FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required organization
C of B (one of -3,-2,-1,+1,+2,+3; see above)
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C Local component of the array that is to be transposed, viewed as
C N columns of W*ML*H1*H2 reals (logically REAL (W,ML,H1,H2,N)).
      REAL A(W*ML*H1*H2,N)
C
C     Work Space
C
C Message buffers, viewed as M columns of W*NL*H1*H2 reals
C (logically REAL (W,NL,H1,H2,M)). The piece received at a given step
C is stored starting at the column given by that step's receive index.
      REAL WS(W*NL*H1*H2,M)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER MYPROC
C bytes in one column of A (unit of sending) and in one column of WS
C (unit of receiving)
      INTEGER COLBYTES, ROWBYTES
C byte counts of the outgoing and incoming message of the current swap
      INTEGER SNDBYTES, RCVBYTES
C first column of A to send, and number of columns to send, per step
C (entry 0 of SNDSTART refers to the locally retained piece)
      INTEGER SNDSTART(0:NPROCSX-1), SNDCOUNT(NPROCSX-1)
C first column of WS to receive into (also handed to TRANS), and
C number of rows received, per step (entry 0 of RCVSTART refers to
C the locally retained piece)
      INTEGER RCVSTART(0:NPROCSX-1), RCVCOUNT(NPROCSX-1)
C swap partner (true processor id) and send/receive ordering flag,
C per step
      INTEGER PARTNER(NPROCSX-1), SWPORDER(NPROCSX-1)
C swap step counter
      INTEGER ISTEP
C
C---- Executable Statements --------------------------------------------
C
C     Build the swap schedule: partners, message extents, ordering.
      CALL SWPTRNS_INIT(M, N, MAPSIZE, MAP, MYINDEX, SNDSTART,
     &                  SNDCOUNT, RCVSTART, RCVCOUNT, PARTNER,
     &                  SWPORDER)
C
C     Who am I, and how many bytes are in one sent column of A and in
C     one received column of WS?
      MYPROC   = MAP(MYINDEX)
      COLBYTES = RBYTES*W*ML*H1*H2
      ROWBYTES = RBYTES*W*NL*H1*H2
C
C     Phase 1: post every receive request up front.
      DO ISTEP = 1, MAPSIZE-1
        RCVBYTES = RCVCOUNT(ISTEP)*ROWBYTES
        CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE.,
     &             SWPORDER(ISTEP), MYPROC, BASE, PARTNER(ISTEP),
     &             RCVBYTES, WS(1,RCVSTART(ISTEP)))
      END DO
C
C     Phase 2: per partner, receive its piece and initiate the
C     matching send, then reorder the received piece into B.
      DO ISTEP = 1, MAPSIZE-1
C
        SNDBYTES = SNDCOUNT(ISTEP)*COLBYTES
        RCVBYTES = RCVCOUNT(ISTEP)*ROWBYTES
        CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE.,
     &             SWPORDER(ISTEP), MYPROC, BASE, PARTNER(ISTEP),
     &             SNDBYTES, A(1,SNDSTART(ISTEP)), RCVBYTES,
     &             WS(1,RCVSTART(ISTEP)))
C
        CALL TRANS(DIR, W, M, H1, H2, RCVCOUNT(ISTEP), NL, MAX,
     &             RCVSTART(ISTEP), WS(1,RCVSTART(ISTEP)), B)
C
      END DO
C
C     Phase 3: wait until outstanding send operations are complete.
      DO ISTEP = 1, MAPSIZE-1
        CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, MYPROC, BASE,
     &             PARTNER(ISTEP))
      END DO
C
C     Phase 4: the piece that never left this processor goes straight
C     from A to B.
      CALL TRANS(DIR, W, M, H1, H2, ML, NL, MAX, RCVSTART(0),
     &           A(1,SNDSTART(0)), B)
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SWPTRNS_INIT(M, N, MAPSIZE, MAP, MYINDEX, SENDDEX,
     &                        SENDCOLS, RECVDEX, RECVROWS, SWAPNODE, 
     &                        ORDER)
C
C This routine calculates swap partners and other information needed
C by the O(P) "swap" transpose algorithm. 
C
C Schedule: for I = 1, ..., MAXPOW2-1, where MAXPOW2 is the smallest
C power of two not smaller than MAPSIZE, the candidate partner index
C is IEOR(MYINDEX,I). Candidates that are not less than MAPSIZE are
C skipped; each remaining candidate becomes the next recorded step, so
C MAPSIZE-1 steps are recorded when 0 <= MYINDEX < MAPSIZE.
C
C Partitioning: an extent of size N is split over MAPSIZE indices so
C that index K owns N/MAPSIZE items, plus one more if
C K < MOD(N,MAPSIZE); its first item is
C   1 + K*(N/MAPSIZE) + MIN(K,MOD(N,MAPSIZE)).
C The same rule is applied to M.
C
C Ordering: ORDER is +1 or -1. For a given pair of partners the two
C processors are assigned opposite values. (Which sign means "send
C first" is decided by the swap routines that consume ORDER, not
C here.) The rule looks at the highest bit in which the two indices
C differ, i.e. bit IPOW2/2 where IPOW2 is the smallest power of two
C greater than I: the processor with that bit clear uses the parity
C of its own index, the other uses the inverse parity of its
C partner's index.
C
C called by: SWPTRNS1, SWPTRNS2
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C size of last indices of input and output arrays in transpose
      INTEGER N, M
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in map array
      INTEGER MYINDEX
C
C     Output
C
C arrays indicating beginning index and size of message being sent 
C during a given swap (element 0 of SENDDEX describes the local data)
      INTEGER SENDDEX(0:MAPSIZE-1), SENDCOLS(MAPSIZE-1)
C arrays indicating beginning index and size of message being 
C received during a given swap (element 0 of RECVDEX describes the
C local data)
      INTEGER RECVDEX(0:MAPSIZE-1), RECVROWS(MAPSIZE-1)
C arrays indicating swap partner (an entry of MAP) and whether this
C processor sends or receives first during a swap at a given step
C (for synchronous communication)
      INTEGER SWAPNODE(MAPSIZE-1), ORDER(MAPSIZE-1)
C
C---- Local Variables --------------------------------------------------
C
C loop index and bound
      INTEGER I, MAXPOW2
C swap step and partner indices
      INTEGER INEXT, ISWAP
C smallest power of two greater than I
      INTEGER IPOW2
C quotient and remainder of N and of M divided by MAPSIZE
      INTEGER NQUOT, NREM, MQUOT, MREM
C
C---- Intrinsic Functions ----------------------------------------------
C
C IEOR is the standard bitwise exclusive OR; it replaces the
C non-standard XOR function used previously.
      INTRINSIC IEOR, MIN, MOD
C
C---- Executable Statements -------------------------------------------
C
C     Calculate smallest power of two not smaller than MAPSIZE.
      MAXPOW2 = 1
      DO WHILE (MAXPOW2 .LT. MAPSIZE)
        MAXPOW2 = 2*MAXPOW2
      ENDDO
C
C     Loop-invariant pieces of the partitioning rule.
      NQUOT = N/MAPSIZE
      NREM  = MOD(N,MAPSIZE)
      MQUOT = M/MAPSIZE
      MREM  = MOD(M,MAPSIZE)
C
C     Calculate indices for local data.
      SENDDEX(0) = 1 + MYINDEX*NQUOT + MIN(MYINDEX,NREM)
      RECVDEX(0) = 1 + MYINDEX*MQUOT + MIN(MYINDEX,MREM)
C
      INEXT = 0
      IPOW2  = 1
      DO I=1,MAXPOW2-1
C
C       Keep IPOW2 equal to the smallest power of two greater than I.
C       This must happen on EVERY iteration, including those whose
C       candidate partner is rejected below; otherwise IPOW2 lags
C       behind I when MAPSIZE is not a power of two and both members
C       of a pair can end up with the same ORDER.
        IF (I .GE. IPOW2) IPOW2 = 2*IPOW2
C
C       Identify potential swap partner index.
        ISWAP = IEOR(MYINDEX,I)
C
C       If a legal swap partner, record it.
        IF (ISWAP .LT. MAPSIZE) THEN
          INEXT = INEXT+1
C
C         Compute source and destination indices and sizes for messages.
          SENDDEX(INEXT) = 1 + ISWAP*NQUOT + MIN(ISWAP,NREM)
          RECVDEX(INEXT) = 1 + ISWAP*MQUOT + MIN(ISWAP,MREM)
C
          SENDCOLS(INEXT) = NQUOT
          IF (ISWAP .LT. NREM) SENDCOLS(INEXT) = SENDCOLS(INEXT) + 1
C
          RECVROWS(INEXT) = MQUOT
          IF (ISWAP .LT. MREM) RECVROWS(INEXT) = RECVROWS(INEXT) + 1
C
C         Save swap partner id.
          SWAPNODE(INEXT) = MAP(ISWAP)
C
C         Calculate swap order, using an order that minimizes collisions
C         on a bidirectional grid.
          IF (MOD(MYINDEX, IPOW2) .LT. IPOW2/2) THEN
C           Bit IPOW2/2 of MYINDEX is clear: use my own parity.
            IF (MOD(MYINDEX, 2) .EQ. 0) THEN
              ORDER(INEXT) = 1
            ELSE
              ORDER(INEXT) = -1
            ENDIF
          ELSE
C           Bit IPOW2/2 of MYINDEX is set: use the opposite of what
C           the partner derives from its own parity.
            IF (MOD(ISWAP, 2) .EQ. 0) THEN
              ORDER(INEXT) = -1
            ELSE
              ORDER(INEXT) = 1
            ENDIF
          ENDIF
C
        ENDIF
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

