C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
      SUBROUTINE LOGTRANS(COMMOPT, BUFFERS, PROTOPT, FORCETYPE, MAPSIZE,
     &                    MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, ML,
     &                    NL, MAX, A, WS, B)
C
C This subroutine calls routines that compute B = transpose(A) using
C an O(log P) transpose algorithm. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = (M/P or M/P+1) rows of A; 
C                   excess rows are allocated to lower-numbered nodes.
C  B(W,NL,H1,H2,M): Each processor has NL = (N/P or N/P+1) rows of B; 
C                   excess rows are allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MAX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MAX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MAX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LOGTRANS requires that MAPSIZE be a power of two, and that M and N be
C integer multiples of MAPSIZE.
C
C Communication options (COMMOPT) for LOGTRANS include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LOGTRANS include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5) .AND. (BUFFERS .GT. 1)
C    recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: TRANSPOSE
C calls: LGTRNS1, LGTRNS2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C local component of the array that is to be transposed
      REAL A(W,ML,H1,H2,N)
C
C     Work Space
C
C message buffers
      REAL WS(W,NL,H1,H2,M,BUFFERS)
C
C     Output
C
C local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C temporary for checking legality of arguments
      INTEGER I
C
C---- Executable Statements --------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       Compute transpose.
C
C       Check that P is a power or two.
        I = 1
        DO WHILE (I .LT. MAPSIZE)
          I = I*2
        ENDDO
C
        IF (MAPSIZE .NE. I) THEN
          WRITE(0,101) MAP(MYINDEX), MAPSIZE
  101     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE TRANSPOSE ',/,
     &            ' O(log P) TRANSPOSE ALGORITHM REQUIRES THAT THE',
     &            ' NUMBER OF PROCESSORS BE A POWER OF TWO',/,
     &            ' PROCID = ',I4,' P = ',I4)
          STOP
        ENDIF
C
C       Check that M and N are multiples of P.
        IF ((MOD(M, MAPSIZE) .NE. 0) .OR. (MOD(N, MAPSIZE) .NE. 0)) THEN
          WRITE(0,102) MAP(MYINDEX), M, N, MAPSIZE
  102     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE TRANSPOSE ',/,
     &            ' O(log P) TRANSPOSE ALGORITHM REQUIRES THAT THE',
     &            ' ARRAY DIMENSIONSS M AND N BE MULTIPLES OF P',/,
     &            ' PROCID = ',I4,' M = ',I4,' N = ',I4,' P = ',I4)
          STOP
        ENDIF
C
C       Choose transpose algorithm.
        IF (BUFFERS .EQ. 1) THEN
C         double buffer algorithms
          CALL LGTRNS1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                 MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL, MAX,
     &                 A, WS, B) 
C
        ELSEIF (BUFFERS .GE. 2) THEN
C         multiple buffer algorithms
          CALL LGTRNS2(COMMOPT, BUFFERS, PROTOPT, FORCETYPE, MAPSIZE,
     &                 MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                 MAX, A, WS, B) 
C
        ELSE
C         illegal number of buffers specified
          WRITE(0,100) MAP(MYINDEX), BUFFERS
  100     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE LOGTRANS ',/,
     &            ' ILLEGAL COMMUNICATION BUFFER SIZE SPECIFIED',/,
     &            ' PROCID = ',I4,' BUFFERS = ',I4)
          STOP
        ENDIF
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                   MYINDEX, BASE, DIR, W, M, N, H1, H2, ML, NL,
     &                   MAX, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and one communication buffer. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = (M/P or M/P+1) rows of A; 
C                   excess rows are allocated to lower-numbered nodes.
C  B(W,NL,H1,H2,M): Each processor has NL = (N/P or N/P+1) rows of B; 
C                   excess rows are allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MAX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MAX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MAX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS1 requires that MAPSIZE be a power of two, and that M and N be 
C integer multiples of MAPSIZE.
C
C Communication options (COMMOPT) for LGTRNS1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRNS1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: LGTRNS_INIT, SWAP, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C Local component of the array that is to be transposed.
C (organized as REAL (W,ML,H1,H2,N))
      REAL A(W*ML*H1*H2*N)
C
C     Work Space
C
C message buffers
      REAL WS((W*ML*H1*H2*N)/2,2)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C size (in reals) of message being swapped
      INTEGER HALFSIZE
C number of swaps in O(log P) transpose algorithm
      INTEGER MAXSTEP
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX), BUFFSIZE(LGPROCSX), DIRN(0: LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C indices indicating which message buffer is used for sending the
C message in a swap, and which buffer is used for receiving the message
      INTEGER SNDBUF, RCVBUF
C loop indices
      INTEGER I, J, STEP
C array indices
      INTEGER ITO, IFROM
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals).
      HALFSIZE = W*ML*H1*H2*N/2
C
C     Precalculate swap partners and other information needed by 
C     transpose algorithm.
      CALL LGTRNS_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, HALFSIZE, 
     &                 MAXSTEP, SWAPNODE, ORDER, BLOCKS, BUFFSIZE, DIRN)
C
C     Construct transpose using logarithmic exchange algorithm:
C     case 1) (STEP .EQ. 1)
C
C     Set initial roles of message buffers.
      SNDBUF = 1
      RCVBUF = 2
C
C     Send half of A and receive a message into WS(*,RCVBUF).
      IFROM = (1-DIRN(1))*BUFFSIZE(1)
      CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, ORDER(1), ME, BASE, 
     &          SWAPNODE(1), RBYTES*HALFSIZE, A(IFROM+1),
     &          RBYTES*HALFSIZE, WS(1,RCVBUF))
C
C     Copy half of WS(*,2) into A, for use in subsequent steps.
      IF (MAXSTEP .GT. 1) THEN
        IFROM = DIRN(2)*BUFFSIZE(2)
        ITO   = (1-DIRN(1))*BUFFSIZE(1) + DIRN(2)*BUFFSIZE(2)
        DO I = 1,BUFFSIZE(2)
          A(ITO+I) = WS(IFROM+I,RCVBUF)
        ENDDO
      ELSE
C       For simplicity, copy all of message to A on last time through.
        ITO = (1-DIRN(1))*BUFFSIZE(1)
        DO I = 1,BUFFSIZE(1)
          A(ITO+I) = WS(I,RCVBUF)
        ENDDO
      ENDIF
C
C     case 2) (STEP = 2,...,MAXSTEP)
      DO STEP=2,MAXSTEP
C
C       Prepare a message of size HALFSIZE in WS(*,1). In step "STEP",
C       the message is constructed from 2**(STEP-1) pieces, 1/2 from A
C       and 1/2 from previous incoming message.
        IF (DIRN(STEP) .EQ. DIRN(STEP-1)) THEN
C         Construct new message in previous receive message buffer.
C
C         Switch roles of message buffers.
          SNDBUF = RCVBUF
          RCVBUF  = MOD(SNDBUF, 2) + 1
C
          DO J = 1,BLOCKS(STEP)
            IF (MOD(J, 2) .EQ. (1-DIRN(STEP-1))) THEN
              ITO   = (J-1)*BUFFSIZE(STEP)
              IFROM = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
              DO I = 1,BUFFSIZE(STEP)
                WS(ITO+I,SNDBUF) = A(IFROM+I)
              ENDDO
            ENDIF
          ENDDO
C
        ELSE
C         Construct new message in previous send message buffer.
C
          DO J = 1,BLOCKS(STEP)
            ITO = (J-1)*BUFFSIZE(STEP)
            IF (MOD(J, 2) .EQ. (1-DIRN(STEP-1))) THEN
              IFROM = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
              DO I = 1,BUFFSIZE(STEP)
                WS(ITO+I,SNDBUF) = A(IFROM+I)
              ENDDO
            ELSE
              IFROM = (((J-1)/2)*2+1-DIRN(STEP))*BUFFSIZE(STEP)
              DO I = 1,BUFFSIZE(STEP)
                WS(ITO+I,SNDBUF) = WS(IFROM+I,RCVBUF)
              ENDDO
            ENDIF 
          ENDDO
C
        ENDIF
C
C       Send newly assembled message and receive message into
C       WS(*,RCVBUF). 
        CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, ORDER(STEP), ME, BASE, 
     &            SWAPNODE(STEP), RBYTES*HALFSIZE, WS(1,SNDBUF),
     &            RBYTES*HALFSIZE, WS(1,RCVBUF))
C
C       Copy half of WS(*,RCVBUF) into A, for use in subsequent steps.
        IF (STEP .LT. MAXSTEP) THEN
          DO J = 1,BLOCKS(STEP)
            IFROM = (J*2-2+DIRN(STEP+1))*BUFFSIZE(STEP+1)
            ITO   = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP) 
     &            + DIRN(STEP+1)*BUFFSIZE(STEP+1)
            DO I = 1,BUFFSIZE(STEP+1)
              A(ITO+I) = WS(IFROM+I,RCVBUF)
            ENDDO
          ENDDO
        ELSE
C         For simplicity, copy all of message to A on last time through.
          DO J = 1,BLOCKS(STEP)
            IFROM = (J-1)*BUFFSIZE(STEP)
            ITO   = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
            DO I = 1,BUFFSIZE(STEP)
              A(ITO+I) = WS(IFROM+I,RCVBUF)
            ENDDO
          ENDDO
        ENDIF
C
      ENDDO
C
C     Finally, transpose each of P components from A to B.
      DO I = 0,MAPSIZE-1
        CALL TRANS(DIR, W, M, H1, H2, ML, NL, MAX, ML*I+1,
     &             A(I*BUFFSIZE(MAXSTEP)+1), B)
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS2(COMMOPT, BUFFERS, PROTOPT, FORCETYPE, MAPSIZE,
     &                   MAP, MYINDEX, BASE, DIR, W, M, N, H1, H2, ML,
     &                   NL, MAX, A, WS, B)
C
C This routine computes B = transpose(A) using an O(log P) transpose 
C algorithm and more than one communication buffer. Here
C  A is a matrix of size (W,M,H1,H2,N) distributed by rows and
C  B is a matrix of size (W,N,H1,H2,M) distributed by rows
C over MAPSIZE processors, and each processor has part of A and B as 
C follows:
C  A(W,ML,H1,H2,N): Each processor has ML = (M/P or M/P+1) rows of A; 
C                   excess rows are allocated to lower-numbered nodes.
C  B(W,NL,H1,H2,M): Each processor has NL = (N/P or N/P+1) rows of B; 
C                   excess rows are allocated to lower-numbered nodes.
C W is 1 or 2, depending on whether the arrays are REAL or COMPLEX.
C
C Alternative reorganizations of B are also supported, and are 
C determined by the parameter DIR. DIR specifies where TRANSPOSE is 
C called from, allowing the routine to order B as required for 
C subsequent stages in PSTSWM: 
C  DIR=-1: B(W,MAX,NL,H1,H2)  (Used after real forward transpose.)
C  DIR=+1: B(W,MAX,M,H1,H2)   (Used after real backward transpose.)
C  DIR=-2: B(W,MAX,H1,M,H2)   (Used after complex forward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=+2: B(W,MAX,H1,NL,H2)  (Used after complex backward transpose
C                             in transpose FFT/transpose LT algorithm)
C  DIR=-3: B(W,MAX,NL,M,H2)   (Used after complex forward transpose
C                             in distributed FFT/transpose LT algorithm)
C  DIR=+3: B(W,MAX,M,NL,H2)   (Used after complex backward transpose
C                             in distributed FFT/transpose LT algorithm)
C The MAP array defines the processor subset and ordering to use.
C
C LGTRNS2 requires that MAPSIZE be a power of two, and that
C M and N be integer multiples of MAPSIZE.
C
C Communication options (COMMOPT) for LGTRANS2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for LGTRANS2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5) nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  
C    nonblocking receive and recv-ahead
C  IF (PROTOPT .EQ. 4 .OR. 5) forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: LOGTRANS
C calls: LGTRNS_INIT, SWAP1, SWAP2, SWAP3, TRANS
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C number of communication buffers (to use in recv-ahead algorithms)
      INTEGER BUFFERS
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C context in which transpose occurs, and hence required data organization
      INTEGER DIR
C number of reals in datatype (1: REAL, 2: COMPLEX)
      INTEGER W
C dimensions of input and output arrays
      INTEGER M, N, H1, H2, ML, NL, MAX
C Local component of the array that is to be transposed.
C (organized as REAL (W,ML,H1,H2,N))
      REAL A(W*ML*H1*H2*N)
C
C     Work Space
C
C message buffers
C (messages sent from (*,1,*), messages received into (*,2,*))
      REAL WS((W*ML*H1*H2*N)/2,2,BUFFERS)
C
C     Output
C
C Local component of the transposed array.
C (organized as REAL (W,MAX,NL,H1,H2), (W,MAX,M,H1,H2), (W,MAX,H1,M,H2),
C  (W,MAX,H1,NL,H2), (W,MAX,NL,M,H2), or (W,MAX,M,NL,H2)) 
      REAL B(1)
C
C---- Local Variables --------------------------------------------------
C
C true processor id for "me"
      INTEGER ME
C size (in reals and in bytes) of message being swapped
      INTEGER HALFSIZE, MSGLTH
C number of swaps in O(log P) transpose algorithm
      INTEGER MAXSTEP
C number of buffers to use in recv-ahead algorithm
      INTEGER MAXBUF
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX), BUFFSIZE(LGPROCSX), DIRN(0: LGPROCSX)
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C loop indices
      INTEGER I, J, STEP
C array indices
      INTEGER ITO, IFROM
C buffer pointers
      INTEGER CURBUF, PREVBUF
C
C---- Executable Statements --------------------------------------------
C
C     Identify who I am.
      ME = MAP(MYINDEX)
C
C     Calculate length of vector being exchanged (in reals and in bytes). 
      HALFSIZE = W*ML*H1*H2*N/2
      MSGLTH   = RBYTES*HALFSIZE
C
C     Precalculate swap partners and other information needed by
C     transpose algorithm.
      CALL LGTRNS_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, HALFSIZE, 
     &                 MAXSTEP, SWAPNODE, ORDER, BLOCKS, BUFFSIZE, DIRN)
C
C     Calculate number of recv-ahead buffers to use.
      MAXBUF = MIN(MAXSTEP, BUFFERS)
C
C     Post MAXBUF receive requests.
      DO I=1,MAXBUF
        CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(I), ME,
     &             BASE, SWAPNODE(I), MSGLTH, WS(1,2,I)) 
      ENDDO
C
C     Construct transpose using logarithmic exchange algorithm.
      IF (MAXSTEP .GT. 1) THEN
C
C       case 1) (STEP .EQ. 1)
C       Initialize buffer pointers.
        PREVBUF = MAXBUF
        CURBUF  = 1
C
C       Prepare a message of size HALFSIZE in WS(*,1,CURBUF) using A. 
        IFROM = (1-DIRN(1))*BUFFSIZE(1)
        DO I = 1,BUFFSIZE(1)
          WS(I,1,CURBUF) = A(IFROM+I)
        ENDDO
C
C       Send newly assembled message and receive message into
C       WS(*,2,CURBUF).  
        CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(1), ME,
     &             BASE, SWAPNODE(1), MSGLTH, WS(1,1,CURBUF), MSGLTH,
     &             WS(1,2,CURBUF)) 
C
C       Copy half of WS(*,2,CURBUF) into A, for use in subsequent steps.
        IFROM = DIRN(2)*BUFFSIZE(2)
        ITO   = (1-DIRN(1))*BUFFSIZE(1) + DIRN(2)*BUFFSIZE(2)
        DO I = 1,BUFFSIZE(2)
          A(ITO+I) = WS(IFROM+I,2,CURBUF)
        ENDDO
C
C       case 2) (STEP = 2,...,MAXSTEP-1)
        DO STEP=2,MAXSTEP-1
C
C         Update buffer pointers.
          PREVBUF = CURBUF
          CURBUF  = MOD(STEP-1, MAXBUF) + 1
C
C         Wait until send of WS(1,1,CURBUF) is complete before
C         overwriting contents. 
          IF (STEP .GT. MAXBUF) THEN
            CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE,
     &                 SWAPNODE(STEP-MAXBUF)) 
          ENDIF
C
C         Prepare a message of size HALFSIZE in WS(*,1,CURBUF). The
C         message is constructed from 2**(STEP-1) pieces, 1/2 from A
C         and 1/2 from previous incoming message. 
          DO J = 1,BLOCKS(STEP)
            ITO = (J-1)*BUFFSIZE(STEP)
            IF (MOD(J, 2) .EQ. (1-DIRN(STEP-1))) THEN
              IFROM = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP)
              DO I = 1,BUFFSIZE(STEP)
                WS(ITO+I,1,CURBUF) = A(IFROM+I)
              ENDDO
            ELSE
              IFROM = (((J-1)/2)*2+1-DIRN(STEP))*BUFFSIZE(STEP)
              DO I = 1,BUFFSIZE(STEP)
                WS(ITO+I,1,CURBUF) = WS(IFROM+I,2,PREVBUF)
              ENDDO
            ENDIF 
          ENDDO
C
C         Post next recv-ahead receive request.
          IF (MAXBUF-1+STEP .LE. MAXSTEP) THEN
            CALL SWAP1(COMMOPT, PROTOPT, FORCETYPE, .TRUE.,
     &                 ORDER(MAXBUF-1+STEP), ME, BASE,
     &                 SWAPNODE(MAXBUF-1+STEP), MSGLTH,
     &                 WS(1,2,PREVBUF)) 
          ENDIF
C
C         Send newly assembled message and receive message into
C         WS(*,2,CURBUF).  
          CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(STEP),
     &               ME, BASE, SWAPNODE(STEP), MSGLTH, WS(1,1,CURBUF),
     &               MSGLTH, WS(1,2,CURBUF))  
C
C         Copy half of WS(*,2,CURBUF) into A, for use in subsequent
C         steps. 
          DO J = 1,BLOCKS(STEP)
            IFROM = (J*2-2+DIRN(STEP+1))*BUFFSIZE(STEP+1)
            ITO   = (2*J-1-DIRN(STEP))*BUFFSIZE(STEP) 
     &            + DIRN(STEP+1)*BUFFSIZE(STEP+1)
            DO I = 1,BUFFSIZE(STEP+1)
              A(ITO+I) = WS(IFROM+I,2,CURBUF)
            ENDDO
          ENDDO
C
        ENDDO
C
C       case 3) (STEP .EQ. MAXSTEP)
C       Update buffer pointers.
        PREVBUF = CURBUF
        CURBUF  = MOD(MAXSTEP-1, MAXBUF) + 1
C
C       Wait until send of WS(1,1,CURBUF) is complete before overwriting
C       contents.  
        IF (MAXSTEP .GT. MAXBUF) THEN
          CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE,
     &               SWAPNODE(MAXSTEP-MAXBUF))  
        ENDIF
C
C       Prepare a message of size HALFSIZE in WS(*,1,CURBUF). The
C       message is constructed from 2**(MAXSTEP-1) pieces, 1/2 from A 
C       and 1/2 from previous incoming message. 
        DO J = 1,BLOCKS(MAXSTEP)
          ITO = (J-1)*BUFFSIZE(MAXSTEP)
          IF (MOD(J, 2) .EQ. (1-DIRN(MAXSTEP-1))) THEN
            IFROM = (2*J-1-DIRN(MAXSTEP))*BUFFSIZE(MAXSTEP)
            DO I = 1,BUFFSIZE(MAXSTEP)
              WS(ITO+I,1,CURBUF) = A(IFROM+I)
            ENDDO
          ELSE
            IFROM = (((J-1)/2)*2+1-DIRN(MAXSTEP))*BUFFSIZE(MAXSTEP)
            DO I = 1,BUFFSIZE(MAXSTEP)
              WS(ITO+I,1,CURBUF) = WS(IFROM+I,2,PREVBUF)
            ENDDO
          ENDIF 
        ENDDO
C
C       Send newly assembled message and receive message into
C       WS(*,2,CURBUF).   
        CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(MAXSTEP),
     &             ME, BASE, SWAPNODE(MAXSTEP), MSGLTH, WS(1,1,CURBUF),
     &             MSGLTH, WS(1,2,CURBUF))  
C
C       For simplicity, copy all of message to A this last time
        DO J = 1,BLOCKS(MAXSTEP)
          IFROM = (J-1)*BUFFSIZE(MAXSTEP)
          ITO   = (2*J-1-DIRN(MAXSTEP))*BUFFSIZE(MAXSTEP)
          DO I = 1,BUFFSIZE(MAXSTEP)
            A(ITO+I) = WS(IFROM+I,2,CURBUF)
          ENDDO
        ENDDO
C
C       Wait until outstanding send operations are complete.
        DO I=(MAXSTEP+1-MAXBUF),MAXSTEP
          CALL SWAP3(COMMOPT, PROTOPT, FORCETYPE, ME, BASE, SWAPNODE(I))
        ENDDO
C
      ELSE
C       (MAXSTEP .EQ. 1) case
C
C       Prepare a message of size HALFSIZE in WS(*,1,1) using A. 
        IFROM = (1-DIRN(1))*BUFFSIZE(1)
        DO I = 1,BUFFSIZE(1)
          WS(I,1,1) = A(IFROM+I)
        ENDDO
C
C       Send newly assembled message and receive message into A
        ITO = (1-DIRN(1))*BUFFSIZE(1)
        CALL SWAP2(COMMOPT, PROTOPT, FORCETYPE, .TRUE., ORDER(1), ME,
     &             BASE, SWAPNODE(1), MSGLTH, WS(1,1,1), MSGLTH,
     &             A(ITO+1)) 
C
      ENDIF
C
C     Finally, transpose each of P components from A to B.
      DO I = 0,MAPSIZE-1
        CALL TRANS(DIR, W, M, H1, H2, ML, NL, MAX, ML*I+1,
     &             A(I*BUFFSIZE(MAXSTEP)+1), B)
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE LGTRNS_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MSGSIZE,
     &                       MAXSWAP, SWAPNODE, ORDER, BLOCKS, BUFFSIZE, 
     &                       DIRN)
C
C This routine calculates swap partners and other information needed
C by the O(log P) exchange transpose algorithms.
C
C called by: LGTRNS1, LGTRNS2
C calls:
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C declared length of the output arrays, at least as long as 
C LOG2(MAPSIZE)
      INTEGER LGPROCSX
C number of reals in a swap message
      INTEGER MSGSIZE
C
C     Output
C
C number of swaps in transpose algorithm
      INTEGER MAXSWAP
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(LGPROCSX)
      INTEGER ORDER(LGPROCSX)
C information on number, size, and location of pieces used to construct
C messages for a given swap
      INTEGER BLOCKS(LGPROCSX)
      INTEGER BUFFSIZE(LGPROCSX)
      INTEGER DIRN(0:LGPROCSX)
C
C---- Local Variables --------------------------------------------------
C
C distance between MYINDEX and swap partner index
      INTEGER DISTANCE
C index of swap partner
      INTEGER DEST
C loop index
      INTEGER I
C
C---- Executable Statements -------------------------------------------
C
      MAXSWAP = 0                                        
      DISTANCE = MAPSIZE/2
      DIRN(0) = 0
      DO WHILE (DISTANCE .GT. 0)
C
C       Increment step index.
        MAXSWAP = MAXSWAP+1
C
C       Calculate swap partners, swap order, and buffer pointers. The
C       order is chosen in such a way as to minimize collisions on a
C       bidirectional grid. 
        IF (MOD(MYINDEX, 2*DISTANCE) .LT. DISTANCE) THEN
          DIRN(MAXSWAP) = 0
          DEST = MYINDEX + DISTANCE
          IF (MOD(MYINDEX, 2) .EQ. 0) THEN
            ORDER(MAXSWAP) = 1
          ELSE
            ORDER(MAXSWAP) = -1
          ENDIF
        ELSE
          DIRN(MAXSWAP) = 1
          DEST = MYINDEX - DISTANCE
          IF (MOD(DEST, 2) .EQ. 0) THEN
            ORDER(MAXSWAP) = -1
          ELSE
            ORDER(MAXSWAP) = 1
          ENDIF
        ENDIF
        SWAPNODE(MAXSWAP) = MAP(DEST)
C
C       Update distance.
        DISTANCE = DISTANCE/2
C
      ENDDO
C
C     Calculate segment sizes and number of segments making up a given 
C     message.
      BUFFSIZE(1) = MSGSIZE
      BLOCKS(1) = 1
      DO I=2,MAXSWAP
        BUFFSIZE(I) = BUFFSIZE(I-1)/2
        BLOCKS(I)   = 2*BLOCKS(I-1)
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
