C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C frfft.f                                                              C
C                                                                      C
C The following routines calculate the forward real Fourier transform  C
C of a block of distributed vectors, or are utility routines used in   C
C the forward transform.                                               C
C                                                                      C
C FRFFT1 - calculates forward real Fourier transforms of a block of    C
C          distributed vectors.                                        C
C FRFFT2 - calculates forward real Fourier transforms of two blocks of C
C          distributed vectors, overlapping the butterfly calculation  C
C          phase for one block with the communication phase for the    C
C          other block.                                                C 
C BUTUPD - calculates a "forward" power-of-two butterfly update for    C
C          each vector in a block, representing a factor-of-two stage  C
C          of a forward Fourier transform.                             C
C SEQFOR - calculates a sequential (local) block forward complex       C
C          Fourier transform.                                          C
C FIXITX - calculates the forward real Fourier transform from the      C
C          output of a forward complex Fourier transform.              C
C                                                                      C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE FRFFT1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                  MYINDEX, BASE, NLOCAL, MVECS, RJUMP, NFAX,
     &                  NTRIGS, TRIGS, WS, Y)  
C
C This subroutine calculates the forward Fourier transform of M
C real vectors, each of length N, over a subset of MAPSIZE processors. 
C Here N=NLOCAL*MAPSIZE. It uses a distributed algorithm, i.e., an
C algorithm that moves the data and partial results only as they are
C needed, and transforms all M vectors as a single block. The MAP array
C defines the subset and the processor ordering to use. 
C
C Communication options (COMMOPT) for FRFFT1 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for FRFFT1 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: PARRFT
C calls: BUFCPY, BUTUPD, CPYFIX, FIXITX, RFFT_INIT, SEQFOR, SWAP 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in map array
      INTEGER MYINDEX
C message type offset to use in interprocessor communication
      INTEGER BASE
C the local length of each vector to be transformed
      INTEGER NLOCAL
C the number of vectors per processor to be transformed in Y
      INTEGER MVECS
C the offset between the start of successive vectors.
      INTEGER RJUMP
C the factorization of N (= NLOCAL*MAPSIZE)
      INTEGER NFAX(13)
C number of trigonometric function values used by FRFFT1
      INTEGER NTRIGS
C trigonometric function values used in forward real Fourier transform
      COMPLEX TRIGS(NTRIGS)
C
C     Work Space
C
C work array
      REAL WS(MVECS*(RJUMP/2),2)
C
C     Input/Output
C
C Y contains MVECS complex vectors each of length NLOCAL/2. The real and
C imaginary parts of these vectors are stored as consecutive elements of
C Y. Hence the real and imaginary parts of the Ith component of the Jth
C complex vector in Y are stored at Y(2*I-1,J) and Y(2*I,J).
      REAL Y(RJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C length  of distributed (real) vector being transformed
      INTEGER N
C number of complex values in real arrays of size NLOCAL and RJUMP,
C respectively
      INTEGER NCLOC, CJUMP
C number of complex values in each local vector segment swapped during
C a parallel stage 
      INTEGER NCOMM
C length of message swapped during each parallel stage (in bytes) 
      INTEGER BUFSIZ
C true processor id for "me"
      INTEGER ME
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(0:LGPROCSX)
      INTEGER ORDER(0:LGPROCSX)
C which half (0: low, 1: high) of the data buffer that is sent during a 
C given swap, and index offset to that half
      INTEGER DIRN(0:LGPROCSX), OFFSET
C index and true processor id of the processor whose data is needed
C in order to compute the real transform from the complex transform,
C and order in which swap with TWIN is to be performed
      INTEGER TWINDEX, TWIN, TWORDER
C index offsets into trigonometric function values array
      INTEGER ICFIX, ICSEQ
C number of "computational" swaps in distributed phase of transform
C algorithm. Total number of swaps includes one before sequential
C phase, and possibly one before real fix-up phase.
      INTEGER MAXSTEP
C loop index
      INTEGER STEP
C                                                                             
C---- Executable Statements --------------------------------------------
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     initialization
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     length  of distributed (real) vector being transformed
      N = NLOCAL*MAPSIZE
C     number of complex values in real arrays of size NLOCAL and RJUMP
      NCLOC = NLOCAL/2 
      CJUMP = RJUMP/2
C     number of complex values in each local vector segment swapped
C     during a parallel stage 
      NCOMM = NCLOC/2
C     length of message swapped during each parallel stage (in bytes) 
      BUFSIZ = CBYTES*NCOMM*MVECS
C
C     Calculate swap partners (and swap order for (COMMOPT .EQ. 1)
C     protocol).  
      CALL RFFT_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MAXSTEP,
     &               SWAPNODE, ORDER, DIRN, TWINDEX, TWORDER)
C
C     index offsets into the trigonometric function values array
C     corresponding to the sequential and real fix-up phases of the
C     algorithm 
      IF (MAPSIZE .GT. 1) THEN
        ICSEQ = MAXSTEP + 1
      ELSE
        ICSEQ = 1
      ENDIF
      ICFIX = ICSEQ + NCLOC
C
C     Identify myself and TWIN.
      ME = MAP(MYINDEX)
      TWIN = MAP(TWINDEX)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     Calculate forward real Fourier transform.
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Calculate initial (distributed) steps of the forward Fourier
C     transform. 
      DO STEP=MAXSTEP,0,-1
C
        OFFSET = NCOMM*(1-DIRN(STEP))
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVECS, Y,
     &              WS(1,1)) 
        CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, ORDER(STEP), ME, BASE,
     &            SWAPNODE(STEP), BUFSIZ, WS(1,1), BUFSIZ, WS(1,2))
        IF (STEP .GT. 0) THEN
          CALL BUTUPD(DIRN(STEP), NCLOC, MVECS, CJUMP, TRIGS(STEP),
     &                WS(1,2), Y)
        ELSE
          CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVECS,
     &                WS(1,2), Y)  
        ENDIF
C
      ENDDO
C
C     Calculate local forward (complex) Fourier transform.
      CALL SEQFOR(NCLOC, MVECS, CJUMP, TRIGS(ICSEQ), Y)
C
C     Move data around in preparation for extracting transform of real
C     seqeunce from current results.
      IF ((MYINDEX .EQ. 0) .OR.(MYINDEX .EQ. TWINDEX)) THEN
        CALL CPYFIX(MYINDEX, NCLOC, MVECS, CJUMP, Y, WS(1,2))
      ELSE
        CALL CPYFIX(MYINDEX, NCLOC, MVECS, CJUMP, Y, WS(1,1))
        CALL SWAP(COMMOPT, PROTOPT, FORCETYPE, TWORDER, ME, BASE,
     &            TWIN, BUFSIZ, WS(1,1), BUFSIZ, WS(1,2))
      ENDIF
C
C     Extract transform of real sequence.
      CALL FIXITX (MYINDEX, N, NCLOC, MVECS, CJUMP, TRIGS(ICFIX),
     &             WS(1,2), Y) 
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE FRFFT2(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP, 
     &                  MYINDEX, BASE1, BASE2, NLOCAL, MVEC1, MVEC2,
     &                  RJUMP, NFAX, NTRIGS, TRIGS, WS1, WS2, Y1, Y2)  
C
C This subroutine calculates the forward Fourier transform of 2 sets of
C real vectors, each of length N, over a subset of MAPSIZE processors. 
C Here N=NLOCAL*MAPSIZE. It uses a distributed algorithm, i.e., an
C algorithm that moves the data and partial results only as they are
C needed, and overlaps the butterfly calculation phase for one block
C with the communication phase for the other block. The MAP array
C defines the subset and the processor ordering to use. 
C
C Communication options (COMMOPT) for FRFFT2 include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C Communication protocol options (PROTOPT) for FRFFT2 include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C Since this is a "send-ahead" algorithm, only the simple exchange 
C (send/recv) communication option is invoked in SWAP_SEND and 
C SWAP_RECV. (Using odd/even swaps would prevent overlap.)
C The odd/even option is between SWAP_SEND and SWAP_RECV for different
C swaps, and is handled explicitly in this routine.
C
C called by: PARRFT
C calls: BUFCPY, BUTUPD, CPYFIX, FIXITX, RFFT_INIT, SEQFOR,
C        SWAP _SEND, SWAP_RECV
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in map array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C the local length of each vector to be transformed
      INTEGER NLOCAL
C the number of vectors per processor to be transformed in Y1
      INTEGER MVEC1
C the number of vectors per processor to be transformed in Y2
      INTEGER MVEC2
C the offset between the start of successive vectors in Y1 and Y2
      INTEGER RJUMP
C the factorization of N (= NLOCAL*MAPSIZE)
      INTEGER NFAX(13)
C number of trigonometric function values used by FRFFT2
      INTEGER NTRIGS
C trigonometric function values used by in forward real Fourier
C transform 
      COMPLEX TRIGS(NTRIGS)
C
C     Work Space
C
C work array 1
      REAL WS1(MVEC1*(RJUMP/2),2)
C work array 2
      REAL WS2(MVEC2*(RJUMP/2),2)
C
C     Input/Output
C
C Y1 and Y2 each contain complex vectors of length NLOCAL/2. Y1 contains
C MVEC1 vectors and Y2 contains MVEC2 vectors. The real and imaginary
C parts of these vectors are stored as consecutive elements of Y1 and
C Y2.  Hence the real and imaginary parts of the Ith component of the
C Jth complex vector in Y1 are stored at Y1(2*I-1,J) and Y1(2*I,J).
      REAL Y1(RJUMP,MVEC1)
      REAL Y2(RJUMP,MVEC2)
C
C---- Local Variables --------------------------------------------------
C
C length  of distributed (real) vector being transformed
      INTEGER N
C number of complex values in real arrays of size NLOCAL and RJUMP,
C respectively
      INTEGER NCLOC, CJUMP
C number of complex values in each local vector segment swapped during
C a parallel stage 
      INTEGER NCOMM
C lengths of messages swapped during each parallel stage (in bytes) 
      INTEGER BUFSZ1, BUFSZ2
C true processor id for "me"
      INTEGER ME
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(0:LGPROCSX)
      INTEGER ORDER(0:LGPROCSX)
C which half (0: low, 1: high) of the data buffer that is sent during a 
C given swap, and index offset to that half
      INTEGER DIRN(0:LGPROCSX), OFFSET
C index and true processor id of the processor whose data is needed
C in order to compute the real transform from the complex transform,
C and order in which swap with TWIN is to be performed
      INTEGER TWINDEX, TWIN, TWORDER
C index offsets into trigonometric function values array
      INTEGER ICFIX, ICSEQ
C number of "computational" swaps in distributed phase of transform
C algorithm. Total number of swaps includes one before sequential
C phase, and possibly one before real fix-up phase.
      INTEGER MAXSTEP
C loop index
      INTEGER STEP
C                                                                             
C---- Executable Statements --------------------------------------------
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     initialization
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     length  of distributed (real) vector being transformed
      N = NLOCAL*MAPSIZE
C     number of complex values in real arrays of size NLOCAL and RJUMP
      NCLOC = NLOCAL/2 
      CJUMP = RJUMP/2
C     number of complex values in each local vector segment swapped
C     during a parallel stage 
      NCOMM = NCLOC/2
C     lengths of messages swapped during each parallel stage (in bytes) 
      BUFSZ1 = CBYTES*NCOMM*MVEC1
      BUFSZ2 = CBYTES*NCOMM*MVEC2
C
C     Calculate swap partners (and swap order for (COMMOPT .EQ. 1)
C     protocol).  
      CALL RFFT_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MAXSTEP,
     &               SWAPNODE, ORDER, DIRN, TWINDEX, TWORDER)
C
C     If (COMMOPT .NE. 1), then not using odd/even option, so reset the
C     ORDER array.
      IF (COMMOPT .NE. 1) THEN
        DO STEP=0,MAXSTEP
          ORDER(STEP) = 1
        ENDDO
      ENDIF
C
C     index offsets into the trigonometric function values array
C     corresponding to the sequential and real fix-up phases of the
C     algorithm  
      IF (MAPSIZE .GT. 1) THEN
        ICSEQ = MAXSTEP + 1
      ELSE
        ICSEQ = 1
      ENDIF
      ICFIX = ICSEQ + NCLOC
C
C     Identify myself and TWIN.
      ME = MAP(MYINDEX)
      TWIN = MAP(TWINDEX)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C     Calculate forward real Fourier transform. See FRFFT1 for a more 
C     readable outline of the algorithm.
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      IF (MAXSTEP .GT. -1) THEN
C
C       Prime the pipeline: send Y1, send Y2, recv Y1, update Y1.
        OFFSET = NCOMM*(1-DIRN(MAXSTEP))
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC1, Y1,
     &              WS1(1,1)) 
        CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                 SWAPNODE(MAXSTEP), BUFSZ1, WS1(1,1), BUFSZ2,
     &                 WS1(1,2)) 
C
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC2, Y2,
     &              WS2(1,1)) 
        IF (ORDER(MAXSTEP) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(MAXSTEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(MAXSTEP), BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(MAXSTEP), BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(MAXSTEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
        ENDIF
C
        CALL BUTUPD(DIRN(MAXSTEP), NCLOC, MVEC1, CJUMP,
     &              TRIGS(MAXSTEP), WS1(1,2), Y1)  
C
        ENDIF        
C
C     Begin overlapped algorithm.
      DO STEP=MAXSTEP-1,0,-1
C
C       Finish swap with old partner and start swap with new
C       partner: send Y1, recv Y2, update Y2.
        OFFSET = NCOMM*(1-DIRN(STEP))
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC1, Y1,
     &              WS1(1,1)) 
        IF (ORDER(STEP) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(STEP+1), BUFSZ2, WS2(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(STEP+1), BUFSZ2, WS2(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,1), BUFSZ1,
     &                   WS1(1,2)) 
        ENDIF
C
        CALL BUTUPD(DIRN(STEP+1), NCLOC, MVEC2, CJUMP, TRIGS(STEP+1),
     &              WS2(1,2), Y2)  
C
C       Continue swap with new partner: send Y2, recv Y1, update Y1.
        CALL BUFCPY(2*NCOMM, 2*OFFSET, 0, RJUMP, 2*NCOMM, MVEC2, Y2,
     &              WS2(1,1)) 
        IF (ORDER(STEP) .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(STEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   SWAPNODE(STEP), BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(STEP), BUFSZ2, WS2(1,1), BUFSZ2,
     &                   WS2(1,2)) 
        ENDIF
C
        IF (STEP .GT. 0) THEN
          CALL BUTUPD(DIRN(STEP), NCLOC, MVEC1, CJUMP, TRIGS(STEP),
     &                WS1(1,2), Y1)  
        ELSE
          CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVEC1,
     &                WS1(1,2), Y1)  
        ENDIF
C
      ENDDO
C
C     Finish forward real Fourier transform.
      IF (( MYINDEX .EQ. 0) .OR. (MYINDEX .EQ. TWINDEX)) THEN
C
C       Compute sequential real RFT for Y1.
        CALL SEQFOR(NCLOC, MVEC1, CJUMP, TRIGS(ICSEQ), Y1)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC1, CJUMP, Y1, WS1(1,2))
        CALL FIXITX (MYINDEX, N, NCLOC, MVEC1, CJUMP, TRIGS(ICFIX),
     &               WS1(1,2), Y1) 
C
C       Finish last dimensional swap: recv Y2, update Y2.
        IF (MAXSTEP .GT. -1) THEN
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(0), BUFSZ2, WS2(1,2)) 
          CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVEC2,
     &                WS2(1,2), Y2)  
        ENDIF
C
C       Compute sequential real RFT for Y2.
        CALL SEQFOR(NCLOC, MVEC2, CJUMP, TRIGS(ICSEQ), Y2)
        CALL CPYFIX(MYINDEX, NCLOC, MVEC2, CJUMP, Y2, WS2(1,2))
        CALL FIXITX (MYINDEX, N, NCLOC, MVEC2, CJUMP, TRIGS(ICFIX),
     &               WS2(1,2), Y2) 
C
      ELSE
C
C       Compute sequential real RFT for Y1.
        CALL SEQFOR(NCLOC, MVEC1, CJUMP, TRIGS(ICSEQ), Y1)
C
C       Begin swap with TWIN and finish last dimensional swap:
C       send Y1, recv Y2, update Y2.
        CALL CPYFIX(MYINDEX, NCLOC, MVEC1, CJUMP, Y1, WS1(1,1))
        IF (TWORDER .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,1), BUFSZ1, WS1(1,2))
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(0), BUFSZ2, WS2(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   SWAPNODE(0), BUFSZ2, WS2(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,1), BUFSZ1, WS1(1,2))
        ENDIF
        CALL BUFCPY(2*NCOMM, 0, 2*OFFSET, 2*NCOMM, RJUMP, MVEC2,
     &              WS2(1,2), Y2)  
C
C       Compute sequential real RFT for Y2.
        CALL SEQFOR(NCLOC, MVEC2, CJUMP, TRIGS(ICSEQ), Y2)
C
C       Continue swap with TWIN: send Y2, recv Y1, update Y1.
        CALL CPYFIX(MYINDEX, NCLOC, MVEC2, CJUMP, Y2, WS2(1,1))
        IF (TWORDER .EQ. 1) THEN
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,1), BUFSZ2, WS2(1,2))
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,2)) 
        ELSE
          CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE1,
     &                   TWIN, BUFSZ1, WS1(1,2)) 
          CALL SWAP_SEND(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                   TWIN, BUFSZ2, WS2(1,1), BUFSZ2, WS2(1,2))
        ENDIF
        CALL FIXITX (MYINDEX, N, NCLOC, MVEC1, CJUMP, TRIGS(ICFIX),
     &               WS1(1,2), Y1) 
C
C       Finish swap with TWIN: recv Y2, update Y2.
        CALL SWAP_RECV(0, PROTOPT, FORCETYPE, 1, ME, BASE2,
     &                 TWIN, BUFSZ2, WS2(1,2)) 
        CALL FIXITX (MYINDEX, N, NCLOC, MVEC2, CJUMP, TRIGS(ICFIX),
     &               WS2(1,2), Y2) 
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE BUTUPD(DIRN, VECLTH, MVECS, CJUMP, CEXP, BUF, Y)
C
C BUTUPD calculates a "forward" power-of-two butterfly update for each
C vector in Y, using VECLTH/2 complex data in BUF and Y to produce a new
C complex vector of length VECLTH in Y. This update represents a
C factor-of-two stage of a forward fast Fourier transform.
C
C called by: FRFFT1, FRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C switch indicating whether the data for vector J start in BUF(1,J) and
C Y(1,J) (DIRN .EQ. 0) or in Y(VECLTH/2+1,J) and BUF(1,J) (DIRN .NE. 0)
      INTEGER DIRN
C length of each complex output vector and half the length of the
C complex data vectors
      INTEGER VECLTH
C number of vectors being updated
      INTEGER MVECS
C the offset between the start of successive vectors in Y
      INTEGER CJUMP
C trigonometric function values used in the update
      COMPLEX CEXP
C half of the data used in the update
      COMPLEX BUF(VECLTH/2,MVECS)
C
C     Input/Output
C
C On input, half of the data for the update is contained in Y. On
C output, Y contains the results of the update. 
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C length of data vectors
      INTEGER HLFLTH
C loop indices
      INTEGER I, J
C temporary used in computing the inverse power-of-two butterfly update 
      COMPLEX C
C                                                                             
C---- Executable Statements --------------------------------------------
C
C     length of data vectors
      HLFLTH = VECLTH/2
C
      IF ( DIRN .EQ. 0 ) THEN
C
        DO J=1,MVECS
          DO I=1,HLFLTH
C           Calculate power-of-two butterfly update.
            C = BUF(I,J)*CEXP
            Y(HLFLTH+I,J) = Y(I,J) - C
            Y(I,J)        = Y(I,J) + C
          ENDDO
        ENDDO
C
      ELSE
C
        DO J=1,MVECS
          DO I=1,HLFLTH
C           Calculate power-of-two butterfly update.
            C = Y(HLFLTH+I,J)*CEXP
            Y(HLFLTH+I,J) = BUF(I,J) - C
            Y(I,J)        = BUF(I,J) + C
          ENDDO
        ENDDO
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE SEQFOR(N, MVECS, CJUMP, CEXP, Y)
C
C SEQFOR calculates in-place (unnormalized) forward Fourier transforms
C of an array of complex vectors. Each vector is of length N, where N
C is a power of two. 
C
C called by: FRFFT1, FRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C length of each complex vector
      INTEGER N
C number of vectors being transformed
      INTEGER MVECS
C the offset between the start of successive vectors
      INTEGER CJUMP
C trigonometric function values used in the transform
      COMPLEX CEXP(N/2)
C
C     Input/Output
C
C On input, Y contains the data to be transformed. On output, Y
C contains the results of the transforms.
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C loop limits for each stage of the power-of-two forward fast Fourier
C transform
      INTEGER NSTEPS, NBLOCK, INCREM
C index offset used to move through the trigonometric function values
C in the correct order
      INTEGER IC
C index offsets used to access the data needed in a given butterfly
C update
      INTEGER IOFF1, IOFF2
C loop indices
      INTEGER I, J, K, L
C temporary used in computing the forward power-of-two butterfly
C update
      COMPLEX C
C
C---- External Functions ----------------------------------------------
C
C log base 2 function
      EXTERNAL LOG2
      INTEGER LOG2
C                                                                             
C---- Executable Statements --------------------------------------------
C
      NSTEPS = LOG2(N)
      DO L=1,MVECS
C
        NBLOCK   = 1
        INCREM = N/2
        IC = 0
        DO K=1,NSTEPS
C
          DO J=1,NBLOCK
C
            IOFF1 = 2*(J-1)*INCREM
            IOFF2 = IOFF1 + INCREM
            DO I=1,INCREM
C
C             Calculate power-of-two butterfly update.
              C = Y(IOFF2+I,L)*CEXP(IC+J)
              Y(IOFF2+I,L)   = Y(IOFF1+I,L) - C
              Y(IOFF1+I,L)   = Y(IOFF1+I,L) + C
C
            ENDDO
C
          ENDDO
C
          IC = IC + NBLOCK
	  NBLOCK = NBLOCK*2
          INCREM = INCREM/2
C
        ENDDO
C
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE FIXITX(MYINDEX, N, VECLTH, MVECS, CJUMP, CEXP, BUF, Y)
C
C FIXITX combines each half-length complex vector in BUF with the odd
C elements of the corresponding vector in Y to produce a new complex
C vector. This combination completes  the calculation of a (normalized) 
C forward real Fourier transform.
C
C called by: FRFFT1, FRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C index of "me" in processor subset collaborating in distributed real
C Fourier transform 
      INTEGER MYINDEX
C length of distributed (real) vector being transformed, used in
C normalizing the results
      INTEGER N
C length of each complex vector in Y (representing the local segment
C of a distributed complex vector)
      INTEGER VECLTH
C number of vectors being transformed
      INTEGER MVECS
C offset between the start of successive vectors in Y
      INTEGER CJUMP
C trigonometric function values used in combining the data
      COMPLEX CEXP(VECLTH/2)
C half of the data 
      COMPLEX BUF(VECLTH/2,MVECS)
C
C     Input/Output
C
C On input, the odd (complex) elements of Y represent half of the data.
C On output, Y contains the results of the forward Fourier transform,
C of which FIXITX represents the final step.
      COMPLEX Y(CJUMP,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C half the length of the vectors in Y
      INTEGER HLFLTH
C loop bound
      INTEGER IST
C loop indices
      INTEGER I, J
C normalization factors
      REAL SFAC1, SFAC2
C temporaries used in combining the data
      COMPLEX H1, H2, C
C                                                                             
C---- Executable Statements --------------------------------------------
C
C     Calculate length of the data vectors.
      HLFLTH = VECLTH/2
C
C     Calculate normalization factors.
      SFAC1  = 1.0/REAL(N)
      SFAC2  = SFAC1/2.0
C
      DO J=1,MVECS
C
        IF (MYINDEX .EQ. 0) THEN
          IST = 2
	  Y(1,J) = SFAC1*(CMPLX(1.0,1.0)*CONJG(Y(1,J)))
	  Y(2,J) = SFAC1*Y(2,J)
        ELSE
          IST = 1
        ENDIF
C
        DO I=IST,HLFLTH
          H1 = Y(2*I-1,J)
          H2 = BUF(HLFLTH-I+1,J)
          C  = -CMPLX(0.0,1.0)*(H1-CONJG(H2))*CEXP(I)
          Y(2*I-1,J) = SFAC2*((H1+CONJG(H2)) + C)
          Y(2*I,J)   = SFAC2*((H2+CONJG(H1)) - CONJG(C))
        ENDDO
C
      ENDDO
C       
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
