C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
      SUBROUTINE ALGINP
C                                                                              
C This subroutine inputs or determines logical machine and parallel
C algorithm parameters.
C                                                                              
C called by: INPUT
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C     
C---- Parameters -------------------------------------------------------
C
      INCLUDE 'params.i'
C     
C---- Common Blocks ----------------------------------------------------
C     
C machine architecture information
      INCLUDE 'machine.i'
C problem resolution information
      INCLUDE 'problem.i'
C parallel algorithm information
      INCLUDE 'algorithm.i'
C domain decomposition information
      INCLUDE 'physical.i'
      INCLUDE 'fourier.i'
      INCLUDE 'spectral.i'
C tracing parameters
      INCLUDE 'tracing.i'
C
C---- Local Variables --------------------------------------------------
C
C communication buffers
      INTEGER IBUF(31)
      CHARACTER CBUF*64
C process/processor mapping option for logical ring
      INTEGER RINGOPT
C variables for calculating processor mappings
      INTEGER PX, PY, INDEX_I, INDEX_J, INDEX_ME
      INTEGER MAP2D(0:NPROCSX-1), MAP1D(0:NPROCSX-1)
C offsets used to determine mapping from local latitude and
C longitude indices to global indices
      INTEGER LATBASE, LONBASE
C half the number of local latitudes
      INTEGER NLLATH
C Fourier wavenumber, local (untruncated) wavenumber index, processor
C index, and spectral coefficient index
      INTEGER M, IK, P, L
C local (truncated) indices for M=0 and M=1 Fourier coefficients
      INTEGER JM0, JM1
C (temporary) processor id when determining local Fourier coefficients
      INTEGER TMPDEX
C MTRUE_F/S index for next wavenumber to be truncated, used when 
C determining local Fourier coefficients
      INTEGER NTMM
C variables used to calculate number of local spectral coefficients
      INTEGER NTMP1, NTMP2
C polynomial degree and wavenumber (loop) indices
      INTEGER JN, JM
C polynomial degree transition index and index value denoting beginning
C of local spectral coefficients, used when computing the partition of 
C spectral coefficients
      INTEGER JNTRNS, JNME
C work array for shifting spectral domain decomposition
      INTEGER JMTMP(NPROCSX+MMX+1)
C variables used to calculate how much space must be allocated to 
C allow physical, Fourier, and spectral arrays to fit into the
C same memory
      INTEGER PFACTOR, FFACTOR, SFACTOR
C log_2 and (2**(log_2 P)) of a given number of processors
      INTEGER LGP, IPX
C other loop indices and counters
      INTEGER I, J
C base for message types used in broadcasting problem parameters
      INTEGER BASE
C
C---- External Functions ----------------------------------------------
C
C base for message types
      EXTERNAL MSGBASE
      INTEGER MSGBASE
C logical machine mapping functions
      EXTERNAL RING_MAP
      INTEGER RING_MAP
C FFT reordering description
      EXTERNAL MDEX
      INTEGER MDEX
C log base 2 function
      EXTERNAL LOG2
      INTEGER LOG2
C                                                                            
C---- Executable Statements --------------------------------------------
C
      BASE = MSGBASE()
      IF (ME .EQ. 0) THEN
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C       Initialize logical machine parameters.
C       See common block /MACHINE/ (in machine.i) for definitions.
        PX = 1
        PY = 1
        MESHOPT = 1
        RINGOPT = 1
C
C       Initialize parallel algorithm parameters.
C       See common block /ALGORITHM/ (in algorithm) for definitions.
        FTOPT     = 0
        LTOPT     = 0
        COMMFFT   = 0
        COMMIFT   = 0
        COMMFLT   = 0
        COMMILT   = 0
        BUFSFFT   = 0
        BUFSIFT   = 0
        BUFSFLT   = 0
        BUFSILT   = 0
        PROTFFT   = 0
        PROTIFT   = 0
        PROTFLT   = 0
        PROTILT   = 0
        FORCETYPE = 0
        SUMOPT    = 0
        EXCHSIZE  = 1
C
C       Initialize tracing parameters.
C       See common block /TRACING/ (in tracing.i) for definitions.
C
        TIMING    = .FALSE.
        TRACING   = .FALSE.
        TRSIZE    = 0
        TRSTART   = -1
        TRSTOP    = -1
        TL1       = -1
        TL2       = -1
        TL3       = -1
        TRACEFILE = .FALSE.
        TMPNAME   = '                                '
        PERMNAME  = '                                '
        VERBOSE   = 0
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C                                                                              
C       Read algorithm input data.
        WRITE(6,200)
  200   FORMAT(/,' READING PARAMETERS FROM FILE algorithm:',/)
C
C       Read input file for user selectable parameters.
        OPEN(8, FILE='algorithm')
C
C       problem size parameters
        READ(8,*) PX
        READ(8,*) PY
        READ(8,*) MESHOPT
        READ(8,*) RINGOPT
C
C       algorithm parameters
        READ(8,*) FTOPT
        READ(8,*) LTOPT
        READ(8,*) COMMFFT
        READ(8,*) COMMIFT
        READ(8,*) COMMFLT
        READ(8,*) COMMILT
        READ(8,*) BUFSFFT
        READ(8,*) BUFSIFT
        READ(8,*) BUFSFLT
        READ(8,*) BUFSILT
        READ(8,*) PROTFFT
        READ(8,*) PROTIFT
        READ(8,*) PROTFLT
        READ(8,*) PROTILT
        READ(8,*) FORCETYPE
        READ(8,*) SUMOPT
        READ(8,*) EXCHSIZE
C
C       tracing parameters
        READ(8,*) TIMING
        READ(8,*) TRACING
        READ(8,*) TRSIZE
        READ(8,*) TRSTART
        READ(8,*) TRSTOP
        READ(8,*) TL1
        READ(8,*) TL2
        READ(8,*) TL3
        READ(8,*) TRACEFILE
        READ(8,*) TMPNAME
        READ(8,*) PERMNAME
        READ(8,*) VERBOSE
C
C       Close input file.
        CLOSE(8)
C
C       Send input parameters to other processors.
        IF (NPROCS .GT. 1) THEN
C
          IBUF(1)  = PX
          IBUF(2)  = PY
          IBUF(3)  = MESHOPT
          IBUF(4)  = RINGOPT
          IBUF(5)  = FTOPT
          IBUF(6)  = LTOPT
          IBUF(7)  = COMMFFT
          IBUF(8)  = COMMIFT
          IBUF(9)  = COMMFLT
          IBUF(10) = COMMILT
          IBUF(11) = BUFSFFT
          IBUF(12) = BUFSIFT
          IBUF(13) = BUFSFLT
          IBUF(14) = BUFSILT
          IBUF(15) = PROTFFT
          IBUF(16) = PROTIFT
          IBUF(17) = PROTFLT
          IBUF(18) = PROTILT
          IBUF(19) = FORCETYPE
          IBUF(20) = SUMOPT
          IBUF(21) = EXCHSIZE
          IF (TIMING) THEN
            IBUF(22) = 1
          ELSE
            IBUF(22) = 0
          ENDIF            
          IF (TRACING) THEN
            IBUF(23) = 1
          ELSE
            IBUF(23) = 0
          ENDIF            
          IF (TRACEFILE) THEN
            IBUF(24) = 1
          ELSE
            IBUF(24) = 0
          ENDIF            
          IBUF(25) = TRSIZE
          IBUF(26) = TRSTART
          IBUF(27) = TRSTOP
          IBUF(28) = TL1
          IBUF(29) = TL2
          IBUF(30) = TL3
          IBUF(31) = VERBOSE
          CALL BCAST0(IBUF, IBYTES*31, BASE, 0)
C
          CBUF(1:32)  = TMPNAME
          CBUF(33:64) = PERMNAME
          CALL BCAST0(CBUF, 64, BASE+1, 0)
C
        ENDIF
C
      ELSE
C
C       Get input parameters from node 0.
        CALL BCAST0(IBUF, IBYTES*31, BASE, 0)
        PX        = IBUF(1)
        PY        = IBUF(2)
        MESHOPT   = IBUF(3)
        RINGOPT   = IBUF(4)
        FTOPT     = IBUF(5)
        LTOPT     = IBUF(6)
        COMMFFT   = IBUF(7)
        COMMIFT   = IBUF(8)
        COMMFLT   = IBUF(9)
        COMMILT   = IBUF(10)
        BUFSFFT   = IBUF(11)
        BUFSIFT   = IBUF(12)
        BUFSFLT   = IBUF(13)
        BUFSILT   = IBUF(14)
        PROTFFT   = IBUF(15)
        PROTIFT   = IBUF(16)
        PROTFLT   = IBUF(17)
        PROTILT   = IBUF(18)
        FORCETYPE = IBUF(19)
        SUMOPT    = IBUF(20)
        EXCHSIZE  = IBUF(21)
        IF (IBUF(22) .EQ. 1) THEN
           TIMING = .TRUE.
        ELSE
           TIMING = .FALSE.
        ENDIF
        IF (IBUF(23) .EQ. 1) THEN
          TRACING = .TRUE.
        ELSE
          TRACING = .FALSE.
        ENDIF
        IF (IBUF(24) .EQ. 1) THEN
          TRACEFILE = .TRUE.
        ELSE
          TRACEFILE = .FALSE.
        ENDIF
        TRSIZE  = IBUF(25)
        TRSTART = IBUF(26)
        TRSTOP  = IBUF(27)
        TL1     = IBUF(28)
        TL2     = IBUF(29)
        TL3     = IBUF(30)
        VERBOSE = IBUF(31)
C
        CALL BCAST0(CBUF, 64, BASE+1, 0)
        TMPNAME = CBUF(1:32)
        PERMNAME = CBUF(33:64)
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Check consistency of logical machine parameters
C     and calculate neighbor information.
C
      IF (NPROCS .NE. PY*PX) THEN
        IF (ME .EQ. 0) WRITE(0,641) NPROCS, PX, PY
  641   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' PX*PY NOT EQUAL TO NPROCS',/,
     &          ' NPROCS = ',I4,' PX = ',I4,' PY = ',I4)
        STOP
      ENDIF
C
C     Initialize logical-to-physical processor mappings.
      CALL MESH_MAP_INIT(MESHOPT, PX, PY, MAP2D)
      CALL RING_MAP_INIT(RINGOPT, NPROCS, MAP1D)
C
C     Calculate i,j coordinates of processor.
      CALL MESH_INDEX(ME, PX, PY, MAP2D, INDEX_I, INDEX_J)
C
C     Calculate location in big ring.
      CALL RING_INDEX(ME, NPROCS, MAP1D, INDEX_ME)
      PREV = RING_MAP(INDEX_ME+1, NPROCS, MAP1D)
      NEXT = RING_MAP(INDEX_ME-1, NPROCS, MAP1D)
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Calculate and check consistency of domain decomposition.
C
C     1) Define domain decomposition in physical space.
C
C     allocation of processors to decomposition
      NPLON_P = PX
      CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPLON_P)
C
      NPLAT_P = PY
      CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_P)
C
      NPVER_P = 1
      MAPVER_P(0) = ME
C
C     indices of current processor in this mapping
      LONDEX_P = INDEX_I
      LATDEX_P = INDEX_J
      VERDEX_P = 0
C
C     Longitude
C
C     Decomposition should assign at least one longitude per processor.
      IF (NLON .LT. NPLON_P) THEN
        IF (ME .EQ. 0) WRITE(0, 651) NLON, NPLON_P
  651   FORMAT(' NPLON_P IS LARGER THAN NLON',/,
     &         ' NLON = ',I4,' NPLON_P = ',I4)
        STOP
      ENDIF
C
      NLLON_P = NLON/NPLON_P
      NTMP2 = MOD(NLON,NPLON_P)
      IF (LONDEX_P .LT. NTMP2) THEN
        NLLON_P = NLLON_P+1
        LONBASE = LONDEX_P*NLLON_P
      ELSE
        LONBASE = NTMP2*(NLLON_P+1) + (LONDEX_P-NTMP2)*NLLON_P
      ENDIF
C
      DO I=1,NLLON_P
         LONTRUE_P(I) = LONBASE + I
      ENDDO
C
C     Latitude
C
      IF (LTOPT .EQ. 0) THEN
C       Using a distributed Legendre transform algorithm, so
C       decomposition should assign pairs (north/south) of latitudes to 
C       processors, with at least one pair in each processor.
C
        IF (NLAT/2 .LT. NPLAT_P) THEN
          IF (ME .EQ. 0) WRITE(0, 653) NLAT, NPLAT_P
  653     FORMAT(' NPLAT_P IS LARGER THAN NLAT/2',/,
     &           ' NLAT = ',I4,' NPLAT_P = ',I4)
          STOP
        ENDIF
C
        NLLATH = (NLAT/2)/NPLAT_P
        NTMP2 = MOD(NLAT/2,NPLAT_P)
        IF (LATDEX_P .LT. NTMP2) THEN
          NLLATH = NLLATH+1
          LATBASE = LATDEX_P*NLLATH
        ELSE
          LATBASE = NTMP2*(NLLATH+1) + (LATDEX_P-NTMP2)*NLLATH
        ENDIF
        NLLAT_P = 2*NLLATH
C
        DO I=1,NLLATH
          LATTRUE_P(I)             = LATBASE + I
          LATTRUE_P(NLLAT_P-(I-1)) = (NLAT-LATBASE) - (I-1)
        ENDDO
C
      ELSE
C
C       Using a transpose/serial Legendre transform algorithm, so
C       decomposition should assign at least one latitude to each
C       processor.
C
        IF (NLAT .LT. NPLAT_P) THEN
          IF (ME .EQ. 0) WRITE(0, 654) NLAT, NPLAT_P
  654     FORMAT(' NPLAT_P IS LARGER THAN NLAT',/,
     &           ' NLAT = ',I4,' NPLAT_P = ',I4)
          STOP
        ENDIF

        NLLAT_P = NLAT/NPLAT_P
        NTMP2 = MOD(NLAT,NPLAT_P)
        IF (LATDEX_P .LT. NTMP2) THEN
          NLLAT_P = NLLAT_P+1
          LATBASE = LATDEX_P*NLLAT_P
        ELSE
          LATBASE = NTMP2*(NLLAT_P+1) + (LATDEX_P-NTMP2)*NLLAT_P
        ENDIF
C
        DO I=1,NLLAT_P
           LATTRUE_P(I) = LATBASE + I
        ENDDO
C
      ENDIF
C
C     Vertical
C
C     Decomposition should assign at least one vertical layer per 
C     processor.
      IF(NVER .LT. NPVER_P) THEN
        IF (ME .EQ. 0) WRITE(0,655) NVER, NPVER_P
 655    FORMAT(' TOO FEW VERTICAL LEVELS',/,
     &         ' NVER = ',I4,' NPVER_P = ',I4)
        STOP
      ENDIF
C
      IF (VERDEX_P .LT. MOD(NVER, NPVER_P)) THEN
        NLVER_P = NVER/NPVER_P + 1
      ELSE
        NLVER_P = NVER/NPVER_P
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     2) Define domain decomposition for Fourier and 
C     physical spaces when transforming between them.
C
      IF (FTOPT .EQ. 0) THEN
C       Using distributed FFT
C
C       Check that PX is a power of two.
        LGP = LOG2(PX)
        IPX = 2**LGP
        IF (PX .NE. IPX) THEN
          IF (ME .EQ. 0) WRITE(0,644) PX
 644      FORMAT(' PX NOT A POWER OF TWO',/,
     &           ' PX = ',I4)
          STOP
        ENDIF
C
C       allocation of processors to decomposition
        NPFC_F = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_F)
C
        NPLAT_F = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_F)
C
        NPVER_F = 1
        MAPVER_F(0) = ME
C
C       indices of current processor in this mapping
        FCDEX_F  = INDEX_I
        LATDEX_F = INDEX_J
        VERDEX_F = 0
C
      ELSE
C       Using transpose and serial FFT
C
C       If using O(log P) tranpose, check that PX is a power of two.
        IF((COMMFFT .GE. 20) .OR. (COMMIFT .GE. 20)) THEN
          LGP = LOG2(PX)
          IPX = 2**LGP
          IF (PX .NE. IPX) THEN
            IF (ME .EQ. 0) WRITE(0,645) PX
 645        FORMAT(' PX NOT A POWER OF TWO',/,
     &             ' PX = ',I4)
            STOP
          ENDIF
        ENDIF
C
C       allocation of processors to decomposition
        NPFC_F = 1
        MAPFC_F(0) = ME
C
        NPLAT_F = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPLAT_F)
C
        NPVER_F = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_F)
C
C       indices of current processor in this mapping
        FCDEX_F = 0
        LATDEX_F = INDEX_J
        VERDEX_F = INDEX_I
C
      ENDIF
C
C     Longitude
C
C     Decomposition should assign at least one longitude per processor.
      IF (NLON .LT. NPFC_F) THEN
        IF (ME .EQ. 0) WRITE(0, 660) NLON, NPFC_F
  660   FORMAT(' NPFC_F IS LARGER THAN NLON',/,
     &         ' NLON = ',I4,' NPFC_F = ',I4)
        STOP
      ENDIF
C
      NLLON_F = NLON/NPFC_F
      DO I=1,NLLON_F
        LONTRUE_F(I) = FCDEX_F*NLLON_F + I
      ENDDO
C
C     For distributed FFT algorithm and O(log P) transpose, need even 
C     partition with power of two longitudes and power of two number of
C     processors.
      IF ((FTOPT .EQ. 0) .OR. 
     &    ((COMMFFT .GE. 20) .OR. (COMMIFT .GE. 20))) THEN
C
        IF (NLON .NE. (NPFC_F*NLLON_F)) THEN
          IF (ME .EQ. 0) WRITE(0, 661) NLON, NPFC_F
  661     FORMAT(' NPFC_F DOES NOT DIVIDE NLON EVENLY',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          STOP
        ENDIF
C
        IF (NLLON_F .NE. (2*(NLLON_F/2))) THEN
          IF (ME .EQ. 0) WRITE(0, 662) NLON, NPFC_F
  662     FORMAT(' 2 DOES NOT DIVIDE NLON/NPFC_F EVENLY',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          STOP
        ENDIF
C
      ENDIF
C
C     For distributed FFT algorithm need at least four local longitudes
C     per processor.
      IF (FTOPT .EQ. 0) THEN
        IF (NLLON_F .LT. 4) THEN
          IF (ME .EQ. 0) WRITE(0, 663) NLON, NPFC_F
  663     FORMAT(' NLON/NPFC_F MUST BE AT LEAST 4 FOR PARALLEL FFT',/,
     &           ' NLON = ',I4,' NPFC_F = ',I4)
          STOP
        ENDIF
      ENDIF
C
C     Latitude
C
      IF (LTOPT .EQ. 0) THEN
C       Using a distributed Legendre transform algorithm, so
C       decomposition should assign pairs (north/south) of latitudes to 
C       processors, with at least one pair in each processor.
C
        IF (NLAT/2 .LT. NPLAT_F) THEN
          IF (ME .EQ. 0) WRITE(0, 664) NLAT, NPLAT_F
  664     FORMAT(' NPLAT_F IS LARGER THAN NLAT/2',/,
     &           ' NLAT = ',I4,' NPLAT_F = ',I4)
          STOP
        ENDIF
C
        NLLATH = (NLAT/2)/NPLAT_F
        NTMP2 = MOD(NLAT/2,NPLAT_F)
        IF (LATDEX_F .LT. NTMP2) THEN
          LATBASE = LATDEX_F*NLLATH
        ELSE
          LATBASE = NTMP2*(NLLATH+1) + (LATDEX_F-NTMP2)*NLLATH
        ENDIF
        NLLAT_F = 2*NLLATH
C
        DO I=1,NLLATH
          LATTRUE_F(I)             = LATBASE + I
          LATTRUE_F(NLLAT_F-(I-1)) = (NLAT-LATBASE) - (I-1)
        ENDDO
C
      ELSE
C
C       Using a transpose/serial Legendre transform algorithm, so
C       decomposition should assign at least one latitude to each
C       processor.
C
        IF (NLAT .LT. NPLAT_F) THEN
          IF (ME .EQ. 0) WRITE(0, 665) NLAT, NPLAT_F
  665     FORMAT(' NPLAT_F IS LARGER THAN NLAT',/,
     &           ' NLAT = ',I4,' NPLAT_F = ',I4)
          STOP
        ENDIF

        NLLAT_F = NLAT/NPLAT_F
        NTMP2 = MOD(NLAT,NPLAT_F)
        IF (LATDEX_F .LT. NTMP2) THEN
          NLLAT_F = NLLAT_F+1
          LATBASE = LATDEX_F*NLLAT_F
        ELSE
          LATBASE = NTMP2*(NLLAT_F+1) + (LATDEX_F-NTMP2)*NLLAT_F
        ENDIF
C
        DO I=1,NLLAT_F
           LATTRUE_F(I) = LATBASE + I
        ENDDO
C
      ENDIF
C
C     Vertical
C
C     Decomposition should assign at least one level per processor.
      IF(NVER .LT. NPVER_F) THEN
        IF (ME .EQ. 0) WRITE(0,646) NVER, NPVER_F
 646    FORMAT(' TOO FEW VERTICAL LEVELS',/,
     &         ' NVER = ',I4,' NPVER_F = ',I4)
        STOP
      ENDIF
C
C     If using O(log P) transpose, NPVER_F must divide NVER.
      IF((FTOPT .EQ. 1) .AND. 
     &   ((COMMFFT .GE. 20) .OR. (COMMIFT .GE. 20))) THEN
        IF((NVER/NPVER_F)*NPVER_F .NE. NVER) THEN
          IF (ME .EQ. 0) WRITE(0,643) NPVER_F, NVER
 643      FORMAT(' NPVER_F DOES NOT DIVIDE NVER EVENLY',/,
     &           ' NPVER_F = ',I4,' NVER = ',I4)
          STOP
        ENDIF
      ENDIF
C
      IF (VERDEX_F .LT. MOD(NVER,NPVER_F)) THEN
        NLVER_F = NVER/NPVER_F + 1
      ELSE
        NLVER_F = NVER/NPVER_F
      ENDIF
C
C     Wavenumber
C
      DO M=0,NLON/2
        MTINV_F(M) = -1
      ENDDO
      NLMM_F = 0
      NLFC_F = ((NLON/2)/NPFC_F) + 1
      NTMM = NLFC_F
      DO IK=0,NLFC_F-1
C        Get the "true" index of the "ik"th fc index on a processor
C        with fcdex index FCDEX_F (particular to this distributed FFT
C        when NPFC_F > 1, otherwise a standard unordered FFT).
         M = MDEX(IK, FCDEX_F, NPFC_F, NLFC_F-1)
         IF (M .LE. MM .AND. M .GE. 0 ) THEN
           NLMM_F = NLMM_F + 1
           MTRUE_F(NLMM_F) = M
           MTINV_F(M) = NLMM_F
           JMLTRUE_F(NLMM_F) = IK+1
         ELSE
           MTRUE_F(NTMM) = M
           IF ( M .GE. 0 ) MTINV_F(M) = NTMM
           JMLTRUE_F(NTMM) = IK+1
           NTMM = NTMM - 1
         ENDIF
      ENDDO
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     3) Define domain decomposition for spectral and Fourier spaces
C        when transforming between them.
C
      IF ((FTOPT .EQ. 0) .AND. (LTOPT .EQ. 0)) THEN
C       Using distributed FFT and distributed Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S   = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPSPEC_S)
C
        NPVER_S  = 1
        MAPVER_S(0) = ME
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_I
        SPECDEX_S  = INDEX_J
        VERDEX_S   = 0
C
      ELSEIF ((FTOPT .EQ. 1) .AND. (LTOPT .EQ. 0)) THEN
C       Using transpose/serial FFT and distributed Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S = 1
        MAPFC_S(0) = ME
C
        NPSPEC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPSPEC_S)
C
        NPVER_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S = 0
        SPECDEX_S = INDEX_J
        VERDEX_S = INDEX_I
C
      ELSEIF ((FTOPT .EQ. 0) .AND. (LTOPT .EQ. 1)) THEN
C       Using distributed FFT and transpose/serial Legendre transform.
C
C       allocation of processors to decomposition
        NPFC_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = 1
        MAPSPEC_S(0) = ME
C
        NPVER_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_I
        SPECDEX_S  = 0
        VERDEX_S   = INDEX_J
C
      ELSEIF ((FTOPT .EQ. 1) .AND. (LTOPT .EQ. 1)) THEN
C       Using transpose/serial FFT and transpose/serial Legendre 
C       transform.
C
C       allocation of processors to decomposition
        NPFC_S = PY
        CALL COLUMN_MAP_INIT(INDEX_I, PX, PY, MAP2D, MAPFC_S)
C
        NPSPEC_S = 1
        MAPSPEC_S(0) = ME
C
        NPVER_S = PX
        CALL ROW_MAP_INIT(INDEX_J, PX, PY, MAP2D, MAPVER_S)
C
C       indices of current processor in this mapping
        FCDEX_S    = INDEX_J
        SPECDEX_S  = 0
        VERDEX_S   = INDEX_I
C
      ENDIF
C
C     Differentiate between overlapped and non-overlapped 
C     distributed vector sum algorithms.
      IF ((LTOPT .EQ. 0) .AND. (COMMFLT .LT. 10)) THEN
        NLTSTEPS = NPSPEC_S
      ELSE
        NLTSTEPS = 1
      ENDIF
C
C     Wavenumber
C
      DO M=0,NLON/2
        MTINV_S(M) = -1
      ENDDO
      NLMM_S = 0
      NLFC_S = ((NLON/2)/NPFC_S) + 1
      NTMM = NLFC_S
      JM0 = -1
      JM1 = -1
C
C     The Fourier coefficient ordering is different for a full transpose
C     and the distributed FFT algorithms even when NPFC_S and 
C     NLFC_S are the same, but the difference is a simple permutation
C     of the processor assignment.
      IF (NPFC_F .EQ. NPFC_S) THEN
        TMPDEX = FCDEX_S
      ELSE
        TMPDEX = FCDEX_S/2
        IF (MOD(FCDEX_S,2) .EQ. 1)
     &    TMPDEX = TMPDEX + 2**(LOG2(NPFC_S)-1)
      ENDIF
C
      DO IK=0,NLFC_S-1
C        Get the "true" index of the "ik"th fc index on a processor
C        with fcdex index FCDEX_S (particular to this distributed FFT
C        when NPFC_S > 1, otherwise a standard unordered FFT).
         M = MDEX(IK, TMPDEX, NPFC_S, NLFC_S-1)
         IF (M .LE. MM .AND. M .GE. 0 ) THEN
           NLMM_S = NLMM_S + 1
           MTRUE_S(NLMM_S) = M
           MTINV_S(M) = NLMM_S
           JMLTRUE_S(NLMM_S) = IK+1
           IF (M .EQ. 0) JM0 = NLMM_S
           IF (M .EQ. 1) JM1 = NLMM_S
         ELSE
           MTRUE_S(NTMM) = M
           IF ( M .GE. 0 ) MTINV_S(M) = NTMM
           JMLTRUE_S(NTMM) = IK+1
           NTMM = NTMM - 1
         ENDIF
      ENDDO
C
C     Latitude
C
C     Decomposition should assign pairs (north/south) of latitudes to 
C     processors, with at least one pair in each processor.
      NLLATH_S = (NLAT/2)/NPSPEC_S
      NTMP2 = MOD(NLAT/2,NPSPEC_S)
      IF (SPECDEX_S .LT. NTMP2) THEN
        NLLATH_S = NLLATH_S+1
        LATBASE = SPECDEX_S*NLLATH_S
      ELSE
        LATBASE = NTMP2*(NLLATH_S+1) + (SPECDEX_S-NTMP2)*NLLATH_S
      ENDIF
      NLLAT_S = 2*NLLATH_S
C
      DO I=1,NLLATH_S
        LATTRUE_S(I)             = LATBASE + I
        LATTRUE_S(NLLAT_S-(I-1)) = (NLAT-LATBASE) - (I-1)
      ENDDO
C
      IF (NLAT .LT. NPSPEC_S) THEN
        IF (ME .EQ. 0) WRITE(0, 671) NLAT, NPSPEC_S
  671   FORMAT(' NPSPEC_S IS LARGER THAN NLAT/2',/,
     &         ' NLAT = ',I4,' NPSPEC_S = ',I4)
        STOP
      ENDIF
C
C     Vertical
C
C     Decomposition should assign at least one level per processor.
      IF(NVER .LT. NPVER_S) THEN
        IF (ME .EQ. 0) WRITE(0,6461) NVER, NPVER_S
 6461   FORMAT(' TOO FEW VERTICAL LEVELS',/,
     &         ' NVER = ',I4,' NPVER_S = ',I4)
        STOP
      ENDIF
C
C     If using O(log P) transpose, NPVER_S must divide NVER.
      IF((LTOPT .EQ. 1) .AND. 
     &   ((COMMFLT .GE. 20) .OR. (COMMILT .GE. 20))) THEN
        IF((NVER/NPVER_S)*NPVER_S .NE. NVER) THEN
          IF (ME .EQ. 0) WRITE(0,6431) NPVER_S, NVER
 6431     FORMAT(' NPVER_S DOES NOT DIVIDE NVER EVENLY',/,
     &           ' NPVER_S = ',I4,' NVER = ',I4)
          STOP
        ENDIF
      ENDIF
C
      IF (VERDEX_S .LT. MOD(NVER, NPVER_S)) THEN
        NLVER_S = NVER/NPVER_S + 1
      ELSE
        NLVER_S = NVER/NPVER_S
      ENDIF
C
C     Determine LLCOL_S(1:NLMM+1,1:2), column lengthsm and cumulative
C     displacements for local ALP and DALP arrays.
      LLCOL_S(1,1) = (NN+1) - MAX0(MTRUE_S(1)+NN-KK, 0)
      LLCOL_S(1,2) = 0
      DO JM=2,NLMM_S
         LLCOL_S(JM,1) = (NN+1) - MAX0(MTRUE_S(JM)+NN-KK, 0)
         LLCOL_S(JM,2) = LLCOL_S(JM-1,1) + LLCOL_S(JM-1,2)
      ENDDO
C
C     Spectral
C
C     Calculate number of spectral coefficients associated with local 
C     Fourier wavenumbers.
      NFSPEC_S = LLCOL_S(NLMM_S,1) + LLCOL_S(NLMM_S,2)
C
C     Calculate the number of local spectral coefficients associated 
C     with each processor in the set associated with the local Fourier 
C     wavenumbers.
      NTMP1 = NFSPEC_S/NLTSTEPS
      NTMP2 = MOD(NFSPEC_S,NLTSTEPS)
      DO J=NTMP2+1,NPSPEC_S
        NLSPEC_S(J) = NTMP1
      ENDDO
      NTMP1 = NTMP1 + 1
      DO J=1,NTMP2
        NLSPEC_S(J) = NTMP1
      ENDDO
      MXLSPEC_S = NLSPEC_S(1)
C
C     Compute the partition of the spectral coefficients of the
C     state vectors in terms of JM and JN for distribution among 
C     the processors.
      JMB_S(1) = 1
      P = 1
      L = 0
      JNTRNS = 1
      JNB_S(1) = 1
      JNME = 1
      P01_S = -1
      P11_S = -1
C
      DO JM=1,NLMM_S
        DO JN=1,LLCOL_S(JM,1)
C
          L = L + 1
C
          IF ((JM .EQ. JM0) .AND. (JN .EQ. 2)) THEN
C            WAVE M=0, N=1
             P01_S = P
             L01_S = L
          ENDIF
C
          IF ((JM .EQ. JM1) .AND. (JN .EQ. 1)) THEN
C            WAVE M=1, N=1
             P11_S = P
             L11_S = L
          ENDIF
C
          IF ((L .EQ. NLSPEC_S(P)) .OR. (JN .EQ. LLCOL_S(JM,1))) THEN
C
            JNE_S(JNTRNS) = JN
            JNTRNS = JNTRNS + 1
            IF (JN .LT. LLCOL_S(JM,1)) THEN
              JNB_S(JNTRNS) = JN+1
            ELSE
              JNB_S(JNTRNS) = 1
            ENDIF
          ENDIF
C
          IF (L .EQ. NLSPEC_S(P)) THEN
C
            JME_S(P) = JM
            P = P + 1
            IF (JN .LT. LLCOL_S(JM,1)) THEN
              JMB_S(P) = JM
            ELSE
              JMB_S(P) = JM+1
            ENDIF
            IF (P .EQ. (SPECDEX_S+1)) JNME = JNTRNS
            L = 0
          ENDIF
C
        ENDDO
      ENDDO
C
C
      JME_S(NLTSTEPS) = NLMM_S
      JNB_S(JNTRNS) = 0
      NTRNS_S = JNTRNS - 1
C
      IF (NLTSTEPS .GT. 1) THEN
C        Permute the partition so that the "ME" processor information
C        is in the first elements, and the rest are rotated around a 
C        ring of length NLTSTEPS (NPSPEC_S for overlapped algorithm).
C
         IF (P01_S .NE. -1) 
     &     P01_S = MOD((P01_S-1)+(NPSPEC_S-SPECDEX_S),NPSPEC_S) + 1
         IF (P11_S .NE. -1) 
     &     P11_S = MOD((P11_S-1)+(NPSPEC_S-SPECDEX_S),NPSPEC_S) + 1
C
         DO I=1,NPSPEC_S
           JMTMP(I) = JMB_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           JMB_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           JMB_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NPSPEC_S
           JMTMP(I) = JME_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           JME_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           JME_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NPSPEC_S
           JMTMP(I) = NLSPEC_S(I)
         ENDDO
         DO I=SPECDEX_S+1,NPSPEC_S
           NLSPEC_S(I-SPECDEX_S) = JMTMP(I) 
         ENDDO
         DO I=1,SPECDEX_S
           NLSPEC_S(I+(NPSPEC_S-SPECDEX_S)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NTRNS_S
           JMTMP(I) = JNB_S(I)
         ENDDO
         DO I=JNME,NTRNS_S
           JNB_S((I+1)-JNME) = JMTMP(I) 
         ENDDO
         DO I=1,JNME-1
           JNB_S((I+1)+(NTRNS_S-JNME)) = JMTMP(I) 
         ENDDO
C
         DO I=1,NTRNS_S
           JMTMP(I) = JNE_S(I)
         ENDDO
         DO I=JNME,NTRNS_S
           JNE_S((I+1)-JNME) = JMTMP(I) 
         ENDDO
         DO I=1,JNME-1
           JNE_S((I+1)+(NTRNS_S-JNME)) = JMTMP(I) 
         ENDDO
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Determine number of local longitude and Fourier coefficients to 
C     declare when allocating space so as to guarantee that the physical,
C     Fourier, and spectral partitions of the gridpoint fields and 
C     the Fourier coefficients will fit into the same memory.
      PFACTOR  = NLVER_P*NLLAT_P
      FFACTOR  = 2*NLVER_F*NLLAT_F
      SFACTOR  = 2*NLVER_S*NLLAT_S
      MXLLON_P = NLLON_P
      MXLFC_F  = NLFC_F
      MXLFC_S  = NLFC_S
      DO WHILE ((MXLLON_P*PFACTOR .NE. MXLFC_F*FFACTOR) .OR.
     &          (MXLFC_F*FFACTOR .NE. MXLFC_S*SFACTOR))
C
        IF (MXLLON_P*PFACTOR .LT. MXLFC_F*FFACTOR) THEN
          MXLLON_P = MXLLON_P + 1
        ELSEIF (MXLLON_P*PFACTOR .GT. MXLFC_F*FFACTOR) THEN
          MXLFC_F = MXLFC_F + 1
        ENDIF
C
        IF (MXLFC_F*FFACTOR .LT. MXLFC_S*SFACTOR) THEN
          MXLFC_F = MXLFC_F + 1
        ELSEIF (MXLFC_F*FFACTOR .GT. MXLFC_S*SFACTOR) THEN
          MXLFC_S = MXLFC_S + 1
        ENDIF
C
      ENDDO
      MXLLON_F = 2*MXLFC_F
C
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Check consistency of parallel and communication algorithm 
C     parameters and determine amount of work space required.
C
C     Initialize buffers size indicators (which are used in INPUT when
C     allocating work space).
      BUFSWS2 = 0
      BUFSWS3 = 0
C
C     Legendre transform algorithms
C
      IF ((LTOPT .LT. 0) .OR. (LTOPT .GT. 1)) THEN
        IF (ME .EQ. 0) WRITE(0,701) LTOPT
  701   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &        ' ILLEGAL PARALLEL ALGORITHM OPTION SPECIFIED FOR',/,
     &        ' LEGENDRE TRANSFORM',/,
     &        ' LTOPT = ',I4)
        STOP
      ENDIF
C
      IF (LTOPT .EQ. 0) THEN
C       distributed vector sum algorithms
C
        IF ((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 41)) THEN
          IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
  702     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' FORWARD LEGENDRE TRANSFORM',/,
     &          ' LTOPT = ',I4,' COMMFLT = ',I4, ' COMMILT = ',I4)
          STOP
        ENDIF

        IF (COMMFLT .LT. 10) THEN
C         ring-pipeline algorithms for forward and inverse Legendre 
C         transforms
C
C         1) SHIFTSUM algorithm for forward Legendre transform
C
C         Need at most NLTSTEPS-1 buffers in recv-ahead 
C         variants, but if specify (BUFFERS .EQ. NLTSTEPS), can 
C         request all of these buffers at one time. If 
C         (BUFFERS .EQ. NLTSTEPS-1), then, for code consistency
C         between all recv-ahead variants, must delay the last 
C         request somewhat. 
          IF (BUFSFLT .GT. NLTSTEPS) BUFSFLT = NLTSTEPS 
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 2)) THEN
            IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
            STOP
          ENDIF
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. (COMMFLT .NE. 1))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
 703        FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &            ' FORWARD LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMFLT = ',I4,' PROTFLT = ',I4)
            STOP
          ENDIF
C
C         2) SHIFT algorithm for inverse Legendre transform
C
C         Need at most NLTSTEPS-1 buffers in recv-ahead 
C         variants, but if specify (BUFFERS .EQ. NLTSTEPS+1), can 
C         request all of these buffers at one time. If 
C         (BUFFERS .EQ. NLTSTEPS) or (BUFFFERS .EQ. NLTSTEPS-1), 
C         then, for code consistency between all recv-ahead variants, 
C         must delay the last requests somewhat. 
          IF (BUFSILT .GT. NLTSTEPS+1) BUFSILT = NLTSTEPS + 1
          IF (BUFSILT .LE. 3) BUFSILT = 2
C
          IF ((COMMILT .LT. 0) .OR. (COMMILT .GT. 2)) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
  704       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &            ' INVERSE LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMFLT = ',I4, ' COMMILT = ',I4)
            STOP
          ENDIF
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. 
     &         ((COMMFLT .EQ. 0) .OR. (COMMFLT .EQ. 2)))) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
  705       FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &            ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &            ' INVERSE LEGENDRE TRANSFORM',/,
     &            ' LTOPT = ',I4,' COMMILT = ',I4,' PROTILT = ',I4)
            STOP
          ENDIF
C
        ELSEIF (COMMFLT .LT. 20) THEN
C         nonoverlapped ring algorithm for forward Legendre transform
C
C         one buffer needed in recv-ahead variants of ring algorithm
          BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. 
     &         ((COMMFLT .EQ. 10) .OR. (COMMFLT .EQ. 12)))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            STOP
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            STOP
          ENDIF
C
        ELSEIF (COMMFLT .LT. 30) THEN
C         recursive halving algorithm for forward Legendre transform
C
C         number of buffers used in recv-ahead variants of recursive 
C         halving algorithm
          IF (BUFSFLT .GT. 3) BUFSFLT = 3
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. (COMMFLT .NE. 21))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            STOP
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            STOP
          ENDIF
C
        ELSEIF (COMMFLT .LT. 40) THEN
C         exchange algorithm for forward Legendre transform
C
C         Can use up to 2*log(P) buffers in nonblocking send and recv 
C         variants of EXCHSUM algorithm.
          LGP = LOG2(NPSPEC_S)        
          IF (BUFSFLT .GT. 2*LGP) BUFSFLT = 2*LGP
          IF (BUFSFLT .LT. 1) BUFSFLT = 1
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. (COMMFLT .NE. 31))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            STOP
          ENDIF
C
C         no interprocessor communication in inverse Legendre transform
          BUFSILT = 0
C
          IF (COMMILT .LT. 10) THEN
            IF (ME .EQ. 0) WRITE(0,704) LTOPT, COMMFLT, COMMILT
            STOP
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        IF (SUMOPT .EQ. 0) THEN
C         "natural" summation ordering, so need less storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+1)
        ELSE
C         binary tree summation ordering, so need extra storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+NLLATH_S)
        ENDIF
        BUFSWS3 = MAX(BUFSWS3, BUFSILT)
C
      ELSE
C       transpose algorithm for Legendre transforms
C
        IF (((COMMFLT .LT. 0) .OR. (COMMFLT .GT. 3)) .AND.
     &      ((COMMFLT .LT. 10) .OR. (COMMFLT .GT. 13)) .AND.
     &      ((COMMFLT .LT. 20) .OR. (COMMFLT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,702) LTOPT, COMMFLT, COMMILT
          STOP
        ENDIF
C
        IF (COMMFLT .LT. 20) THEN
C         O(P) transpose algorithms for forward Legendre transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSFLT = 2
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. 
     &         ((COMMFLT .EQ. 0) .OR. (COMMFLT .EQ. 2) .OR.
     &          (COMMFLT .EQ. 10) .OR. (COMMFLT .EQ. 12)))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            STOP
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for forward Legendre transform
C
C         Can use up to log(P)+1 buffers in nonblocking send and recv 
C         variants of LOGTRANS algorithm.
          IF (FTOPT .EQ. 1) THEN
            LGP = LOG2(NPFC_S)
          ELSE
            LGP = LOG2(NPVER_S)        
          ENDIF
          IF (BUFSFLT .GT. LGP+1) BUFSFLT = LGP + 1
          IF (BUFSFLT .LT. 2) BUFSFLT = 2
C
          IF ((PROTFLT .LT. 0) .OR. (PROTFLT .GT. 6) .OR. 
     &        ((PROTFLT .EQ. 6) .AND. (COMMFLT .EQ. 20))) THEN
            IF (ME .EQ. 0) WRITE(0,703) LTOPT, COMMFLT, PROTFLT
            STOP
          ENDIF
C
        ENDIF
C
        IF (((COMMILT .LT. 0) .OR. (COMMILT .GT. 3)) .AND.
     &      ((COMMILT .LT. 10) .OR. (COMMILT .GT. 13)) .AND.
     &      ((COMMILT .LT. 20) .OR. (COMMILT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,709) LTOPT, COMMFLT, COMMILT
          STOP
        ENDIF
C
        IF (COMMILT .LT. 20) THEN
C         O(P) transpose algorithms for inverse Legendre transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSILT = 2
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6) .OR. 
     &        ((PROTILT .EQ. 6) .AND. 
     &         ((COMMILT .EQ. 0) .OR. (COMMILT .EQ. 2) .OR.
     &          (COMMILT .EQ. 10) .OR. (COMMILT .EQ. 12)))) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
            STOP
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for inverse Legendre transform
C
C         Can use up to log(P)+1 buffers in nonblocking send and recv 
C         variants of LOGTRANS algorithm.
          IF (FTOPT .EQ. 1) THEN
            LGP = LOG2(NPFC_S)
          ELSE
            LGP = LOG2(NPVER_S)        
          ENDIF
          IF (BUFSILT .GT. LGP+1) BUFSILT = LGP + 1
          IF (BUFSILT .LT. 2) BUFSILT = 2
C
          IF ((PROTILT .LT. 0) .OR. (PROTILT .GT. 6) .OR. 
     &        ((PROTILT .EQ. 6) .AND. (COMMILT .EQ. 20))) THEN
            IF (ME .EQ. 0) WRITE(0,705) LTOPT, COMMILT, PROTILT
            STOP
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFLT)
        BUFSWS2 = MAX(BUFSWS2, BUFSILT)
        IF (SUMOPT .EQ. 0) THEN
C         "natural" summation ordering, so need less storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+1)
        ELSE
C         binary tree summation ordering, so need extra storage
C         in forward Legendre sum
          BUFSWS3 = MAX(BUFSWS3, BUFSFLT+NLLATH_S)
        ENDIF
        BUFSWS3 = MAX(BUFSWS3, BUFSILT)
C
      ENDIF
C
C     Fourier transform algorithms
C
      IF ((FTOPT .LT. 0) .OR. (FTOPT .GT. 1)) THEN
        IF (ME .EQ. 0) WRITE(0,706) FTOPT
  706   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &        ' ILLEGAL PARALLEL ALGORITHM OPTION SPECIFIED FOR',/,
     &        ' FOURIER TRANSFORM',/,
     &        ' FTOPT = ',I4)
        STOP
      ENDIF
C
      IF (FTOPT .EQ. 0) THEN
C       distributed algorithms for Fourier transform
C
C       Always need exactly one buffer for distributed algorithm,
C       (which is enough space to allow sending and receiving to proceed
C        simultaneously.)
        BUFSFFT = 1
        BUFSIFT = 1
C
        IF ((COMMFFT .LT. 0) .OR. (COMMFFT .GT. 4)) THEN
          IF (ME .EQ. 0) WRITE(0,707) FTOPT, COMMFFT, COMMIFT
  707     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' FORWARD FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' COMMIFT = ',I4)
          STOP
        ENDIF
C
        IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6) .OR. 
     &      ((PROTFFT .EQ. 6) .AND. (COMMFFT .NE. 1))) THEN
          IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
 708      FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &          ' FORWARD FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' PROTFFT = ',I4)
          STOP
        ENDIF
C
        IF ((COMMIFT .LT. 0) .OR. (COMMIFT .GT. 4)) THEN
          IF (ME .EQ. 0) WRITE(0,709) FTOPT, COMMFFT, COMMIFT
  709     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL COMMUNICATION OPTION SPECIFIED FOR',/,
     &          ' INVERSE FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMFFT = ',I4,' COMMIFT = ',I4)
          STOP
        ENDIF
C
        IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6) .OR. 
     &      ((PROTIFT .EQ. 6) .AND. (COMMIFT .NE. 1))) THEN
          IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
  720     FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE ALGINP ',/,
     &          ' ILLEGAL PROTOCOL OPTION SPECIFIED FOR',/,
     &          ' INVERSE FOURIER TRANSFORM',/,
     &          ' FTOPT = ',I4,' COMMIFT = ',I4,' PROTIFT = ',I4)
          STOP
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFFT)
        BUFSWS2 = MAX(BUFSWS2, BUFSIFT)
C
      ELSEIF (FTOPT .EQ. 1) THEN
C       transpose algorithms for Fourier transform
C
        IF (((COMMFFT .LT. 0) .OR. (COMMFFT .GT. 3)) .AND.
     &      ((COMMFFT .LT. 10) .OR. (COMMFFT .GT. 13)) .AND.
     &      ((COMMFFT .LT. 20) .OR. (COMMFFT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,707) FTOPT, COMMFFT, COMMIFT
          STOP
        ENDIF
C
        IF (COMMFFT .LT. 20) THEN
C         O(P) transpose algorithms for forward Fourier transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSFFT = 2
C
          IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6) .OR. 
     &        ((PROTFFT .EQ. 6) .AND. 
     &         ((COMMFFT .EQ. 0) .OR. (COMMFFT .EQ. 2) .OR.
     &          (COMMFFT .EQ. 10) .OR. (COMMFFT .EQ. 12)))) THEN
            IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
            STOP
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for forward Fourier transform
C
C         Can use up to log(P)+1 buffers in nonblocking send and recv 
C         variants of LOGTRANS algorithm.
          LGP = LOG2(NPLON_P)        
          IF (BUFSFFT .GT. LGP+1) BUFSFFT = LGP + 1
          IF (BUFSFFT .LT. 2) BUFSFFT = 2
C
          IF ((PROTFFT .LT. 0) .OR. (PROTFFT .GT. 6) .OR. 
     &        ((PROTFFT .EQ. 6) .AND. ((COMMFFT .EQ. 20)))) THEN
            IF (ME .EQ. 0) WRITE(0,708) FTOPT, COMMFFT, PROTFFT
            STOP
          ENDIF
C
        ENDIF
C
        IF (((COMMIFT .LT. 0) .OR. (COMMIFT .GT. 3)) .AND.
     &      ((COMMIFT .LT. 10) .OR. (COMMIFT .GT. 13)) .AND.
     &      ((COMMIFT .LT. 20) .OR. (COMMIFT .GT. 21))) THEN
          IF (ME .EQ. 0) WRITE(0,709) FTOPT, COMMFFT, COMMIFT
          STOP
        ENDIF
C
        IF (COMMIFT .LT. 20) THEN
C         O(P) transpose algorithms for inverse Fourier transform
C
C         Need two buffers for O(P) transpose algorithms.
          BUFSIFT = 2
C
          IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6) .OR. 
     &        ((PROTIFT .EQ. 6) .AND. 
     &         ((COMMIFT .EQ. 0) .OR. (COMMIFT .EQ. 2) .OR.
     &          (COMMIFT .EQ. 10) .OR. (COMMIFT .EQ. 12)))) THEN
            IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
            STOP
          ENDIF
C
        ELSE
C         O(log(P)) transpose algorithms for inverse Fourier transform
C
C         Can use up to log(P)+1 buffers in nonblocking send and recv 
C         variants of LOGTRANS algorithm.
          LGP = LOG2(NPLON_P)        
          IF (BUFSIFT .GT. LGP+1) BUFSIFT = LGP + 1
          IF (BUFSIFT .LT. 2) BUFSIFT = 2
C
          IF ((PROTIFT .LT. 0) .OR. (PROTIFT .GT. 6) .OR. 
     &        ((PROTIFT .EQ. 6) .AND. ((COMMIFT .EQ. 20)))) THEN
            IF (ME .EQ. 0) WRITE(0,720) FTOPT, COMMIFT, PROTIFT
            STOP
          ENDIF
C
        ENDIF
C
C       Translate number of communication buffers into storage
C       requirement indicators.
        BUFSWS2 = MAX(BUFSWS2, BUFSFFT)
        BUFSWS2 = MAX(BUFSWS2, BUFSIFT)
C
      ENDIF
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     Output primary algorithm and domain decomposition parameters.
C
      IF (ME .EQ. 0) THEN
        WRITE (6,201) PX, PY
  201   FORMAT (/,' NUMBER OF PROCESSORS IN PARALLELIZATION'
     &          /,' ROW PROCESSORS:    PX = ',I4,
     &          /,' COLUMN PROCESSORS: PY = ',I4,/)
C
        WRITE(6,*) 'PARALLEL ALGORITHMS'
        IF (FTOPT .EQ. 0) THEN
          WRITE(6,*) '  FT: DISTRIBUTED'
        ELSEIF (FTOPT .EQ. 1) THEN
          IF (COMMFFT .LT. 20) THEN
            WRITE(6,*) ' FFT: TRANSPOSE (O(P))'
          ELSEIF (COMMFFT .LT. 30) THEN
            WRITE(6,*) ' FFT: TRANSPOSE (O(LOG P))'
          ENDIF
          IF (COMMIFT .LT. 20) THEN
            WRITE(6,*) ' IFT: TRANSPOSE (O(P))'
          ELSEIF (COMMIFT .LT. 30) THEN
            WRITE(6,*) ' IFT: TRANSPOSE (O(LOG P))'
          ENDIF
        ENDIF
C
        IF (LTOPT .EQ. 0) THEN
          IF (COMMFLT .LT. 10) THEN
            WRITE(6,*) '  LT: DISTRIBUTED (OVERLAPPED RING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 20) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (RING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 30) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (RECURSIVE HALVING VECTOR SUM)'
          ELSEIF (COMMFLT .LT. 40) THEN
            WRITE(6,*)'  LT: DISTRIBUTED (EXCHANGE VECTOR SUM)'
          ENDIF
        ELSEIF (LTOPT .EQ. 1) THEN
          IF (COMMFLT .LT. 20) THEN
            WRITE(6,*) ' FLT: TRANSPOSE (O(P))'
          ELSEIF (COMMFLT .LT. 30) THEN
            WRITE(6,*) ' FLT: TRANSPOSE (O(LOG P))'
          ENDIF
          IF (COMMILT .LT. 20) THEN
            WRITE(6,*) ' ILT: TRANSPOSE (O(P))'
          ELSEIF (COMMILT .LT. 30) THEN
            WRITE(6,*) ' ILT: TRANSPOSE (O(LOG P))'
          ENDIF
        ENDIF
C
        WRITE(6,*) ' NLLAT_P =',NLLAT_P, ';   NLLON_P =',NLLON_P,
     &             ' NLVER_P =',NLVER_P
        WRITE(6,*) ' NLLAT_F =',NLLAT_F, ';   NLLON_F =',NLLON_F
        WRITE(6,*) ' NLVER_F =',NLVER_F, ';   NLFC_F  =',NLFC_F
        WRITE(6,*) ' NLLAT_S =',NLLAT_S, ';   NLFC_S  =',NLFC_S,
     &             ' NLVER_S =',NLVER_S
C
        WRITE(6,*) ' MXLLON_P=',MXLLON_P, ';   MXLLON_F=',MXLLON_F
        WRITE(6,*) ' MXLFC_F =',MXLFC_F, ';   MXLFC_S =',MXLFC_S
C
      ENDIF
C
      RETURN
      END
