ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
psgbtrs.f
Go to the documentation of this file.
00001       SUBROUTINE PSGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
00002      $                    B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       CHARACTER          TRANS
00010       INTEGER            BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
00011 *     ..
00012 *     .. Array Arguments ..
00013       INTEGER            DESCA( * ), DESCB( * ), IPIV( * )
00014       REAL               A( * ), AF( * ), B( * ), WORK( * )
00015 *     ..
00016 *
00017 *  Purpose
00018 *  =======
00019 *
00020 *  PSGBTRS solves a system of linear equations
00021 *
00022 *            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
00023 *                                    or
00024 *            A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
00025 *
00026 *  where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
00027 *  stored in A(1:N,JA:JA+N-1) and AF by PSGBTRF.
00028 *  A(1:N, JA:JA+N-1) is an N-by-N real
00029 *  banded distributed
00030 *  matrix with bandwidth BWL, BWU.
00031 *
00032 *  Routine PSGBTRF MUST be called first.
00033 *
00034 *  =====================================================================
00035 *
00036 *  Arguments
00037 *  =========
00038 *
00039 *
00040 *  TRANS   (global input) CHARACTER
00041 *          = 'N':  Solve with A(1:N, JA:JA+N-1);
00042 *          = 'T' or 'C':  Solve with A(1:N, JA:JA+N-1)^T;
00043 *
00044 *  N       (global input) INTEGER
00045 *          The number of rows and columns to be operated on, i.e. the
00046 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00047 *
00048 *  BWL     (global input) INTEGER
00049 *          Number of subdiagonals. 0 <= BWL <= N-1
00050 *
00051 *  BWU     (global input) INTEGER
00052 *          Number of superdiagonals. 0 <= BWU <= N-1
00053 *
00054 *  NRHS    (global input) INTEGER
00055 *          The number of right hand sides, i.e., the number of columns
00056 *          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
00057 *          NRHS >= 0.
00058 *
00059 *  A       (local input/local output) REAL pointer into
00060 *          local memory to an array with first dimension
00061 *          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
00062 *          On entry, this array contains the local pieces of the
00063 *          N-by-N unsymmetric banded distributed Cholesky factor L or
00064 *          L^T A(1:N, JA:JA+N-1).
00065 *          This local portion is stored in the packed banded format
00066 *            used in LAPACK. Please see the Notes below and the
00067 *            ScaLAPACK manual for more detail on the format of
00068 *            distributed matrices.
00069 *
00070 *  JA      (global input) INTEGER
00071 *          The index in the global array A that points to the start of
00072 *          the matrix to be operated on (which may be either all of A
00073 *          or a submatrix of A).
00074 *
00075 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00076 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00077 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00078 *          The array descriptor for the distributed matrix A.
00079 *          Contains information of mapping of A to memory. Please
00080 *          see NOTES below for full description and options.
00081 *
00082 *  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ).
00083 *          Pivot indices for local factorizations.
00084 *          Users *should not* alter the contents between
00085 *          factorization and solve.
00086 *
00087 *  B       (local input/local output) REAL pointer into
00088 *          local memory to an array of local lead dimension lld_b>=NB.
00089 *          On entry, this array contains the
00090 *          the local pieces of the right hand sides
00091 *          B(IB:IB+N-1, 1:NRHS).
00092 *          On exit, this contains the local piece of the solutions
00093 *          distributed matrix X.
00094 *
00095 *  IB      (global input) INTEGER
00096 *          The row index in the global array B that points to the first
00097 *          row of the matrix to be operated on (which may be either
00098 *          all of B or a submatrix of B).
00099 *
00100 *  DESCB   (global and local input) INTEGER array of dimension DLEN.
00101 *          if 1D type (DTYPE_B=502), DLEN >=7;
00102 *          if 2D type (DTYPE_B=1), DLEN >= 9.
00103 *          The array descriptor for the distributed matrix B.
00104 *          Contains information of mapping of B to memory. Please
00105 *          see NOTES below for full description and options.
00106 *
00107 *  AF      (local output) REAL array, dimension LAF.
00108 *          Auxiliary Fillin Space.
00109 *          Fillin is created during the factorization routine
00110 *          PSGBTRF and this is stored in AF. If a linear system
00111 *          is to be solved using PSGBTRS after the factorization
00112 *          routine, AF *must not be altered* after the factorization.
00113 *
00114 *  LAF     (local input) INTEGER
00115 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00116 *          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
00117 *          If LAF is not large enough, an error code will be returned
00118 *          and the minimum acceptable size will be returned in AF( 1 )
00119 *
00120 *  WORK    (local workspace/local output)
00121 *          REAL temporary workspace. This space may
00122 *          be overwritten in between calls to routines. WORK must be
00123 *          the size given in LWORK.
00124 *          On exit, WORK( 1 ) contains the minimal LWORK.
00125 *
00126 *  LWORK   (local input or global input) INTEGER
00127 *          Size of user-input workspace WORK.
00128 *          If LWORK is too small, the minimal acceptable size will be
00129 *          returned in WORK(1) and an error code is returned. LWORK>=
00130 *          NRHS*(NB+2*bwl+4*bwu)
00131 *
00132 *  INFO    (global output) INTEGER
00133 *          = 0:  successful exit
00134 *          < 0:  If the i-th argument is an array and the j-entry had
00135 *                an illegal value, then INFO = -(i*100+j), if the i-th
00136 *                argument is a scalar and had an illegal value, then
00137 *                INFO = -i.
00138 *
00139 *  =====================================================================
00140 *
00141 *  Restrictions
00142 *  ============
00143 *
00144 *  The following are restrictions on the input parameters. Some of these
00145 *    are temporary and will be removed in future releases, while others
00146 *    may reflect fundamental technical limitations.
00147 *
00148 *    Non-cyclic restriction: VERY IMPORTANT!
00149 *      P*NB>= mod(JA-1,NB)+N.
00150 *      The mapping for matrices must be blocked, reflecting the nature
00151 *      of the divide and conquer algorithm as a task-parallel algorithm.
00152 *      This formula in words is: no processor may have more than one
00153 *      chunk of the matrix.
00154 *
00155 *    Blocksize cannot be too small:
00156 *      If the matrix spans more than one processor, the following
00157 *      restriction on NB, the size of each block on each processor,
00158 *      must hold:
00159 *      NB >= (BWL+BWU)+1
00160 *      The bulk of parallel computation is done on the matrix of size
00161 *      O(NB) on each processor. If this is too small, divide and conquer
00162 *      is a poor choice of algorithm.
00163 *
00164 *    Submatrix reference:
00165 *      JA = IB
00166 *      Alignment restriction that prevents unnecessary communication.
00167 *
00168 *  =====================================================================
00169 *
00170 *  Notes
00171 *  =====
00172 *
00173 *  If the factorization routine and the solve routine are to be called
00174 *    separately (to solve various sets of righthand sides using the same
00175 *    coefficient matrix), the auxiliary space AF *must not be altered*
00176 *    between calls to the factorization routine and the solve routine.
00177 *
00178 *  The best algorithm for solving banded and tridiagonal linear systems
00179 *    depends on a variety of parameters, especially the bandwidth.
00180 *    Currently, only algorithms designed for the case N/P >> bw are
00181 *    implemented. These go by many names, including Divide and Conquer,
00182 *    Partitioning, domain decomposition-type, etc.
00183 *
00184 *  Algorithm description: Divide and Conquer
00185 *
00186 *    The Divide and Conqer algorithm assumes the matrix is narrowly
00187 *      banded compared with the number of equations. In this situation,
00188 *      it is best to distribute the input matrix A one-dimensionally,
00189 *      with columns atomic and rows divided amongst the processes.
00190 *      The basic algorithm divides the banded matrix up into
00191 *      P pieces with one stored on each processor,
00192 *      and then proceeds in 2 phases for the factorization or 3 for the
00193 *      solution of a linear system.
00194 *      1) Local Phase:
00195 *         The individual pieces are factored independently and in
00196 *         parallel. These factors are applied to the matrix creating
00197 *         fillin, which is stored in a non-inspectable way in auxiliary
00198 *         space AF. Mathematically, this is equivalent to reordering
00199 *         the matrix A as P A P^T and then factoring the principal
00200 *         leading submatrix of size equal to the sum of the sizes of
00201 *         the matrices factored on each processor. The factors of
00202 *         these submatrices overwrite the corresponding parts of A
00203 *         in memory.
00204 *      2) Reduced System Phase:
00205 *         A small (max(bwl,bwu)* (P-1)) system is formed representing
00206 *         interaction of the larger blocks, and is stored (as are its
00207 *         factors) in the space AF. A parallel Block Cyclic Reduction
00208 *         algorithm is used. For a linear system, a parallel front solve
00209 *         followed by an analagous backsolve, both using the structure
00210 *         of the factored matrix, are performed.
00211 *      3) Backsubsitution Phase:
00212 *         For a linear system, a local backsubstitution is performed on
00213 *         each processor in parallel.
00214 *
00215 *
00216 *  Descriptors
00217 *  ===========
00218 *
00219 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00220 *
00221 *  Note: banded codes can use either the old two dimensional
00222 *    or new one-dimensional descriptors, though the processor grid in
00223 *    both cases *must be one-dimensional*. We describe both types below.
00224 *
00225 *  Each global data object is described by an associated description
00226 *  vector.  This vector stores the information required to establish
00227 *  the mapping between an object element and its corresponding process
00228 *  and memory location.
00229 *
00230 *  Let A be a generic term for any 2D block cyclicly distributed array.
00231 *  Such a global array has an associated description vector DESCA.
00232 *  In the following comments, the character _ should be read as
00233 *  "of the global array".
00234 *
00235 *  NOTATION        STORED IN      EXPLANATION
00236 *  --------------- -------------- --------------------------------------
00237 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00238 *                                 DTYPE_A = 1.
00239 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00240 *                                 the BLACS process grid A is distribu-
00241 *                                 ted over. The context itself is glo-
00242 *                                 bal, but the handle (the integer
00243 *                                 value) may vary.
00244 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00245 *                                 array A.
00246 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00247 *                                 array A.
00248 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00249 *                                 the rows of the array.
00250 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00251 *                                 the columns of the array.
00252 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00253 *                                 row of the array A is distributed.
00254 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00255 *                                 first column of the array A is
00256 *                                 distributed.
00257 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00258 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00259 *
00260 *  Let K be the number of rows or columns of a distributed matrix,
00261 *  and assume that its process grid has dimension p x q.
00262 *  LOCr( K ) denotes the number of elements of K that a process
00263 *  would receive if K were distributed over the p processes of its
00264 *  process column.
00265 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00266 *  process would receive if K were distributed over the q processes of
00267 *  its process row.
00268 *  The values of LOCr() and LOCc() may be determined via a call to the
00269 *  ScaLAPACK tool function, NUMROC:
00270 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00271 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00272 *  An upper bound for these quantities may be computed by:
00273 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00274 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00275 *
00276 *
00277 *  One-dimensional descriptors:
00278 *
00279 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00280 *    version 1.0. They simplify and shorten the descriptor for 1D
00281 *    arrays.
00282 *
00283 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00284 *    object, we allow 1D arrays to be distributed either over the
00285 *    first dimension of the array (as if the grid were P-by-1) or the
00286 *    2nd dimension (as if the grid were 1-by-P). This choice is
00287 *    indicated by the descriptor type (501 or 502)
00288 *    as described below.
00289 *
00290 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00291 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00292 *    irrespective of which one-dimensional descriptor type
00293 *    (501 or 502) is input.
00294 *    This routine will interpret the grid properly either way.
00295 *    ScaLAPACK routines *do not support intercontext operations* so that
00296 *    the grid passed to a single ScaLAPACK routine *must be the same*
00297 *    for all array descriptors passed to that routine.
00298 *
00299 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00300 *    may also be used, since a one-dimensional array is a special case
00301 *    of a two-dimensional array with one dimension of size unity.
00302 *    The two-dimensional array used in this case *must* be of the
00303 *    proper orientation:
00304 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00305 *      (1 by P type), then the two dimensional descriptor must
00306 *      have a CTXT value that refers to a 1 by P BLACS grid;
00307 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00308 *      (P by 1 type), then the two dimensional descriptor must
00309 *      have a CTXT value that refers to a P by 1 BLACS grid.
00310 *
00311 *
00312 *  Summary of allowed descriptors, types, and BLACS grids:
00313 *  DTYPE           501         502         1         1
00314 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00315 *  -----------------------------------------------------
00316 *  A               OK          NO          OK        NO
00317 *  B               NO          OK          NO        OK
00318 *
00319 *  Note that a consequence of this chart is that it is not possible
00320 *    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
00321 *    to opposite requirements for the orientation of the BLACS grid,
00322 *    and as noted before, the *same* BLACS context must be used in
00323 *    all descriptors in a single ScaLAPACK subroutine call.
00324 *
00325 *  Let A be a generic term for any 1D block cyclicly distributed array.
00326 *  Such a global array has an associated description vector DESCA.
00327 *  In the following comments, the character _ should be read as
00328 *  "of the global array".
00329 *
00330 *  NOTATION        STORED IN  EXPLANATION
00331 *  --------------- ---------- ------------------------------------------
00332 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00333 *                                TYPE_A = 501: 1-by-P grid.
00334 *                                TYPE_A = 502: P-by-1 grid.
00335 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00336 *                                the BLACS process grid A is distribu-
00337 *                                ted over. The context itself is glo-
00338 *                                bal, but the handle (the integer
00339 *                                value) may vary.
00340 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00341 *                                distributed.
00342 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00343 *                                the distributed dimension of the array.
00344 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00345 *                                first row or column of the array
00346 *                                is distributed.
00347 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00348 *                                storing the local blocks of the distri-
00349 *                                buted array A. Minimum value of LLD_A
00350 *                                depends on TYPE_A.
00351 *                                TYPE_A = 501: LLD_A >=
00352 *                                   size of undistributed dimension, 1.
00353 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00354 *  Reserved        DESCA( 7 ) Reserved for future use.
00355 *
00356 *  =====================================================================
00357 *
00358 *  Implemented for ScaLAPACK by:
00359 *     Andrew J. Cleary, Livermore National Lab and University of Tenn.,
00360 *     and Markus Hegland, Australian National University. Feb., 1997.
00361 *  Based on code written by    : Peter Arbenz, ETH Zurich, 1996.
00362 *  Last modified by:  Peter Arbenz, Institute of Scientific Computing,
00363 *    ETH, Zurich.
00364 *
00365 *  =====================================================================
00366 *
00367 *     .. Parameters ..
00368       REAL               ONE
00369       PARAMETER          ( ONE = 1.0E+0 )
00370       REAL               ZERO
00371       PARAMETER          ( ZERO = 0.0E+0 )
00372       INTEGER            INT_ONE
00373       PARAMETER          ( INT_ONE = 1 )
00374       INTEGER            DESCMULT, BIGNUM
00375       PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
00376       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00377      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00378       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00379      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00380      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00381 *     ..
00382 *     .. Local Scalars ..
00383       INTEGER            APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
00384      $                   FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
00385      $                   IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB,
00386      $                   LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL,
00387      $                   MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW,
00388      $                   NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET,
00389      $                   RECOVERY_VAL, RETURN_CODE, STORE_M_B,
00390      $                   STORE_N_A, WORK_SIZE_MIN, WPTR
00391 *     ..
00392 *     .. Local Arrays ..
00393       INTEGER            DESCA_1XP( 7 ), DESCB_PX1( 7 ),
00394      $                   PARAM_CHECK( 17, 3 )
00395 *     ..
00396 *     .. External Subroutines ..
00397       EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, SCOPY,
00398      $                   DESC_CONVERT, SGEMM, SGEMV, SGER, SGERV2D,
00399      $                   SGESD2D, SGETRS, SLAMOV, SLASWP, SSCAL, SSWAP,
00400      $                   STRSM, GLOBCHK, PXERBLA, RESHAPE
00401 *     ..
00402 *     .. External Functions ..
00403       LOGICAL            LSAME
00404       INTEGER            NUMROC
00405       EXTERNAL           LSAME, NUMROC
00406 *     ..
00407 *     .. Intrinsic Functions ..
00408       INTRINSIC          ICHAR, MAX, MIN, MOD
00409 *     ..
00410 *     .. Executable Statements ..
00411 *
00412 *
00413 *     Test the input parameters
00414 *
00415       INFO = 0
00416 *
00417 *     Convert descriptor into standard form for easy access to
00418 *        parameters, check that grid is of right shape.
00419 *
00420       DESCA_1XP( 1 ) = 501
00421       DESCB_PX1( 1 ) = 502
00422 *
00423       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00424 *
00425       IF( RETURN_CODE.NE.0 ) THEN
00426          INFO = -( 8*100+2 )
00427       END IF
00428 *
00429       CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE )
00430 *
00431       IF( RETURN_CODE.NE.0 ) THEN
00432          INFO = -( 11*100+2 )
00433       END IF
00434 *
00435 *     Consistency checks for DESCA and DESCB.
00436 *
00437 *     Context must be the same
00438       IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN
00439          INFO = -( 11*100+2 )
00440       END IF
00441 *
00442 *        These are alignment restrictions that may or may not be removed
00443 *        in future releases. -Andy Cleary, April 14, 1996.
00444 *
00445 *     Block sizes must be the same
00446       IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN
00447          INFO = -( 11*100+4 )
00448       END IF
00449 *
00450 *     Source processor must be the same
00451 *
00452       IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN
00453          INFO = -( 11*100+5 )
00454       END IF
00455 *
00456 *     Get values out of descriptor for use in code.
00457 *
00458       ICTXT = DESCA_1XP( 2 )
00459       CSRC = DESCA_1XP( 5 )
00460       NB = DESCA_1XP( 4 )
00461       LLDA = DESCA_1XP( 6 )
00462       STORE_N_A = DESCA_1XP( 3 )
00463       LLDB = DESCB_PX1( 6 )
00464       STORE_M_B = DESCB_PX1( 3 )
00465 *
00466 *     Get grid parameters
00467 *
00468 *
00469       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00470       NP = NPROW*NPCOL
00471 *
00472 *
00473 *
00474       IF( LSAME( TRANS, 'N' ) ) THEN
00475          IDUM2 = ICHAR( 'N' )
00476       ELSE IF( LSAME( TRANS, 'T' ) ) THEN
00477          IDUM2 = ICHAR( 'T' )
00478       ELSE IF( LSAME( TRANS, 'C' ) ) THEN
00479          IDUM2 = ICHAR( 'T' )
00480       ELSE
00481          INFO = -1
00482       END IF
00483 *
00484       IF( LWORK.LT.-1 ) THEN
00485          INFO = -16
00486       ELSE IF( LWORK.EQ.-1 ) THEN
00487          IDUM3 = -1
00488       ELSE
00489          IDUM3 = 1
00490       END IF
00491 *
00492       IF( N.LT.0 ) THEN
00493          INFO = -2
00494       END IF
00495 *
00496       IF( N+JA-1.GT.STORE_N_A ) THEN
00497          INFO = -( 8*100+6 )
00498       END IF
00499 *
00500       IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN
00501          INFO = -3
00502       END IF
00503 *
00504       IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN
00505          INFO = -4
00506       END IF
00507 *
00508       IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN
00509          INFO = -( 8*100+6 )
00510       END IF
00511 *
00512       IF( NB.LE.0 ) THEN
00513          INFO = -( 8*100+4 )
00514       END IF
00515 *
00516       BW = BWU + BWL
00517 *
00518       IF( N+IB-1.GT.STORE_M_B ) THEN
00519          INFO = -( 11*100+3 )
00520       END IF
00521 *
00522       IF( LLDB.LT.NB ) THEN
00523          INFO = -( 11*100+6 )
00524       END IF
00525 *
00526       IF( NRHS.LT.0 ) THEN
00527          INFO = -5
00528       END IF
00529 *
00530 *     Current alignment restriction
00531 *
00532       IF( JA.NE.IB ) THEN
00533          INFO = -7
00534       END IF
00535 *
00536 *     Argument checking that is specific to Divide & Conquer routine
00537 *
00538       IF( NPROW.NE.1 ) THEN
00539          INFO = -( 8*100+2 )
00540       END IF
00541 *
00542       IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
00543          INFO = -( 2 )
00544          CALL PXERBLA( ICTXT, 'PSGBTRS, D&C alg.: only 1 block per proc'
00545      $                 , -INFO )
00546          RETURN
00547       END IF
00548 *
00549       IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN
00550          INFO = -( 8*100+4 )
00551          CALL PXERBLA( ICTXT, 'PSGBTRS, D&C alg.: NB too small', -INFO )
00552          RETURN
00553       END IF
00554 *
00555 *
00556 *     Check worksize
00557 *
00558       WORK_SIZE_MIN = NRHS*( NB+2*BWL+4*BWU )
00559 *
00560       WORK( 1 ) = WORK_SIZE_MIN
00561 *
00562       IF( LWORK.LT.WORK_SIZE_MIN ) THEN
00563          IF( LWORK.NE.-1 ) THEN
00564             INFO = -16
00565             CALL PXERBLA( ICTXT, 'PSGBTRS: worksize error ', -INFO )
00566          END IF
00567          RETURN
00568       END IF
00569 *
00570 *     Pack params and positions into arrays for global consistency check
00571 *
00572       PARAM_CHECK( 17, 1 ) = DESCB( 5 )
00573       PARAM_CHECK( 16, 1 ) = DESCB( 4 )
00574       PARAM_CHECK( 15, 1 ) = DESCB( 3 )
00575       PARAM_CHECK( 14, 1 ) = DESCB( 2 )
00576       PARAM_CHECK( 13, 1 ) = DESCB( 1 )
00577       PARAM_CHECK( 12, 1 ) = IB
00578       PARAM_CHECK( 11, 1 ) = DESCA( 5 )
00579       PARAM_CHECK( 10, 1 ) = DESCA( 4 )
00580       PARAM_CHECK( 9, 1 ) = DESCA( 3 )
00581       PARAM_CHECK( 8, 1 ) = DESCA( 1 )
00582       PARAM_CHECK( 7, 1 ) = JA
00583       PARAM_CHECK( 6, 1 ) = NRHS
00584       PARAM_CHECK( 5, 1 ) = BWU
00585       PARAM_CHECK( 4, 1 ) = BWL
00586       PARAM_CHECK( 3, 1 ) = N
00587       PARAM_CHECK( 2, 1 ) = IDUM3
00588       PARAM_CHECK( 1, 1 ) = IDUM2
00589 *
00590       PARAM_CHECK( 17, 2 ) = 1105
00591       PARAM_CHECK( 16, 2 ) = 1104
00592       PARAM_CHECK( 15, 2 ) = 1103
00593       PARAM_CHECK( 14, 2 ) = 1102
00594       PARAM_CHECK( 13, 2 ) = 1101
00595       PARAM_CHECK( 12, 2 ) = 10
00596       PARAM_CHECK( 11, 2 ) = 805
00597       PARAM_CHECK( 10, 2 ) = 804
00598       PARAM_CHECK( 9, 2 ) = 803
00599       PARAM_CHECK( 8, 2 ) = 801
00600       PARAM_CHECK( 7, 2 ) = 7
00601       PARAM_CHECK( 6, 2 ) = 5
00602       PARAM_CHECK( 5, 2 ) = 4
00603       PARAM_CHECK( 4, 2 ) = 3
00604       PARAM_CHECK( 3, 2 ) = 2
00605       PARAM_CHECK( 2, 2 ) = 16
00606       PARAM_CHECK( 1, 2 ) = 1
00607 *
00608 *     Want to find errors with MIN( ), so if no error, set it to a big
00609 *     number. If there already is an error, multiply by the the
00610 *     descriptor multiplier.
00611 *
00612       IF( INFO.GE.0 ) THEN
00613          INFO = BIGNUM
00614       ELSE IF( INFO.LT.-DESCMULT ) THEN
00615          INFO = -INFO
00616       ELSE
00617          INFO = -INFO*DESCMULT
00618       END IF
00619 *
00620 *     Check consistency across processors
00621 *
00622       CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17, PARAM_CHECK( 1, 3 ),
00623      $              INFO )
00624 *
00625 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00626 *     if error is not in a descriptor entry.
00627 *
00628       IF( INFO.EQ.BIGNUM ) THEN
00629          INFO = 0
00630       ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
00631          INFO = -INFO / DESCMULT
00632       ELSE
00633          INFO = -INFO
00634       END IF
00635 *
00636       IF( INFO.LT.0 ) THEN
00637          CALL PXERBLA( ICTXT, 'PSGBTRS', -INFO )
00638          RETURN
00639       END IF
00640 *
00641 *     Quick return if possible
00642 *
00643       IF( N.EQ.0 )
00644      $   RETURN
00645 *
00646       IF( NRHS.EQ.0 )
00647      $   RETURN
00648 *
00649 *
00650 *     Adjust addressing into matrix space to properly get into
00651 *        the beginning part of the relevant data
00652 *
00653       PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
00654 *
00655       IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
00656          PART_OFFSET = PART_OFFSET + NB
00657       END IF
00658 *
00659       IF( MYCOL.LT.CSRC ) THEN
00660          PART_OFFSET = PART_OFFSET - NB
00661       END IF
00662 *
00663 *     Form a new BLACS grid (the "standard form" grid) with only procs
00664 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00665 *        starting at csrc=0, with JA modified to reflect dropped procs.
00666 *
00667 *     First processor to hold part of the matrix:
00668 *
00669       FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
00670 *
00671 *     Calculate new JA one while dropping off unused processors.
00672 *
00673       JA_NEW = MOD( JA-1, NB ) + 1
00674 *
00675 *     Save and compute new value of NP
00676 *
00677       NP_SAVE = NP
00678       NP = ( JA_NEW+N-2 ) / NB + 1
00679 *
00680 *     Call utility routine that forms "standard-form" grid
00681 *
00682       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
00683      $              INT_ONE, NP )
00684 *
00685 *     Use new context from standard grid as context.
00686 *
00687       ICTXT_SAVE = ICTXT
00688       ICTXT = ICTXT_NEW
00689       DESCA_1XP( 2 ) = ICTXT_NEW
00690       DESCB_PX1( 2 ) = ICTXT_NEW
00691 *
00692 *     Get information about new grid.
00693 *
00694       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00695 *
00696 *     Drop out processors that do not have part of the matrix.
00697 *
00698       IF( MYROW.LT.0 ) THEN
00699          GO TO 100
00700       END IF
00701 *
00702 *
00703 *
00704 *     Begin main code
00705 *
00706 *     Move data into workspace - communicate/copy (overlap)
00707 *
00708       IF( MYCOL.LT.NPCOL-1 ) THEN
00709          CALL SGESD2D( ICTXT, BWU, NRHS, B( NB-BWU+1 ), LLDB, 0,
00710      $                 MYCOL+1 )
00711       END IF
00712 *
00713       IF( MYCOL.LT.NPCOL-1 ) THEN
00714          LM = NB - BWU
00715       ELSE
00716          LM = NB
00717       END IF
00718 *
00719       IF( MYCOL.GT.0 ) THEN
00720          WPTR = BWU + 1
00721       ELSE
00722          WPTR = 1
00723       END IF
00724 *
00725       LDW = NB + BWU + 2*BW + BWU
00726 *
00727       CALL SLAMOV( 'G', LM, NRHS, B( 1 ), LLDB, WORK( WPTR ), LDW )
00728 *
00729 *     Zero out rest of work
00730 *
00731       DO 20 J = 1, NRHS
00732          DO 10 L = WPTR + LM, LDW
00733             WORK( ( J-1 )*LDW+L ) = ZERO
00734    10    CONTINUE
00735    20 CONTINUE
00736 *
00737       IF( MYCOL.GT.0 ) THEN
00738          CALL SGERV2D( ICTXT, BWU, NRHS, WORK( 1 ), LDW, 0, MYCOL-1 )
00739       END IF
00740 *
00741 ********************************************************************
00742 *       PHASE 1: Local computation phase -- Solve L*X = B
00743 ********************************************************************
00744 *
00745 *     Size of main (or odd) partition in each processor
00746 *
00747       ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL )
00748 *
00749       IF( MYCOL.NE.0 ) THEN
00750          LBWL = BW
00751          LBWU = 0
00752          APTR = 1
00753       ELSE
00754          LBWL = BWL
00755          LBWU = BWU
00756          APTR = 1 + BWU
00757       END IF
00758 *
00759       IF( MYCOL.NE.NPCOL-1 ) THEN
00760          LM = NB - LBWU
00761          LN = NB - BW
00762       ELSE IF( MYCOL.NE.0 ) THEN
00763          LM = ODD_SIZE + BWU
00764          LN = MAX( ODD_SIZE-BW, 0 )
00765       ELSE
00766          LM = N
00767          LN = MAX( N-BW, 0 )
00768       END IF
00769 *
00770       DO 30 J = 1, LN
00771 *
00772          LMJ = MIN( LBWL, LM-J )
00773          L = IPIV( J )
00774 *
00775          IF( L.NE.J ) THEN
00776             CALL SSWAP( NRHS, WORK( L ), LDW, WORK( J ), LDW )
00777          END IF
00778 *
00779          LPTR = BW + 1 + ( J-1 )*LLDA + APTR
00780 *
00781          CALL SGER( LMJ, NRHS, -ONE, A( LPTR ), 1, WORK( J ), LDW,
00782      $              WORK( J+1 ), LDW )
00783 *
00784    30 CONTINUE
00785 *
00786 ********************************************************************
00787 *       PHASE 2: Global computation phase -- Solve L*X = B
00788 ********************************************************************
00789 *
00790 *     Define the initial dimensions of the diagonal blocks
00791 *     The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
00792 *
00793       IF( MYCOL.NE.NPCOL-1 ) THEN
00794          BM = BW - LBWU
00795          BN = BW
00796       ELSE
00797          BM = MIN( BW, ODD_SIZE ) + BWU
00798          BN = MIN( BW, ODD_SIZE )
00799       END IF
00800 *
00801 *     Pointer to first element of block bidiagonal matrix in AF
00802 *     Leading dimension of block bidiagonal system
00803 *
00804       BBPTR = ( NB+BWU )*BW + 1
00805       LDBB = 2*BW + BWU
00806 *
00807       IF( NPCOL.EQ.1 ) THEN
00808 *
00809 *        In this case the loop over the levels will not be
00810 *        performed.
00811          CALL SGETRS( 'N', N-LN, NRHS, AF( BBPTR+BW*LDBB ), LDBB,
00812      $                IPIV( LN+1 ), WORK( LN+1 ), LDW, INFO )
00813 *
00814       END IF
00815 *
00816 * Loop over levels ...
00817 *
00818 *     The two integers NPACT (nu. of active processors) and NPSTR
00819 *     (stride between active processors) is used to control the
00820 *     loop.
00821 *
00822       NPACT = NPCOL
00823       NPSTR = 1
00824 *
00825 *     Begin loop over levels
00826    40 CONTINUE
00827       IF( NPACT.LE.1 )
00828      $   GO TO 50
00829 *
00830 *     Test if processor is active
00831       IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN
00832 *
00833 *   Send/Receive blocks
00834 *
00835          IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN
00836 *
00837             NEICOL = MYCOL + NPSTR
00838 *
00839             IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN
00840 *
00841                IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN
00842                   BMN = BW
00843                ELSE
00844                   BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) +
00845      $                  BWU
00846                END IF
00847 *
00848                CALL SGESD2D( ICTXT, BM, NRHS, WORK( LN+1 ), LDW, 0,
00849      $                       NEICOL )
00850 *
00851                IF( NPACT.NE.2 ) THEN
00852 *
00853 *                     Receive answers back from partner processor
00854 *
00855                   CALL SGERV2D( ICTXT, BM+BMN-BW, NRHS, WORK( LN+1 ),
00856      $                          LDW, 0, NEICOL )
00857 *
00858                   BM = BM + BMN - BW
00859 *
00860                END IF
00861 *
00862             END IF
00863 *
00864          ELSE
00865 *
00866             NEICOL = MYCOL - NPSTR
00867 *
00868             IF( NEICOL.EQ.0 ) THEN
00869                BMN = BW - BWU
00870             ELSE
00871                BMN = BW
00872             END IF
00873 *
00874             CALL SLAMOV( 'G', BM, NRHS, WORK( LN+1 ), LDW,
00875      $                   WORK( NB+BWU+BMN+1 ), LDW )
00876 *
00877             CALL SGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ), LDW, 0,
00878      $                    NEICOL )
00879 *
00880 *               and do the permutations and eliminations
00881 *
00882             IF( NPACT.NE.2 ) THEN
00883 *
00884 *                  Solve locally for BW variables
00885 *
00886                CALL SLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BW,
00887      $                      IPIV( LN+1 ), 1 )
00888 *
00889                CALL STRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE,
00890      $                     AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ),
00891      $                     LDW )
00892 *
00893 *                  Use soln just calculated to update RHS
00894 *
00895                CALL SGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE,
00896      $                     AF( BBPTR+BW*LDBB+BW ), LDBB,
00897      $                     WORK( NB+BWU+1 ), LDW, ONE,
00898      $                     WORK( NB+BWU+1+BW ), LDW )
00899 *
00900 *                  Give answers back to partner processor
00901 *
00902                CALL SGESD2D( ICTXT, BM+BMN-BW, NRHS,
00903      $                       WORK( NB+BWU+1+BW ), LDW, 0, NEICOL )
00904 *
00905             ELSE
00906 *
00907 *                  Finish up calculations for final level
00908 *
00909                CALL SLASWP( NRHS, WORK( NB+BWU+1 ), LDW, 1, BM+BMN,
00910      $                      IPIV( LN+1 ), 1 )
00911 *
00912                CALL STRSM( 'L', 'L', 'N', 'U', BM+BMN, NRHS, ONE,
00913      $                     AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ),
00914      $                     LDW )
00915             END IF
00916 *
00917          END IF
00918 *
00919          NPACT = ( NPACT+1 ) / 2
00920          NPSTR = NPSTR*2
00921          GO TO 40
00922 *
00923       END IF
00924 *
00925    50 CONTINUE
00926 *
00927 *
00928 **************************************
00929 *     BACKSOLVE
00930 ********************************************************************
00931 *       PHASE 2: Global computation phase -- Solve U*Y = X
00932 ********************************************************************
00933 *
00934       IF( NPCOL.EQ.1 ) THEN
00935 *
00936 *        In this case the loop over the levels will not be
00937 *        performed.
00938 *        In fact, the backsolve portion was done in the call to
00939 *          SGETRS in the frontsolve.
00940 *
00941       END IF
00942 *
00943 *     Compute variable needed to reverse loop structure in
00944 *        reduced system.
00945 *
00946       RECOVERY_VAL = NPACT*NPSTR - NPCOL
00947 *
00948 *     Loop over levels
00949 *      Terminal values of NPACT and NPSTR from frontsolve are used
00950 *
00951    60 CONTINUE
00952       IF( NPACT.GE.NPCOL )
00953      $   GO TO 80
00954 *
00955       NPSTR = NPSTR / 2
00956 *
00957       NPACT = NPACT*2
00958 *
00959 *        Have to adjust npact for non-power-of-2
00960 *
00961       NPACT = NPACT - MOD( ( RECOVERY_VAL / NPSTR ), 2 )
00962 *
00963 *        Find size of submatrix in this proc at this level
00964 *
00965       IF( MYCOL / NPSTR.LT.NPACT-1 ) THEN
00966          BN = BW
00967       ELSE
00968          BN = MIN( BW, NUMROC( N, NB, NPCOL-1, 0, NPCOL ) )
00969       END IF
00970 *
00971 *        If this processor is even in this level...
00972 *
00973       IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN
00974 *
00975          NEICOL = MYCOL + NPSTR
00976 *
00977          IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN
00978 *
00979             IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN
00980                BMN = BW
00981                BNN = BW
00982             ELSE
00983                BMN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) ) + BWU
00984                BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) )
00985             END IF
00986 *
00987             IF( NPACT.GT.2 ) THEN
00988 *
00989                CALL SGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0,
00990      $                       NEICOL )
00991 *
00992                CALL SGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0,
00993      $                       NEICOL )
00994 *
00995             ELSE
00996 *
00997                CALL SGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ), LDW, 0,
00998      $                       NEICOL )
00999 *
01000             END IF
01001 *
01002          END IF
01003 *
01004       ELSE
01005 *           This processor is odd on this level
01006 *
01007          NEICOL = MYCOL - NPSTR
01008 *
01009          IF( NEICOL.EQ.0 ) THEN
01010             BMN = BW - BWU
01011          ELSE
01012             BMN = BW
01013          END IF
01014 *
01015          IF( NEICOL.LT.NPCOL-1 ) THEN
01016             BNN = BW
01017          ELSE
01018             BNN = MIN( BW, NUMROC( N, NB, NEICOL, 0, NPCOL ) )
01019          END IF
01020 *
01021          IF( NPACT.GT.2 ) THEN
01022 *
01023 *              Move RHS to make room for received solutions
01024 *
01025             CALL SLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1 ), LDW,
01026      $                   WORK( NB+BWU+BW+1 ), LDW )
01027 *
01028             CALL SGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ), LDW, 0,
01029      $                    NEICOL )
01030 *
01031             CALL SGEMM( 'N', 'N', BW, NRHS, BN, -ONE, AF( BBPTR ), LDBB,
01032      $                  WORK( LN+1 ), LDW, ONE, WORK( NB+BWU+BW+1 ),
01033      $                  LDW )
01034 *
01035 *
01036             IF( MYCOL.GT.NPSTR ) THEN
01037 *
01038                CALL SGEMM( 'N', 'N', BW, NRHS, BW, -ONE,
01039      $                     AF( BBPTR+2*BW*LDBB ), LDBB, WORK( LN+BW+1 ),
01040      $                     LDW, ONE, WORK( NB+BWU+BW+1 ), LDW )
01041 *
01042             END IF
01043 *
01044             CALL STRSM( 'L', 'U', 'N', 'N', BW, NRHS, ONE,
01045      $                  AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+BW+1 ),
01046      $                  LDW )
01047 *
01048 *              Send new solution to neighbor
01049 *
01050             CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+BW+1 ), LDW, 0,
01051      $                    NEICOL )
01052 *
01053 *              Copy new solution into expected place
01054 *
01055             CALL SLAMOV( 'G', BW, NRHS, WORK( NB+BWU+1+BW ), LDW,
01056      $                   WORK( LN+BW+1 ), LDW )
01057 *
01058          ELSE
01059 *
01060 *              Solve with local diagonal block
01061 *
01062             CALL STRSM( 'L', 'U', 'N', 'N', BN+BNN, NRHS, ONE,
01063      $                  AF( BBPTR+BW*LDBB ), LDBB, WORK( NB+BWU+1 ),
01064      $                  LDW )
01065 *
01066 *              Send new solution to neighbor
01067 *
01068             CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0,
01069      $                    NEICOL )
01070 *
01071 *              Shift solutions into expected positions
01072 *
01073             CALL SLAMOV( 'G', BNN+BN-BW, NRHS, WORK( NB+BWU+1+BW ), LDW,
01074      $                   WORK( LN+1 ), LDW )
01075 *
01076 *
01077             IF( ( NB+BWU+1 ).NE.( LN+1+BW ) ) THEN
01078 *
01079 *                 Copy one row at a time since spaces may overlap
01080 *
01081                DO 70 J = 1, BW
01082                   CALL SCOPY( NRHS, WORK( NB+BWU+J ), LDW,
01083      $                        WORK( LN+BW+J ), LDW )
01084    70          CONTINUE
01085 *
01086             END IF
01087 *
01088          END IF
01089 *
01090       END IF
01091 *
01092       GO TO 60
01093 *
01094    80 CONTINUE
01095 *     End of loop over levels
01096 *
01097 ********************************************************************
01098 *       PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
01099 ********************************************************************
01100 *
01101 *     Reset BM to value it had before reduced system frontsolve...
01102 *
01103       IF( MYCOL.NE.NPCOL-1 ) THEN
01104          BM = BW - LBWU
01105       ELSE
01106          BM = MIN( BW, ODD_SIZE ) + BWU
01107       END IF
01108 *
01109 *     First metastep is to account for the fillin blocks AF
01110 *
01111       IF( MYCOL.LT.NPCOL-1 ) THEN
01112 *
01113          CALL SGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ), LDW, 0,
01114      $                 MYCOL+1 )
01115 *
01116       END IF
01117 *
01118       IF( MYCOL.GT.0 ) THEN
01119 *
01120          CALL SGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ), LDW, 0,
01121      $                 MYCOL-1 )
01122 *
01123 *        Modify local right hand sides with received rhs's
01124 *
01125          CALL SGEMM( 'T', 'N', LM-BM, NRHS, BW, -ONE, AF( 1 ), BW,
01126      $               WORK( NB+BWU+1 ), LDW, ONE, WORK( 1 ), LDW )
01127 *
01128       END IF
01129 *
01130       DO 90 J = LN, 1, -1
01131 *
01132          LMJ = MIN( BW, ODD_SIZE-1 )
01133 *
01134          LPTR = BW - 1 + J*LLDA + APTR
01135 *
01136 *        In the following, the TRANS=T option is used to reverse
01137 *           the order of multiplication, not as a true transpose
01138 *
01139          CALL SGEMV( 'T', LMJ, NRHS, -ONE, WORK( J+1 ), LDW, A( LPTR ),
01140      $               LLDA-1, ONE, WORK( J ), LDW )
01141 *
01142 *        Divide by diagonal element
01143 *
01144          CALL SSCAL( NRHS, ONE / A( LPTR-LLDA+1 ), WORK( J ), LDW )
01145    90 CONTINUE
01146 *
01147 *
01148 *
01149       CALL SLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW, B( 1 ), LLDB )
01150 *
01151 *     Free BLACS space used to hold standard-form grid.
01152 *
01153       ICTXT = ICTXT_SAVE
01154       IF( ICTXT.NE.ICTXT_NEW ) THEN
01155          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01156       END IF
01157 *
01158   100 CONTINUE
01159 *
01160 *     Restore saved input parameters
01161 *
01162       NP = NP_SAVE
01163 *
01164 *     Output worksize
01165 *
01166       WORK( 1 ) = WORK_SIZE_MIN
01167 *
01168       RETURN
01169 *
01170 *     End of PSGBTRS
01171 *
01172       END