ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
psdbtrf.f
Go to the documentation of this file.
00001       SUBROUTINE PSDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
00002      $                    LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       INTEGER            BWL, BWU, INFO, JA, LAF, LWORK, N
00010 *     ..
00011 *     .. Array Arguments ..
00012       INTEGER            DESCA( * )
00013       REAL               A( * ), AF( * ), WORK( * )
00014 *     ..
00015 *
00016 *  Purpose
00017 *  =======
00018 *
00019 *  PSDBTRF computes a LU factorization
00020 *  of an N-by-N real banded
00021 *  diagonally dominant-like distributed matrix
00022 *  with bandwidth BWL, BWU: A(1:N, JA:JA+N-1).
00023 *  Reordering is used to increase parallelism in the factorization.
00024 *  This reordering results in factors that are DIFFERENT from those
00025 *  produced by equivalent sequential codes. These factors cannot
00026 *  be used directly by users; however, they can be used in
00027 *  subsequent calls to PSDBTRS to solve linear systems.
00028 *
00029 *  The factorization has the form
00030 *
00031 *          P A(1:N, JA:JA+N-1) P^T = L U
00032 *
00033 *  where U is a banded upper triangular matrix and L is banded
00034 *  lower triangular, and P is a permutation matrix.
00035 *
00036 *  =====================================================================
00037 *
00038 *  Arguments
00039 *  =========
00040 *
00041 *
00042 *  N       (global input) INTEGER
00043 *          The number of rows and columns to be operated on, i.e. the
00044 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00045 *
00046 *  BWL     (global input) INTEGER
00047 *          Number of subdiagonals. 0 <= BWL <= N-1
00048 *
00049 *  BWU     (global input) INTEGER
00050 *          Number of superdiagonals. 0 <= BWU <= N-1
00051 *
00052 *  A       (local input/local output) REAL pointer into
00053 *          local memory to an array with first dimension
00054 *          LLD_A >=(bwl+bwu+1) (stored in DESCA).
00055 *          On entry, this array contains the local pieces of the
00056 *          N-by-N unsymmetric banded distributed matrix
00057 *          A(1:N, JA:JA+N-1) to be factored.
00058 *          This local portion is stored in the packed banded format
00059 *            used in LAPACK. Please see the Notes below and the
00060 *            ScaLAPACK manual for more detail on the format of
00061 *            distributed matrices.
00062 *          On exit, this array contains information containing details
00063 *            of the factorization.
00064 *          Note that permutations are performed on the matrix, so that
00065 *            the factors returned are different from those returned
00066 *            by LAPACK.
00067 *
00068 *  JA      (global input) INTEGER
00069 *          The index in the global array A that points to the start of
00070 *          the matrix to be operated on (which may be either all of A
00071 *          or a submatrix of A).
00072 *
00073 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00074 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00075 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00076 *          The array descriptor for the distributed matrix A.
00077 *          Contains information of mapping of A to memory. Please
00078 *          see NOTES below for full description and options.
00079 *
00080 *  AF      (local output) REAL array, dimension LAF.
00081 *          Auxiliary Fillin Space.
00082 *          Fillin is created during the factorization routine
00083 *          PSDBTRF and this is stored in AF. If a linear system
00084 *          is to be solved using PSDBTRS after the factorization
00085 *          routine, AF *must not be altered* after the factorization.
00086 *
00087 *  LAF     (local input) INTEGER
00088 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00089 *          NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
00090 *          If LAF is not large enough, an error code will be returned
00091 *          and the minimum acceptable size will be returned in AF( 1 )
00092 *
00093 *  WORK    (local workspace/local output)
00094 *          REAL temporary workspace. This space may
00095 *          be overwritten in between calls to routines. WORK must be
00096 *          the size given in LWORK.
00097 *          On exit, WORK( 1 ) contains the minimal LWORK.
00098 *
00099 *  LWORK   (local input or global input) INTEGER
00100 *          Size of user-input workspace WORK.
00101 *          If LWORK is too small, the minimal acceptable size will be
00102 *          returned in WORK(1) and an error code is returned. LWORK>=
00103 *          max(bwl,bwu)*max(bwl,bwu)
00104 *
00105 *  INFO    (global output) INTEGER
00106 *          = 0:  successful exit
00107 *          < 0:  If the i-th argument is an array and the j-entry had
00108 *                an illegal value, then INFO = -(i*100+j), if the i-th
00109 *                argument is a scalar and had an illegal value, then
00110 *                INFO = -i.
00111 *          > 0:  If INFO = K<=NPROCS, the submatrix stored on processor
00112 *                INFO and factored locally was not
00113 *                diagonally dominant-like,  and
00114 *                the factorization was not completed.
00115 *                If INFO = K>NPROCS, the submatrix stored on processor
00116 *                INFO-NPROCS representing interactions with other
00117 *                processors was not
00118 *                stably factorable wo/interchanges,
00119 *                and the factorization was not completed.
00120 *
00121 *  =====================================================================
00122 *
00123 *  Restrictions
00124 *  ============
00125 *
00126 *  The following are restrictions on the input parameters. Some of these
00127 *    are temporary and will be removed in future releases, while others
00128 *    may reflect fundamental technical limitations.
00129 *
00130 *    Non-cyclic restriction: VERY IMPORTANT!
00131 *      P*NB>= mod(JA-1,NB)+N.
00132 *      The mapping for matrices must be blocked, reflecting the nature
00133 *      of the divide and conquer algorithm as a task-parallel algorithm.
00134 *      This formula in words is: no processor may have more than one
00135 *      chunk of the matrix.
00136 *
00137 *    Blocksize cannot be too small:
00138 *      If the matrix spans more than one processor, the following
00139 *      restriction on NB, the size of each block on each processor,
00140 *      must hold:
00141 *      NB >= 2*MAX(BWL,BWU)
00142 *      The bulk of parallel computation is done on the matrix of size
00143 *      O(NB) on each processor. If this is too small, divide and conquer
00144 *      is a poor choice of algorithm.
00145 *
00146 *    Submatrix reference:
00147 *      JA = IB
00148 *      Alignment restriction that prevents unnecessary communication.
00149 *
00150 *
00151 *  =====================================================================
00152 *
00153 *  Notes
00154 *  =====
00155 *
00156 *  If the factorization routine and the solve routine are to be called
00157 *    separately (to solve various sets of righthand sides using the same
00158 *    coefficient matrix), the auxiliary space AF *must not be altered*
00159 *    between calls to the factorization routine and the solve routine.
00160 *
00161 *  The best algorithm for solving banded and tridiagonal linear systems
00162 *    depends on a variety of parameters, especially the bandwidth.
00163 *    Currently, only algorithms designed for the case N/P >> bw are
00164 *    implemented. These go by many names, including Divide and Conquer,
00165 *    Partitioning, domain decomposition-type, etc.
00166 *
00167 *  Algorithm description: Divide and Conquer
00168 *
00169 *    The Divide and Conqer algorithm assumes the matrix is narrowly
00170 *      banded compared with the number of equations. In this situation,
00171 *      it is best to distribute the input matrix A one-dimensionally,
00172 *      with columns atomic and rows divided amongst the processes.
00173 *      The basic algorithm divides the banded matrix up into
00174 *      P pieces with one stored on each processor,
00175 *      and then proceeds in 2 phases for the factorization or 3 for the
00176 *      solution of a linear system.
00177 *      1) Local Phase:
00178 *         The individual pieces are factored independently and in
00179 *         parallel. These factors are applied to the matrix creating
00180 *         fillin, which is stored in a non-inspectable way in auxiliary
00181 *         space AF. Mathematically, this is equivalent to reordering
00182 *         the matrix A as P A P^T and then factoring the principal
00183 *         leading submatrix of size equal to the sum of the sizes of
00184 *         the matrices factored on each processor. The factors of
00185 *         these submatrices overwrite the corresponding parts of A
00186 *         in memory.
00187 *      2) Reduced System Phase:
00188 *         A small (max(bwl,bwu)* (P-1)) system is formed representing
00189 *         interaction of the larger blocks, and is stored (as are its
00190 *         factors) in the space AF. A parallel Block Cyclic Reduction
00191 *         algorithm is used. For a linear system, a parallel front solve
00192 *         followed by an analagous backsolve, both using the structure
00193 *         of the factored matrix, are performed.
00194 *      3) Backsubsitution Phase:
00195 *         For a linear system, a local backsubstitution is performed on
00196 *         each processor in parallel.
00197 *
00198 *  Descriptors
00199 *  ===========
00200 *
00201 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00202 *
00203 *  Note: banded codes can use either the old two dimensional
00204 *    or new one-dimensional descriptors, though the processor grid in
00205 *    both cases *must be one-dimensional*. We describe both types below.
00206 *
00207 *  Each global data object is described by an associated description
00208 *  vector.  This vector stores the information required to establish
00209 *  the mapping between an object element and its corresponding process
00210 *  and memory location.
00211 *
00212 *  Let A be a generic term for any 2D block cyclicly distributed array.
00213 *  Such a global array has an associated description vector DESCA.
00214 *  In the following comments, the character _ should be read as
00215 *  "of the global array".
00216 *
00217 *  NOTATION        STORED IN      EXPLANATION
00218 *  --------------- -------------- --------------------------------------
00219 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00220 *                                 DTYPE_A = 1.
00221 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00222 *                                 the BLACS process grid A is distribu-
00223 *                                 ted over. The context itself is glo-
00224 *                                 bal, but the handle (the integer
00225 *                                 value) may vary.
00226 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00227 *                                 array A.
00228 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00229 *                                 array A.
00230 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00231 *                                 the rows of the array.
00232 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00233 *                                 the columns of the array.
00234 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00235 *                                 row of the array A is distributed.
00236 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00237 *                                 first column of the array A is
00238 *                                 distributed.
00239 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00240 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00241 *
00242 *  Let K be the number of rows or columns of a distributed matrix,
00243 *  and assume that its process grid has dimension p x q.
00244 *  LOCr( K ) denotes the number of elements of K that a process
00245 *  would receive if K were distributed over the p processes of its
00246 *  process column.
00247 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00248 *  process would receive if K were distributed over the q processes of
00249 *  its process row.
00250 *  The values of LOCr() and LOCc() may be determined via a call to the
00251 *  ScaLAPACK tool function, NUMROC:
00252 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00253 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00254 *  An upper bound for these quantities may be computed by:
00255 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00256 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00257 *
00258 *
00259 *  One-dimensional descriptors:
00260 *
00261 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00262 *    version 1.0. They simplify and shorten the descriptor for 1D
00263 *    arrays.
00264 *
00265 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00266 *    object, we allow 1D arrays to be distributed either over the
00267 *    first dimension of the array (as if the grid were P-by-1) or the
00268 *    2nd dimension (as if the grid were 1-by-P). This choice is
00269 *    indicated by the descriptor type (501 or 502)
00270 *    as described below.
00271 *
00272 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00273 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00274 *    irrespective of which one-dimensional descriptor type
00275 *    (501 or 502) is input.
00276 *    This routine will interpret the grid properly either way.
00277 *    ScaLAPACK routines *do not support intercontext operations* so that
00278 *    the grid passed to a single ScaLAPACK routine *must be the same*
00279 *    for all array descriptors passed to that routine.
00280 *
00281 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00282 *    may also be used, since a one-dimensional array is a special case
00283 *    of a two-dimensional array with one dimension of size unity.
00284 *    The two-dimensional array used in this case *must* be of the
00285 *    proper orientation:
00286 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00287 *      (1 by P type), then the two dimensional descriptor must
00288 *      have a CTXT value that refers to a 1 by P BLACS grid;
00289 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00290 *      (P by 1 type), then the two dimensional descriptor must
00291 *      have a CTXT value that refers to a P by 1 BLACS grid.
00292 *
00293 *
00294 *  Summary of allowed descriptors, types, and BLACS grids:
00295 *  DTYPE           501         502         1         1
00296 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00297 *  -----------------------------------------------------
00298 *  A               OK          NO          OK        NO
00299 *  B               NO          OK          NO        OK
00300 *
00301 *  Let A be a generic term for any 1D block cyclicly distributed array.
00302 *  Such a global array has an associated description vector DESCA.
00303 *  In the following comments, the character _ should be read as
00304 *  "of the global array".
00305 *
00306 *  NOTATION        STORED IN  EXPLANATION
00307 *  --------------- ---------- ------------------------------------------
00308 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00309 *                                TYPE_A = 501: 1-by-P grid.
00310 *                                TYPE_A = 502: P-by-1 grid.
00311 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00312 *                                the BLACS process grid A is distribu-
00313 *                                ted over. The context itself is glo-
00314 *                                bal, but the handle (the integer
00315 *                                value) may vary.
00316 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00317 *                                distributed.
00318 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00319 *                                the distributed dimension of the array.
00320 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00321 *                                first row or column of the array
00322 *                                is distributed.
00323 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00324 *                                storing the local blocks of the distri-
00325 *                                buted array A. Minimum value of LLD_A
00326 *                                depends on TYPE_A.
00327 *                                TYPE_A = 501: LLD_A >=
00328 *                                   size of undistributed dimension, 1.
00329 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00330 *  Reserved        DESCA( 7 ) Reserved for future use.
00331 *
00332 *  =====================================================================
00333 *
00334 *  Code Developer: Andrew J. Cleary, University of Tennessee.
00335 *    Current address: Lawrence Livermore National Labs.
00336 *  Last modified by:  Peter Arbenz, Institute of Scientific Computing,
00337 *    ETH, Zurich.
00338 *
00339 *  =====================================================================
00340 *
00341 *     .. Parameters ..
00342       REAL               ONE
00343       PARAMETER          ( ONE = 1.0E+0 )
00344       REAL               ZERO
00345       PARAMETER          ( ZERO = 0.0E+0 )
00346       INTEGER            INT_ONE
00347       PARAMETER          ( INT_ONE = 1 )
00348       INTEGER            DESCMULT, BIGNUM
00349       PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
00350       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00351      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00352       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00353      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00354      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00355 *     ..
00356 *     .. Local Scalars ..
00357       INTEGER            COMM_PROC, CSRC, FIRST_PROC, I, I1, I2, ICTXT,
00358      $                   ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN,
00359      $                   LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW,
00360      $                   MY_NUM_COLS, NB, NEXT_TRI_SIZE_M,
00361      $                   NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE,
00362      $                   ODD_SIZE, OFST, PART_OFFSET, PART_SIZE,
00363      $                   PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE,
00364      $                   STORE_N_A, UP_PREV_TRI_SIZE_M,
00365      $                   UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U
00366 *     ..
00367 *     .. Local Arrays ..
00368       INTEGER            DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
00369 *     ..
00370 *     .. External Subroutines ..
00371       EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, SAXPY, SDBTRF,
00372      $                   DESC_CONVERT, SGEMM, SGEMV, SGERV2D, SGESD2D,
00373      $                   SLAMOV, SLATCPY, STBTRS, STRMM, STRRV2D,
00374      $                   STRSD2D, GLOBCHK, IGAMX2D, IGEBR2D, IGEBS2D,
00375      $                   PXERBLA, RESHAPE
00376 *     ..
00377 *     .. External Functions ..
00378       INTEGER            NUMROC
00379       EXTERNAL           NUMROC
00380 *     ..
00381 *     .. Intrinsic Functions ..
00382       INTRINSIC          MAX, MIN, MOD
00383 *     ..
00384 *     .. Executable Statements ..
00385 *
00386 *     Test the input parameters
00387 *
00388       INFO = 0
00389 *
00390 *     Convert descriptor into standard form for easy access to
00391 *        parameters, check that grid is of right shape.
00392 *
00393       DESCA_1XP( 1 ) = 501
00394 *
00395       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00396 *
00397       IF( RETURN_CODE.NE.0 ) THEN
00398          INFO = -( 6*100+2 )
00399       END IF
00400 *
00401 *     Get values out of descriptor for use in code.
00402 *
00403       ICTXT = DESCA_1XP( 2 )
00404       CSRC = DESCA_1XP( 5 )
00405       NB = DESCA_1XP( 4 )
00406       LLDA = DESCA_1XP( 6 )
00407       STORE_N_A = DESCA_1XP( 3 )
00408 *
00409 *     Get grid parameters
00410 *
00411 *
00412 *     Size of separator blocks is maximum of bandwidths
00413 *
00414       MAX_BW = MAX( BWL, BWU )
00415       MBW2 = MAX_BW*MAX_BW
00416 *
00417       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00418       NP = NPROW*NPCOL
00419 *
00420 *
00421 *
00422       IF( LWORK.LT.-1 ) THEN
00423          INFO = -10
00424       ELSE IF( LWORK.EQ.-1 ) THEN
00425          IDUM3 = -1
00426       ELSE
00427          IDUM3 = 1
00428       END IF
00429 *
00430       IF( N.LT.0 ) THEN
00431          INFO = -1
00432       END IF
00433 *
00434       IF( N+JA-1.GT.STORE_N_A ) THEN
00435          INFO = -( 6*100+6 )
00436       END IF
00437 *
00438       IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN
00439          INFO = -2
00440       END IF
00441 *
00442       IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN
00443          INFO = -3
00444       END IF
00445 *
00446       IF( LLDA.LT.( BWL+BWU+1 ) ) THEN
00447          INFO = -( 6*100+6 )
00448       END IF
00449 *
00450       IF( NB.LE.0 ) THEN
00451          INFO = -( 6*100+4 )
00452       END IF
00453 *
00454 *     Argument checking that is specific to Divide & Conquer routine
00455 *
00456       IF( NPROW.NE.1 ) THEN
00457          INFO = -( 6*100+2 )
00458       END IF
00459 *
00460       IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
00461          INFO = -( 1 )
00462          CALL PXERBLA( ICTXT, 'PSDBTRF, D&C alg.: only 1 block per proc'
00463      $                 , -INFO )
00464          RETURN
00465       END IF
00466 *
00467       IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.2*MAX( BWL, BWU ) ) ) THEN
00468          INFO = -( 6*100+4 )
00469          CALL PXERBLA( ICTXT, 'PSDBTRF, D&C alg.: NB too small', -INFO )
00470          RETURN
00471       END IF
00472 *
00473 *
00474 *     Check auxiliary storage size
00475 *
00476       LAF_MIN = NB*( BWL+BWU ) + 6*MAX( BWL, BWU )*MAX( BWL, BWU )
00477 *
00478       IF( LAF.LT.LAF_MIN ) THEN
00479          INFO = -8
00480 *        put minimum value of laf into AF( 1 )
00481          AF( 1 ) = LAF_MIN
00482          CALL PXERBLA( ICTXT, 'PSDBTRF: auxiliary storage error ',
00483      $                 -INFO )
00484          RETURN
00485       END IF
00486 *
00487 *     Check worksize
00488 *
00489       WORK_SIZE_MIN = MAX( BWL, BWU )*MAX( BWL, BWU )
00490 *
00491       WORK( 1 ) = WORK_SIZE_MIN
00492 *
00493       IF( LWORK.LT.WORK_SIZE_MIN ) THEN
00494          IF( LWORK.NE.-1 ) THEN
00495             INFO = -10
00496             CALL PXERBLA( ICTXT, 'PSDBTRF: worksize error ', -INFO )
00497          END IF
00498          RETURN
00499       END IF
00500 *
00501 *     Pack params and positions into arrays for global consistency check
00502 *
00503       PARAM_CHECK( 9, 1 ) = DESCA( 5 )
00504       PARAM_CHECK( 8, 1 ) = DESCA( 4 )
00505       PARAM_CHECK( 7, 1 ) = DESCA( 3 )
00506       PARAM_CHECK( 6, 1 ) = DESCA( 1 )
00507       PARAM_CHECK( 5, 1 ) = JA
00508       PARAM_CHECK( 4, 1 ) = BWU
00509       PARAM_CHECK( 3, 1 ) = BWL
00510       PARAM_CHECK( 2, 1 ) = N
00511       PARAM_CHECK( 1, 1 ) = IDUM3
00512 *
00513       PARAM_CHECK( 9, 2 ) = 605
00514       PARAM_CHECK( 8, 2 ) = 604
00515       PARAM_CHECK( 7, 2 ) = 603
00516       PARAM_CHECK( 6, 2 ) = 601
00517       PARAM_CHECK( 5, 2 ) = 5
00518       PARAM_CHECK( 4, 2 ) = 3
00519       PARAM_CHECK( 3, 2 ) = 2
00520       PARAM_CHECK( 2, 2 ) = 1
00521       PARAM_CHECK( 1, 2 ) = 10
00522 *
00523 *     Want to find errors with MIN( ), so if no error, set it to a big
00524 *     number. If there already is an error, multiply by the the
00525 *     descriptor multiplier.
00526 *
00527       IF( INFO.GE.0 ) THEN
00528          INFO = BIGNUM
00529       ELSE IF( INFO.LT.-DESCMULT ) THEN
00530          INFO = -INFO
00531       ELSE
00532          INFO = -INFO*DESCMULT
00533       END IF
00534 *
00535 *     Check consistency across processors
00536 *
00537       CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ),
00538      $              INFO )
00539 *
00540 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00541 *     if error is not in a descriptor entry.
00542 *
00543       IF( INFO.EQ.BIGNUM ) THEN
00544          INFO = 0
00545       ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
00546          INFO = -INFO / DESCMULT
00547       ELSE
00548          INFO = -INFO
00549       END IF
00550 *
00551       IF( INFO.LT.0 ) THEN
00552          CALL PXERBLA( ICTXT, 'PSDBTRF', -INFO )
00553          RETURN
00554       END IF
00555 *
00556 *     Quick return if possible
00557 *
00558       IF( N.EQ.0 )
00559      $   RETURN
00560 *
00561 *
00562 *     Adjust addressing into matrix space to properly get into
00563 *        the beginning part of the relevant data
00564 *
00565       PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
00566 *
00567       IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
00568          PART_OFFSET = PART_OFFSET + NB
00569       END IF
00570 *
00571       IF( MYCOL.LT.CSRC ) THEN
00572          PART_OFFSET = PART_OFFSET - NB
00573       END IF
00574 *
00575 *     Form a new BLACS grid (the "standard form" grid) with only procs
00576 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00577 *        starting at csrc=0, with JA modified to reflect dropped procs.
00578 *
00579 *     First processor to hold part of the matrix:
00580 *
00581       FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
00582 *
00583 *     Calculate new JA one while dropping off unused processors.
00584 *
00585       JA_NEW = MOD( JA-1, NB ) + 1
00586 *
00587 *     Save and compute new value of NP
00588 *
00589       NP_SAVE = NP
00590       NP = ( JA_NEW+N-2 ) / NB + 1
00591 *
00592 *     Call utility routine that forms "standard-form" grid
00593 *
00594       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
00595      $              INT_ONE, NP )
00596 *
00597 *     Use new context from standard grid as context.
00598 *
00599       ICTXT_SAVE = ICTXT
00600       ICTXT = ICTXT_NEW
00601       DESCA_1XP( 2 ) = ICTXT_NEW
00602 *
00603 *     Get information about new grid.
00604 *
00605       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00606 *
00607 *     Drop out processors that do not have part of the matrix.
00608 *
00609       IF( MYROW.LT.0 ) THEN
00610          GO TO 140
00611       END IF
00612 *
00613 *     ********************************
00614 *     Values reused throughout routine
00615 *
00616 *     User-input value of partition size
00617 *
00618       PART_SIZE = NB
00619 *
00620 *     Number of columns in each processor
00621 *
00622       MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
00623 *
00624 *     Offset in columns to beginning of main partition in each proc
00625 *
00626       IF( MYCOL.EQ.0 ) THEN
00627          PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE )
00628          MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE )
00629       END IF
00630 *
00631 *     Offset in elements
00632 *
00633       OFST = PART_OFFSET*LLDA
00634 *
00635 *     Size of main (or odd) partition in each processor
00636 *
00637       ODD_SIZE = MY_NUM_COLS
00638       IF( MYCOL.LT.NP-1 ) THEN
00639          ODD_SIZE = ODD_SIZE - MAX_BW
00640       END IF
00641 *
00642 *     Offset to workspace for Upper triangular factor
00643 *
00644       WORK_U = BWU*ODD_SIZE + 3*MBW2
00645 *
00646 *
00647 *       Zero out space for fillin
00648 *
00649       DO 10 I = 1, LAF_MIN
00650          AF( I ) = ZERO
00651    10 CONTINUE
00652 *
00653 *       Zero out space for work
00654 *
00655       DO 20 I = 1, WORK_SIZE_MIN
00656          WORK( I ) = ZERO
00657    20 CONTINUE
00658 *
00659 *     Begin main code
00660 *
00661 *
00662 ********************************************************************
00663 *       PHASE 1: Local computation phase.
00664 ********************************************************************
00665 *
00666 *
00667 *       Sizes of the extra triangles communicated bewtween processors
00668 *
00669       IF( MYCOL.GT.0 ) THEN
00670          PREV_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0,
00671      $                     NPCOL ) )
00672          PREV_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL-1, 0,
00673      $                     NPCOL ) )
00674       END IF
00675 *
00676       IF( MYCOL.GT.0 ) THEN
00677          UP_PREV_TRI_SIZE_M = MIN( BWU,
00678      $                        NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) )
00679          UP_PREV_TRI_SIZE_N = MIN( BWU,
00680      $                        NUMROC( N, PART_SIZE, MYCOL-1, 0,
00681      $                        NPCOL ) )
00682       END IF
00683 *
00684       IF( MYCOL.LT.NPCOL-1 ) THEN
00685          NEXT_TRI_SIZE_M = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL+1, 0,
00686      $                     NPCOL ) )
00687          NEXT_TRI_SIZE_N = MIN( BWL, NUMROC( N, PART_SIZE, MYCOL, 0,
00688      $                     NPCOL ) )
00689       END IF
00690 *
00691       IF( MYCOL.LT.NP-1 ) THEN
00692 *         Transfer last triangle D_i of local matrix to next processor
00693 *         which needs it to calculate fillin due to factorization of
00694 *         its main (odd) block A_i.
00695 *         Overlap the send with the factorization of A_i.
00696 *
00697          CALL STRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M,
00698      $                 NEXT_TRI_SIZE_N, A( OFST+( MY_NUM_COLS-BWL )*
00699      $                 LLDA+( BWL+BWU+1 ) ), LLDA-1, 0, MYCOL+1 )
00700 *
00701       END IF
00702 *
00703 *
00704 *       Factor main partition A_i = L_i {U_i} in each processor
00705 *
00706       CALL SDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST+1 ), LLDA,
00707      $             INFO )
00708 *
00709       IF( INFO.NE.0 ) THEN
00710          INFO = MYCOL + 1
00711          GO TO 30
00712       END IF
00713 *
00714 *
00715       IF( MYCOL.LT.NP-1 ) THEN
00716 *
00717 *         Apply factorization to lower connection block BL_i
00718 *         transpose the connection block in preparation.
00719 *         Apply factorization to upper connection block BU_i
00720 *         Move the connection block in preparation.
00721 *
00722          CALL SLATCPY( 'U', BWL, BWL, A( ( OFST+( BWL+BWU+1 )+
00723      $                 ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1,
00724      $                 AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW )
00725          CALL SLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ),
00726      $                LLDA-1, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-
00727      $                BWU ), MAX_BW )
00728 *
00729 *         Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
00730 *
00731          CALL STBTRS( 'L', 'N', 'U', BWU, BWL, BWU,
00732      $                A( OFST+BWU+1+( ODD_SIZE-BWU )*LLDA ), LLDA,
00733      $                AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ),
00734      $                MAX_BW, INFO )
00735 *
00736 *         Perform the triangular solve {U_i}^T{BL'}_i^T = {BL_i}^T
00737 *
00738          CALL STBTRS( 'U', 'T', 'N', BWL, BWU, BWL,
00739      $                A( OFST+1+( ODD_SIZE-BWL )*LLDA ), LLDA,
00740      $                AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW,
00741      $                INFO )
00742 *
00743 *         transpose resulting block to its location
00744 *           in main storage.
00745 *
00746          CALL SLATCPY( 'L', BWL, BWL, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-
00747      $                 BWL ), MAX_BW, A( ( OFST+( BWL+BWU+1 )+
00748      $                 ( ODD_SIZE-BWL )*LLDA ) ), LLDA-1 )
00749 *
00750 *         Move the resulting block back to its location in main storage.
00751 *
00752          CALL SLAMOV( 'L', BWU, BWU, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+
00753      $                MAX_BW-BWU ), MAX_BW, A( ( OFST+1+ODD_SIZE*
00754      $                LLDA ) ), LLDA-1 )
00755 *
00756 *
00757 *         Compute contribution to diagonal block(s) of reduced system.
00758 *          {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
00759 *
00760 *         The following method uses more flops than necessary but
00761 *           does not necessitate the writing of a new BLAS routine.
00762 *
00763 *
00764          CALL SGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE,
00765      $               AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW,
00766      $               AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, ONE,
00767      $               A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 )
00768 *
00769       END IF
00770 *       End of "if ( MYCOL .lt. NP-1 )..." loop
00771 *
00772 *
00773    30 CONTINUE
00774 *       If the processor could not locally factor, it jumps here.
00775 *
00776       IF( MYCOL.NE.0 ) THEN
00777 *         Discard temporary matrix stored beginning in
00778 *           AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
00779 *           off_diagonal block of reduced system.
00780 *
00781 *         Receive previously transmitted matrix section, which forms
00782 *         the right-hand-side for the triangular solve that calculates
00783 *         the "spike" fillin.
00784 *
00785 *
00786          CALL STRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M,
00787      $                 PREV_TRI_SIZE_N, AF( WORK_U+1 ), BWL, 0,
00788      $                 MYCOL-1 )
00789 *
00790          IF( INFO.EQ.0 ) THEN
00791 *
00792 *         Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
00793 *
00794 *         Transpose transmitted triangular matrix  $DL_i$
00795 *
00796             DO 50 I1 = 1, BWL
00797                DO 40 I2 = I1 + 1, BWL
00798                   AF( WORK_U+I2+( I1-1 )*BWL ) = AF( WORK_U+I1+( I2-1 )*
00799      $               BWL )
00800                   AF( WORK_U+I1+( I2-1 )*BWL ) = ZERO
00801    40          CONTINUE
00802    50       CONTINUE
00803 *
00804             DO 60 I1 = 2, ODD_SIZE
00805                I2 = MIN( I1-1, BWL )
00806                CALL SGEMV( 'N', BWL, I2, -ONE,
00807      $                     AF( WORK_U+1+( I1-1-I2 )*BWL ), BWL,
00808      $                     A( OFST+BWU+1+I2+( I1-1-I2 )*LLDA ), LLDA-1,
00809      $                     ONE, AF( WORK_U+1+( I1-1 )*BWL ), 1 )
00810    60       CONTINUE
00811 *
00812 *
00813 *         Calculate the "spike" fillin, ${U_i}^T {{GL}_i}^T = {DU_i}^T$
00814 *
00815 *
00816 *         Copy D block into AF storage for solve.
00817 *
00818             CALL SLAMOV( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M,
00819      $                   A( OFST+1 ), LLDA-1, AF( 1 ), BWU )
00820 *
00821             DO 80 I1 = 1, ODD_SIZE
00822                I2 = MIN( BWU, I1-1 )
00823                CALL SGEMV( 'N', BWU, I2, -ONE, AF( ( I1-1-I2 )*BWU+1 ),
00824      $                     BWU, A( OFST+BWU+1-I2+( I1-1 )*LLDA ), 1,
00825      $                     ONE, AF( ( I1-1 )*BWU+1 ), 1 )
00826 *
00827                DO 70 I = 1, BWU
00828                   AF( ( I1-1 )*BWU+I ) = AF( ( I1-1 )*BWU+I ) /
00829      $                                   A( ( I1-1 )*LLDA+BWU+1 )
00830    70          CONTINUE
00831    80       CONTINUE
00832 *
00833 *         Calculate the update block for previous proc, E_i = GL_i{GU_i}
00834 *
00835 *
00836 *         Zero out space in case result is smaller than storage block
00837 *
00838             DO 90 I = 1, MBW2
00839                AF( ODD_SIZE*BWU+2*MBW2+I ) = ZERO
00840    90       CONTINUE
00841 *
00842             CALL SGEMM( 'N', 'T', BWU, BWL, ODD_SIZE, -ONE, AF( 1 ),
00843      $                  BWU, AF( WORK_U+1 ), BWL, ZERO,
00844      $                  AF( 1+MAX( 0, BWL-BWU )+ODD_SIZE*BWU+( 2*MAX_BW+
00845      $                  MAX( 0, BWU-BWL ) )*MAX_BW ), MAX_BW )
00846 *
00847 *
00848 *         Initiate send of E_i to previous processor to overlap
00849 *           with next computation.
00850 *
00851             CALL SGESD2D( ICTXT, MAX_BW, MAX_BW,
00852      $                    AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0,
00853      $                    MYCOL-1 )
00854 *
00855 *
00856             IF( MYCOL.LT.NP-1 ) THEN
00857 *
00858 *           Calculate off-diagonal block(s) of reduced system.
00859 *           Note: for ease of use in solution of reduced system, store
00860 *           L's off-diagonal block in transpose form.
00861 *
00862 *           Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
00863 *             as per requirements of BLAS routine DTRMM.
00864 *             Since we have GU_i stored,
00865 *             transpose HU_i to HU_i^T.
00866 *
00867                CALL SLAMOV( 'N', BWL, BWL,
00868      $                      AF( WORK_U+( ODD_SIZE-BWL )*BWL+1 ), BWL,
00869      $                      AF( ( ODD_SIZE )*BWU+1+( MAX_BW-BWL ) ),
00870      $                      MAX_BW )
00871 *
00872                CALL STRMM( 'R', 'U', 'T', 'N', BWL, BWL, -ONE,
00873      $                     A( ( OFST+( BWL+BWU+1 )+( ODD_SIZE-BWL )*
00874      $                     LLDA ) ), LLDA-1, AF( ( ODD_SIZE )*BWU+1+
00875      $                     ( MAX_BW-BWL ) ), MAX_BW )
00876 *
00877 *
00878 *           Copy matrix HL_i (the last bwu rows of GL_i^T) to AFU store
00879 *             as per requirements of BLAS routine DTRMM.
00880 *             Since we have GL_i^T stored,
00881 *             transpose HL_i^T to HL_i.
00882 *
00883                CALL SLAMOV( 'N', BWU, BWU, AF( ( ODD_SIZE-BWU )*BWU+1 ),
00884      $                      BWU, AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW-
00885      $                      BWU ), MAX_BW )
00886 *
00887                CALL STRMM( 'R', 'L', 'N', 'N', BWU, BWU, -ONE,
00888      $                     A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1,
00889      $                     AF( WORK_U+( ODD_SIZE )*BWL+1+MAX_BW-BWU ),
00890      $                     MAX_BW )
00891 *
00892             END IF
00893 *
00894          END IF
00895 *       End of "if ( MYCOL .ne. 0 )..."
00896 *
00897       END IF
00898 *       End of "if (info.eq.0) then"
00899 *
00900 *
00901 *       Check to make sure no processors have found errors
00902 *
00903       CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
00904      $              0 )
00905 *
00906       IF( MYCOL.EQ.0 ) THEN
00907          CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
00908       ELSE
00909          CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
00910       END IF
00911 *
00912       IF( INFO.NE.0 ) THEN
00913          GO TO 130
00914       END IF
00915 *       No errors found, continue
00916 *
00917 *
00918 ********************************************************************
00919 *       PHASE 2: Formation and factorization of Reduced System.
00920 ********************************************************************
00921 *
00922 *       Gather up local sections of reduced system
00923 *
00924 *
00925 *     The last processor does not participate in the factorization of
00926 *       the reduced system, having sent its E_i already.
00927       IF( MYCOL.EQ.NPCOL-1 ) THEN
00928          GO TO 120
00929       END IF
00930 *
00931 *       Initiate send of off-diag block(s) to overlap with next part.
00932 *       Off-diagonal block needed on neighboring processor to start
00933 *       algorithm.
00934 *
00935       IF( ( MOD( MYCOL+1, 2 ).EQ.0 ) .AND. ( MYCOL.GT.0 ) ) THEN
00936 *
00937          CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ),
00938      $                 MAX_BW, 0, MYCOL-1 )
00939 *
00940          CALL SGESD2D( ICTXT, MAX_BW, MAX_BW,
00941      $                 AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0, MYCOL-1 )
00942 *
00943       END IF
00944 *
00945 *       Copy last diagonal block into AF storage for subsequent
00946 *         operations.
00947 *
00948       CALL SLAMOV( 'N', MAX_BW, MAX_BW, A( OFST+ODD_SIZE*LLDA+BWU+1 ),
00949      $             LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW )
00950 *
00951 *       Receive cont. to diagonal block that is stored on this proc.
00952 *
00953       IF( MYCOL.LT.NPCOL-1 ) THEN
00954 *
00955          CALL SGERV2D( ICTXT, MAX_BW, MAX_BW,
00956      $                 AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0, MYCOL+1 )
00957 *
00958 *          Add contribution to diagonal block
00959 *
00960          CALL SAXPY( MBW2, ONE, AF( ODD_SIZE*BWU+2*MBW2+1 ), 1,
00961      $               AF( ODD_SIZE*BWU+MBW2+1 ), 1 )
00962 *
00963       END IF
00964 *
00965 *
00966 *       *************************************
00967 *       Modification Loop
00968 *
00969 *       The distance for sending and receiving for each level starts
00970 *         at 1 for the first level.
00971       LEVEL_DIST = 1
00972 *
00973 *       Do until this proc is needed to modify other procs' equations
00974 *
00975   100 CONTINUE
00976       IF( MOD( ( MYCOL+1 ) / LEVEL_DIST, 2 ).NE.0 )
00977      $   GO TO 110
00978 *
00979 *         Receive and add contribution to diagonal block from the left
00980 *
00981       IF( MYCOL-LEVEL_DIST.GE.0 ) THEN
00982          CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
00983      $                 MYCOL-LEVEL_DIST )
00984 *
00985          CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ),
00986      $               1 )
00987 *
00988       END IF
00989 *
00990 *         Receive and add contribution to diagonal block from the right
00991 *
00992       IF( MYCOL+LEVEL_DIST.LT.NPCOL-1 ) THEN
00993          CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
00994      $                 MYCOL+LEVEL_DIST )
00995 *
00996          CALL SAXPY( MBW2, ONE, WORK( 1 ), 1, AF( ODD_SIZE*BWU+MBW2+1 ),
00997      $               1 )
00998 *
00999       END IF
01000 *
01001       LEVEL_DIST = LEVEL_DIST*2
01002 *
01003       GO TO 100
01004   110 CONTINUE
01005 *       [End of GOTO Loop]
01006 *
01007 *
01008 *       *********************************
01009 *       Calculate and use this proc's blocks to modify other procs'...
01010 *
01011 *       Factor diagonal block
01012 *
01013       CALL SDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ),
01014      $             MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1-
01015      $             ( MIN( MAX_BW-1, BWU ) ) ), MAX_BW+1, INFO )
01016 *
01017       IF( INFO.NE.0 ) THEN
01018          INFO = NPCOL + MYCOL
01019       END IF
01020 *
01021 *       ****************************************************************
01022 *       Receive offdiagonal block from processor to right.
01023 *         If this is the first group of processors, the receive comes
01024 *         from a different processor than otherwise.
01025 *
01026       IF( LEVEL_DIST.EQ.1 ) THEN
01027          COMM_PROC = MYCOL + 1
01028 *
01029 *           Move block into place that it will be expected to be for
01030 *             calcs.
01031 *
01032          CALL SLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ), MAX_BW,
01033      $                AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW )
01034 *
01035          CALL SLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ),
01036      $                MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW )
01037 *
01038       ELSE
01039          COMM_PROC = MYCOL + LEVEL_DIST / 2
01040       END IF
01041 *
01042       IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
01043 *
01044          CALL SGERV2D( ICTXT, MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ),
01045      $                 MAX_BW, 0, COMM_PROC )
01046 *
01047          CALL SGERV2D( ICTXT, MAX_BW, MAX_BW,
01048      $                 AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, 0,
01049      $                 COMM_PROC )
01050 *
01051          IF( INFO.EQ.0 ) THEN
01052 *
01053 *
01054 *         Modify upper off_diagonal block with diagonal block
01055 *
01056 *
01057             CALL STBTRS( 'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU,
01058      $                   AF( ODD_SIZE*BWU+MBW2+1+( MAX_BW+1 )*( MAX_BW-
01059      $                   BWU ) ), MAX_BW+1, AF( WORK_U+ODD_SIZE*BWL+1+
01060      $                   MAX_BW-BWU ), MAX_BW, INFO )
01061 *
01062 *         Modify lower off_diagonal block with diagonal block
01063 *
01064 *
01065             CALL STBTRS( 'U', 'T', 'N', BWL, MIN( BWU, BWL-1 ), BWL,
01066      $                   AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU,
01067      $                   BWL-1 )+( MAX_BW+1 )*( MAX_BW-BWL ) ),
01068      $                   MAX_BW+1, AF( ODD_SIZE*BWU+1+MAX_BW-BWL ),
01069      $                   MAX_BW, INFO )
01070 *
01071          END IF
01072 *         End of "if ( info.eq.0 ) then"
01073 *
01074 *         Calculate contribution from this block to next diagonal block
01075 *
01076          CALL SGEMM( 'T', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE,
01077      $               AF( ( ODD_SIZE )*BWU+1 ), MAX_BW,
01078      $               AF( WORK_U+( ODD_SIZE )*BWL+1 ), MAX_BW, ZERO,
01079      $               WORK( 1 ), MAX_BW )
01080 *
01081 *         Send contribution to diagonal block's owning processor.
01082 *
01083          CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
01084      $                 MYCOL+LEVEL_DIST )
01085 *
01086       END IF
01087 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
01088 *
01089 *
01090 *       ****************************************************************
01091 *       Receive off_diagonal block from left and use to finish with this
01092 *         processor.
01093 *
01094       IF( ( MYCOL / LEVEL_DIST.GT.0 ) .AND.
01095      $    ( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-1 ) ) THEN
01096 *
01097          IF( LEVEL_DIST.GT.1 ) THEN
01098 *
01099 *           Receive offdiagonal block(s) from proc level_dist/2 to the
01100 *           left
01101 *
01102             CALL SGERV2D( ICTXT, MAX_BW, MAX_BW,
01103      $                    AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW, 0,
01104      $                    MYCOL-LEVEL_DIST / 2 )
01105 *
01106 *           Receive offdiagonal block(s) from proc level_dist/2 to the
01107 *           left
01108 *
01109             CALL SGERV2D( ICTXT, MAX_BW, MAX_BW,
01110      $                    AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0,
01111      $                    MYCOL-LEVEL_DIST / 2 )
01112 *
01113          END IF
01114 *
01115 *
01116          IF( INFO.EQ.0 ) THEN
01117 *
01118 *         Use diagonal block(s) to modify this offdiagonal block
01119 *
01120 *
01121 *         Since STBTRS has no "left-right" option, we must transpose
01122 *
01123             CALL SLATCPY( 'N', MAX_BW, MAX_BW,
01124      $                    AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW,
01125      $                    WORK( 1 ), MAX_BW )
01126 *
01127             CALL STBTRS( 'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ),
01128      $                   BWL, AF( ODD_SIZE*BWU+MBW2+1 ), MAX_BW+1,
01129      $                   WORK( 1+MAX_BW*( MAX_BW-BWL ) ), MAX_BW, INFO )
01130 *
01131 *         Transpose back
01132 *
01133             CALL SLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01134      $                    AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW )
01135 *
01136 *
01137 *
01138 *         Since STBTRS has no "left-right" option, we must transpose
01139 *
01140             CALL SLATCPY( 'N', MAX_BW, MAX_BW,
01141      $                    AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW,
01142      $                    WORK( 1 ), MAX_BW )
01143 *
01144             CALL STBTRS( 'U', 'T', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ),
01145      $                   BWU, AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU,
01146      $                   MAX_BW-1 ) ), MAX_BW+1,
01147      $                   WORK( 1+MAX_BW*( MAX_BW-BWU ) ), MAX_BW, INFO )
01148 *
01149 *         Transpose back
01150 *
01151             CALL SLATCPY( 'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01152      $                    AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW )
01153 *
01154 *
01155          END IF
01156 *         End of "if( info.eq.0 ) then"
01157 *
01158 *         Use offdiag block(s) to calculate modification to diag block
01159 *           of processor to the left
01160 *
01161          CALL SGEMM( 'N', 'T', MAX_BW, MAX_BW, MAX_BW, -ONE,
01162      $               AF( ( ODD_SIZE )*BWU+2*MBW2+1 ), MAX_BW,
01163      $               AF( WORK_U+( ODD_SIZE )*BWL+2*MBW2+1 ), MAX_BW,
01164      $               ZERO, WORK( 1 ), MAX_BW )
01165 *
01166 *         Send contribution to diagonal block's owning processor.
01167 *
01168          CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
01169      $                 MYCOL-LEVEL_DIST )
01170 *
01171 *         *******************************************************
01172 *
01173          IF( MYCOL / LEVEL_DIST.LE.( NPCOL-1 ) / LEVEL_DIST-2 ) THEN
01174 *
01175 *           Decide which processor offdiagonal block(s) goes to
01176 *
01177             IF( ( MOD( MYCOL / ( 2*LEVEL_DIST ), 2 ) ).EQ.0 ) THEN
01178                COMM_PROC = MYCOL + LEVEL_DIST
01179             ELSE
01180                COMM_PROC = MYCOL - LEVEL_DIST
01181             END IF
01182 *
01183 *           Use offdiagonal blocks to calculate offdiag
01184 *             block to send to neighboring processor. Depending
01185 *             on circumstances, may need to transpose the matrix.
01186 *
01187             CALL SGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE,
01188      $                  AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW,
01189      $                  AF( ODD_SIZE*BWU+1 ), MAX_BW, ZERO, WORK( 1 ),
01190      $                  MAX_BW )
01191 *
01192 *           Send contribution to offdiagonal block's owning processor.
01193 *
01194             CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
01195      $                    COMM_PROC )
01196 *
01197             CALL SGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -ONE,
01198      $                  AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW,
01199      $                  AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, ZERO,
01200      $                  WORK( 1 ), MAX_BW )
01201 *
01202 *           Send contribution to offdiagonal block's owning processor.
01203 *
01204             CALL SGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW, 0,
01205      $                    COMM_PROC )
01206 *
01207          END IF
01208 *
01209       END IF
01210 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
01211 *
01212   120 CONTINUE
01213 *
01214 *
01215   130 CONTINUE
01216 *
01217 *
01218 *     Free BLACS space used to hold standard-form grid.
01219 *
01220       IF( ICTXT_SAVE.NE.ICTXT_NEW ) THEN
01221          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01222       END IF
01223 *
01224   140 CONTINUE
01225 *
01226 *     Restore saved input parameters
01227 *
01228       ICTXT = ICTXT_SAVE
01229       NP = NP_SAVE
01230 *
01231 *     Output minimum worksize
01232 *
01233       WORK( 1 ) = WORK_SIZE_MIN
01234 *
01235 *         Make INFO consistent across processors
01236 *
01237       CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
01238      $              0 )
01239 *
01240       IF( MYCOL.EQ.0 ) THEN
01241          CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
01242       ELSE
01243          CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
01244       END IF
01245 *
01246 *
01247       RETURN
01248 *
01249 *     End of PSDBTRF
01250 *
01251       END