ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pdgbtrf.f
Go to the documentation of this file.
00001       SUBROUTINE PDGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF,
00002      $                    WORK, LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       INTEGER            BWL, BWU, INFO, JA, LAF, LWORK, N
00010 *     ..
00011 *     .. Array Arguments ..
00012       INTEGER            DESCA( * ), IPIV( * )
00013       DOUBLE PRECISION   A( * ), AF( * ), WORK( * )
00014 *     ..
00015 *
00016 *  Purpose
00017 *  =======
00018 *
00019 *  PDGBTRF computes a LU factorization
00020 *  of an N-by-N real banded
00021 *  distributed matrix
00022 *  with bandwidth BWL, BWU: A(1:N, JA:JA+N-1).
00023 *  Reordering is used to increase parallelism in the factorization.
00024 *  This reordering results in factors that are DIFFERENT from those
00025 *  produced by equivalent sequential codes. These factors cannot
00026 *  be used directly by users; however, they can be used in
00027 *  subsequent calls to PDGBTRS to solve linear systems.
00028 *
00029 *  The factorization has the form
00030 *
00031 *          P A(1:N, JA:JA+N-1) Q = L U
00032 *
00033 *  where U is a banded upper triangular matrix and L is banded
00034 *  lower triangular, and P and Q are permutation matrices.
00035 *  The matrix Q represents reordering of columns
00036 *  for parallelism's sake, while P represents
00037 *  reordering of rows for numerical stability using
00038 *  classic partial pivoting.
00039 *
00040 *  =====================================================================
00041 *
00042 *  Arguments
00043 *  =========
00044 *
00045 *
00046 *  N       (global input) INTEGER
00047 *          The number of rows and columns to be operated on, i.e. the
00048 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00049 *
00050 *  BWL     (global input) INTEGER
00051 *          Number of subdiagonals. 0 <= BWL <= N-1
00052 *
00053 *  BWU     (global input) INTEGER
00054 *          Number of superdiagonals. 0 <= BWU <= N-1
00055 *
00056 *  A       (local input/local output) DOUBLE PRECISION pointer into
00057 *          local memory to an array with first dimension
00058 *          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
00059 *          On entry, this array contains the local pieces of the
00060 *          N-by-N unsymmetric banded distributed matrix
00061 *          A(1:N, JA:JA+N-1) to be factored.
00062 *          This local portion is stored in the packed banded format
00063 *            used in LAPACK. Please see the Notes below and the
00064 *            ScaLAPACK manual for more detail on the format of
00065 *            distributed matrices.
00066 *          On exit, this array contains information containing details
00067 *            of the factorization.
00068 *          Note that permutations are performed on the matrix, so that
00069 *            the factors returned are different from those returned
00070 *            by LAPACK.
00071 *
00072 *  JA      (global input) INTEGER
00073 *          The index in the global array A that points to the start of
00074 *          the matrix to be operated on (which may be either all of A
00075 *          or a submatrix of A).
00076 *
00077 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00078 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00079 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00080 *          The array descriptor for the distributed matrix A.
00081 *          Contains information of mapping of A to memory. Please
00082 *          see NOTES below for full description and options.
00083 *
00084 *  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ).
00085 *          Pivot indices for local factorizations.
00086 *          Users *should not* alter the contents between
00087 *          factorization and solve.
00088 *
00089 *  AF      (local output) DOUBLE PRECISION array, dimension LAF.
00090 *          Auxiliary Fillin Space.
00091 *          Fillin is created during the factorization routine
00092 *          PDGBTRF and this is stored in AF. If a linear system
00093 *          is to be solved using PDGBTRS after the factorization
00094 *          routine, AF *must not be altered* after the factorization.
00095 *
00096 *  LAF     (local input) INTEGER
00097 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00098 *          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
00099 *          If LAF is not large enough, an error code will be returned
00100 *          and the minimum acceptable size will be returned in AF( 1 )
00101 *
00102 *  WORK    (local workspace/local output)
00103 *          DOUBLE PRECISION temporary workspace. This space may
00104 *          be overwritten in between calls to routines. WORK must be
00105 *          the size given in LWORK.
00106 *          On exit, WORK( 1 ) contains the minimal LWORK.
00107 *
00108 *  LWORK   (local input or global input) INTEGER
00109 *          Size of user-input workspace WORK.
00110 *          If LWORK is too small, the minimal acceptable size will be
00111 *          returned in WORK(1) and an error code is returned. LWORK>=
00112 *          1
00113 *
00114 *  INFO    (global output) INTEGER
00115 *          = 0:  successful exit
00116 *          < 0:  If the i-th argument is an array and the j-entry had
00117 *                an illegal value, then INFO = -(i*100+j), if the i-th
00118 *                argument is a scalar and had an illegal value, then
00119 *                INFO = -i.
00120 *          > 0:  If INFO = K<=NPROCS, the submatrix stored on processor
00121 *                INFO and factored locally was not
00122 *                nonsingular,  and
00123 *                the factorization was not completed.
00124 *                If INFO = K>NPROCS, the submatrix stored on processor
00125 *                INFO-NPROCS representing interactions with other
00126 *                processors was not
00127 *                nonsingular,
00128 *                and the factorization was not completed.
00129 *
00130 *  =====================================================================
00131 *
00132 *
00133 *  Restrictions
00134 *  ============
00135 *
00136 *  The following are restrictions on the input parameters. Some of these
00137 *    are temporary and will be removed in future releases, while others
00138 *    may reflect fundamental technical limitations.
00139 *
00140 *    Non-cyclic restriction: VERY IMPORTANT!
00141 *      P*NB>= mod(JA-1,NB)+N.
00142 *      The mapping for matrices must be blocked, reflecting the nature
00143 *      of the divide and conquer algorithm as a task-parallel algorithm.
00144 *      This formula in words is: no processor may have more than one
00145 *      chunk of the matrix.
00146 *
00147 *    Blocksize cannot be too small:
00148 *      If the matrix spans more than one processor, the following
00149 *      restriction on NB, the size of each block on each processor,
00150 *      must hold:
00151 *      NB >= (BWL+BWU)+1
00152 *      The bulk of parallel computation is done on the matrix of size
00153 *      O(NB) on each processor. If this is too small, divide and conquer
00154 *      is a poor choice of algorithm.
00155 *
00156 *    Submatrix reference:
00157 *      JA = IB
00158 *      Alignment restriction that prevents unnecessary communication.
00159 *
00160 *
00161 *  =====================================================================
00162 *
00163 *
00164 *  Notes
00165 *  =====
00166 *
00167 *  If the factorization routine and the solve routine are to be called
00168 *    separately (to solve various sets of righthand sides using the same
00169 *    coefficient matrix), the auxiliary space AF *must not be altered*
00170 *    between calls to the factorization routine and the solve routine.
00171 *
00172 *  The best algorithm for solving banded and tridiagonal linear systems
00173 *    depends on a variety of parameters, especially the bandwidth.
00174 *    Currently, only algorithms designed for the case N/P >> bw are
00175 *    implemented. These go by many names, including Divide and Conquer,
00176 *    Partitioning, domain decomposition-type, etc.
00177 *
00178 *  Algorithm description: Divide and Conquer
00179 *
00180 *    The Divide and Conqer algorithm assumes the matrix is narrowly
00181 *      banded compared with the number of equations. In this situation,
00182 *      it is best to distribute the input matrix A one-dimensionally,
00183 *      with columns atomic and rows divided amongst the processes.
00184 *      The basic algorithm divides the banded matrix up into
00185 *      P pieces with one stored on each processor,
00186 *      and then proceeds in 2 phases for the factorization or 3 for the
00187 *      solution of a linear system.
00188 *      1) Local Phase:
00189 *         The individual pieces are factored independently and in
00190 *         parallel. These factors are applied to the matrix creating
00191 *         fillin, which is stored in a non-inspectable way in auxiliary
00192 *         space AF. Mathematically, this is equivalent to reordering
00193 *         the matrix A as P A P^T and then factoring the principal
00194 *         leading submatrix of size equal to the sum of the sizes of
00195 *         the matrices factored on each processor. The factors of
00196 *         these submatrices overwrite the corresponding parts of A
00197 *         in memory.
00198 *      2) Reduced System Phase:
00199 *         A small (max(bwl,bwu)* (P-1)) system is formed representing
00200 *         interaction of the larger blocks, and is stored (as are its
00201 *         factors) in the space AF. A parallel Block Cyclic Reduction
00202 *         algorithm is used. For a linear system, a parallel front solve
00203 *         followed by an analagous backsolve, both using the structure
00204 *         of the factored matrix, are performed.
00205 *      3) Backsubsitution Phase:
00206 *         For a linear system, a local backsubstitution is performed on
00207 *         each processor in parallel.
00208 *
00209 *
00210 *  Descriptors
00211 *  ===========
00212 *
00213 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00214 *
00215 *  Note: banded codes can use either the old two dimensional
00216 *    or new one-dimensional descriptors, though the processor grid in
00217 *    both cases *must be one-dimensional*. We describe both types below.
00218 *
00219 *  Each global data object is described by an associated description
00220 *  vector.  This vector stores the information required to establish
00221 *  the mapping between an object element and its corresponding process
00222 *  and memory location.
00223 *
00224 *  Let A be a generic term for any 2D block cyclicly distributed array.
00225 *  Such a global array has an associated description vector DESCA.
00226 *  In the following comments, the character _ should be read as
00227 *  "of the global array".
00228 *
00229 *  NOTATION        STORED IN      EXPLANATION
00230 *  --------------- -------------- --------------------------------------
00231 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00232 *                                 DTYPE_A = 1.
00233 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00234 *                                 the BLACS process grid A is distribu-
00235 *                                 ted over. The context itself is glo-
00236 *                                 bal, but the handle (the integer
00237 *                                 value) may vary.
00238 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00239 *                                 array A.
00240 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00241 *                                 array A.
00242 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00243 *                                 the rows of the array.
00244 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00245 *                                 the columns of the array.
00246 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00247 *                                 row of the array A is distributed.
00248 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00249 *                                 first column of the array A is
00250 *                                 distributed.
00251 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00252 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00253 *
00254 *  Let K be the number of rows or columns of a distributed matrix,
00255 *  and assume that its process grid has dimension p x q.
00256 *  LOCr( K ) denotes the number of elements of K that a process
00257 *  would receive if K were distributed over the p processes of its
00258 *  process column.
00259 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00260 *  process would receive if K were distributed over the q processes of
00261 *  its process row.
00262 *  The values of LOCr() and LOCc() may be determined via a call to the
00263 *  ScaLAPACK tool function, NUMROC:
00264 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00265 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00266 *  An upper bound for these quantities may be computed by:
00267 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00268 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00269 *
00270 *
00271 *  One-dimensional descriptors:
00272 *
00273 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00274 *    version 1.0. They simplify and shorten the descriptor for 1D
00275 *    arrays.
00276 *
00277 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00278 *    object, we allow 1D arrays to be distributed either over the
00279 *    first dimension of the array (as if the grid were P-by-1) or the
00280 *    2nd dimension (as if the grid were 1-by-P). This choice is
00281 *    indicated by the descriptor type (501 or 502)
00282 *    as described below.
00283 *
00284 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00285 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00286 *    irrespective of which one-dimensional descriptor type
00287 *    (501 or 502) is input.
00288 *    This routine will interpret the grid properly either way.
00289 *    ScaLAPACK routines *do not support intercontext operations* so that
00290 *    the grid passed to a single ScaLAPACK routine *must be the same*
00291 *    for all array descriptors passed to that routine.
00292 *
00293 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00294 *    may also be used, since a one-dimensional array is a special case
00295 *    of a two-dimensional array with one dimension of size unity.
00296 *    The two-dimensional array used in this case *must* be of the
00297 *    proper orientation:
00298 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00299 *      (1 by P type), then the two dimensional descriptor must
00300 *      have a CTXT value that refers to a 1 by P BLACS grid;
00301 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00302 *      (P by 1 type), then the two dimensional descriptor must
00303 *      have a CTXT value that refers to a P by 1 BLACS grid.
00304 *
00305 *
00306 *  Summary of allowed descriptors, types, and BLACS grids:
00307 *  DTYPE           501         502         1         1
00308 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00309 *  -----------------------------------------------------
00310 *  A               OK          NO          OK        NO
00311 *  B               NO          OK          NO        OK
00312 *
00313 *  Let A be a generic term for any 1D block cyclicly distributed array.
00314 *  Such a global array has an associated description vector DESCA.
00315 *  In the following comments, the character _ should be read as
00316 *  "of the global array".
00317 *
00318 *  NOTATION        STORED IN  EXPLANATION
00319 *  --------------- ---------- ------------------------------------------
00320 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00321 *                                TYPE_A = 501: 1-by-P grid.
00322 *                                TYPE_A = 502: P-by-1 grid.
00323 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00324 *                                the BLACS process grid A is distribu-
00325 *                                ted over. The context itself is glo-
00326 *                                bal, but the handle (the integer
00327 *                                value) may vary.
00328 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00329 *                                distributed.
00330 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00331 *                                the distributed dimension of the array.
00332 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00333 *                                first row or column of the array
00334 *                                is distributed.
00335 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00336 *                                storing the local blocks of the distri-
00337 *                                buted array A. Minimum value of LLD_A
00338 *                                depends on TYPE_A.
00339 *                                TYPE_A = 501: LLD_A >=
00340 *                                   size of undistributed dimension, 1.
00341 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00342 *  Reserved        DESCA( 7 ) Reserved for future use.
00343 *
00344 *  =====================================================================
00345 *
00346 *  Implemented for ScaLAPACK by:
00347 *     Andrew J. Cleary, Livermore National Lab and University of Tenn.,
00348 *     and Markus Hegland, Australian National University. Feb., 1997.
00349 *  Based on code written by    : Peter Arbenz, ETH Zurich, 1996.
00350 *  Last modified by:  Peter Arbenz, Institute of Scientific Computing,
00351 *    ETH, Zurich.
00352 *
00353 *  =====================================================================
00354 *
00355 *     .. Parameters ..
00356       DOUBLE PRECISION   ONE
00357       PARAMETER          ( ONE = 1.0D+0 )
00358       DOUBLE PRECISION   ZERO
00359       PARAMETER          ( ZERO = 0.0D+0 )
00360       INTEGER            INT_ONE
00361       PARAMETER          ( INT_ONE = 1 )
00362       INTEGER            DESCMULT, BIGNUM
00363       PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
00364       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00365      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00366       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00367      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00368      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00369 *     ..
00370 *     .. Local Scalars ..
00371       INTEGER            APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW,
00372      $                   CSRC, DBPTR, FIRST_PROC, I, I1, I2, ICTXT,
00373      $                   ICTXT_NEW, ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR,
00374      $                   L, LAF_MIN, LBWL, LBWU, LDB, LDBB, LLDA, LM,
00375      $                   LMJ, LN, LNJ, LPTR, MYCOL, MYROW, MY_NUM_COLS,
00376      $                   NB, NEICOL, NP, NPACT, NPCOL, NPROW, NPSTR,
00377      $                   NP_SAVE, NRHS, ODD_N, ODD_SIZE, ODPTR, OFST,
00378      $                   PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_N_A,
00379      $                   WORK_SIZE_MIN
00380 *     ..
00381 *     .. Local Arrays ..
00382       INTEGER            DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
00383 *     ..
00384 *     .. External Subroutines ..
00385       EXTERNAL           BLACS_GRIDEXIT, BLACS_GRIDINFO, DESC_CONVERT,
00386      $                   DGBTRF, DGEMM, DGER, DGERV2D, DGESD2D, DGETRF,
00387      $                   DLAMOV, DLASWP, DLATCPY, DSWAP, DTRRV2D,
00388      $                   DTRSD2D, DTRSM, GLOBCHK, IGAMX2D, IGEBR2D,
00389      $                   IGEBS2D, PXERBLA, RESHAPE
00390 *     ..
00391 *     .. External Functions ..
00392       INTEGER            NUMROC
00393       EXTERNAL           NUMROC
00394 *     ..
00395 *     .. Intrinsic Functions ..
00396       INTRINSIC          MAX, MIN, MOD
00397 *     ..
00398 *     .. Executable Statements ..
00399 *
00400 *
00401 *     Test the input parameters
00402 *
00403       INFO = 0
00404 *
00405 *     Convert descriptor into standard form for easy access to
00406 *        parameters, check that grid is of right shape.
00407 *
00408       DESCA_1XP( 1 ) = 501
00409 *
00410       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00411 *
00412       IF( RETURN_CODE.NE.0 ) THEN
00413          INFO = -( 6*100+2 )
00414       END IF
00415 *
00416 *     Get values out of descriptor for use in code.
00417 *
00418       ICTXT = DESCA_1XP( 2 )
00419       CSRC = DESCA_1XP( 5 )
00420       NB = DESCA_1XP( 4 )
00421       LLDA = DESCA_1XP( 6 )
00422       STORE_N_A = DESCA_1XP( 3 )
00423 *
00424 *     Get grid parameters
00425 *
00426 *
00427       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00428       NP = NPROW*NPCOL
00429 *
00430 *
00431 *
00432       IF( LWORK.LT.-1 ) THEN
00433          INFO = -11
00434       ELSE IF( LWORK.EQ.-1 ) THEN
00435          IDUM3 = -1
00436       ELSE
00437          IDUM3 = 1
00438       END IF
00439 *
00440       IF( N.LT.0 ) THEN
00441          INFO = -1
00442       END IF
00443 *
00444       IF( N+JA-1.GT.STORE_N_A ) THEN
00445          INFO = -( 6*100+6 )
00446       END IF
00447 *
00448       IF( ( BWL.GT.N-1 ) .OR. ( BWL.LT.0 ) ) THEN
00449          INFO = -2
00450       END IF
00451 *
00452       IF( ( BWU.GT.N-1 ) .OR. ( BWU.LT.0 ) ) THEN
00453          INFO = -3
00454       END IF
00455 *
00456       IF( LLDA.LT.( 2*BWL+2*BWU+1 ) ) THEN
00457          INFO = -( 6*100+6 )
00458       END IF
00459 *
00460       IF( NB.LE.0 ) THEN
00461          INFO = -( 6*100+4 )
00462       END IF
00463 *
00464       BW = BWU + BWL
00465 *
00466 *     Argument checking that is specific to Divide & Conquer routine
00467 *
00468       IF( NPROW.NE.1 ) THEN
00469          INFO = -( 6*100+2 )
00470       END IF
00471 *
00472       IF( N.GT.NP*NB-MOD( JA-1, NB ) ) THEN
00473          INFO = -( 1 )
00474          CALL PXERBLA( ICTXT, 'PDGBTRF, D&C alg.: only 1 block per proc'
00475      $                 , -INFO )
00476          RETURN
00477       END IF
00478 *
00479       IF( ( JA+N-1.GT.NB ) .AND. ( NB.LT.( BWL+BWU+1 ) ) ) THEN
00480          INFO = -( 6*100+4 )
00481          CALL PXERBLA( ICTXT, 'PDGBTRF, D&C alg.: NB too small', -INFO )
00482          RETURN
00483       END IF
00484 *
00485 *
00486 *     Check auxiliary storage size
00487 *
00488       LAF_MIN = ( NB+BWU )*( BWL+BWU ) + 6*( BWL+BWU )*( BWL+2*BWU )
00489 *
00490       IF( LAF.LT.LAF_MIN ) THEN
00491          INFO = -9
00492 *        put minimum value of laf into AF( 1 )
00493          AF( 1 ) = LAF_MIN
00494          CALL PXERBLA( ICTXT, 'PDGBTRF: auxiliary storage error ',
00495      $                 -INFO )
00496          RETURN
00497       END IF
00498 *
00499 *     Check worksize
00500 *
00501       WORK_SIZE_MIN = 1
00502 *
00503       WORK( 1 ) = WORK_SIZE_MIN
00504 *
00505       IF( LWORK.LT.WORK_SIZE_MIN ) THEN
00506          IF( LWORK.NE.-1 ) THEN
00507             INFO = -11
00508 *        put minimum value of work into work( 1 )
00509             WORK( 1 ) = WORK_SIZE_MIN
00510             CALL PXERBLA( ICTXT, 'PDGBTRF: worksize error ', -INFO )
00511          END IF
00512          RETURN
00513       END IF
00514 *
00515 *     Pack params and positions into arrays for global consistency check
00516 *
00517       PARAM_CHECK( 9, 1 ) = DESCA( 5 )
00518       PARAM_CHECK( 8, 1 ) = DESCA( 4 )
00519       PARAM_CHECK( 7, 1 ) = DESCA( 3 )
00520       PARAM_CHECK( 6, 1 ) = DESCA( 1 )
00521       PARAM_CHECK( 5, 1 ) = JA
00522       PARAM_CHECK( 4, 1 ) = BWU
00523       PARAM_CHECK( 3, 1 ) = BWL
00524       PARAM_CHECK( 2, 1 ) = N
00525       PARAM_CHECK( 1, 1 ) = IDUM3
00526 *
00527       PARAM_CHECK( 9, 2 ) = 605
00528       PARAM_CHECK( 8, 2 ) = 604
00529       PARAM_CHECK( 7, 2 ) = 603
00530       PARAM_CHECK( 6, 2 ) = 601
00531       PARAM_CHECK( 5, 2 ) = 5
00532       PARAM_CHECK( 4, 2 ) = 3
00533       PARAM_CHECK( 3, 2 ) = 2
00534       PARAM_CHECK( 2, 2 ) = 1
00535       PARAM_CHECK( 1, 2 ) = 11
00536 *
00537 *     Want to find errors with MIN( ), so if no error, set it to a big
00538 *     number. If there already is an error, multiply by the the
00539 *     descriptor multiplier.
00540 *
00541       IF( INFO.GE.0 ) THEN
00542          INFO = BIGNUM
00543       ELSE IF( INFO.LT.-DESCMULT ) THEN
00544          INFO = -INFO
00545       ELSE
00546          INFO = -INFO*DESCMULT
00547       END IF
00548 *
00549 *     Check consistency across processors
00550 *
00551       CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9, PARAM_CHECK( 1, 3 ),
00552      $              INFO )
00553 *
00554 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00555 *     if error is not in a descriptor entry.
00556 *
00557       IF( INFO.EQ.BIGNUM ) THEN
00558          INFO = 0
00559       ELSE IF( MOD( INFO, DESCMULT ).EQ.0 ) THEN
00560          INFO = -INFO / DESCMULT
00561       ELSE
00562          INFO = -INFO
00563       END IF
00564 *
00565       IF( INFO.LT.0 ) THEN
00566          CALL PXERBLA( ICTXT, 'PDGBTRF', -INFO )
00567          RETURN
00568       END IF
00569 *
00570 *     Quick return if possible
00571 *
00572       IF( N.EQ.0 )
00573      $   RETURN
00574 *
00575 *
00576 *     Adjust addressing into matrix space to properly get into
00577 *        the beginning part of the relevant data
00578 *
00579       PART_OFFSET = NB*( ( JA-1 ) / ( NPCOL*NB ) )
00580 *
00581       IF( ( MYCOL-CSRC ).LT.( JA-PART_OFFSET-1 ) / NB ) THEN
00582          PART_OFFSET = PART_OFFSET + NB
00583       END IF
00584 *
00585       IF( MYCOL.LT.CSRC ) THEN
00586          PART_OFFSET = PART_OFFSET - NB
00587       END IF
00588 *
00589 *     Form a new BLACS grid (the "standard form" grid) with only procs
00590 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00591 *        starting at csrc=0, with JA modified to reflect dropped procs.
00592 *
00593 *     First processor to hold part of the matrix:
00594 *
00595       FIRST_PROC = MOD( ( JA-1 ) / NB+CSRC, NPCOL )
00596 *
00597 *     Calculate new JA one while dropping off unused processors.
00598 *
00599       JA_NEW = MOD( JA-1, NB ) + 1
00600 *
00601 *     Save and compute new value of NP
00602 *
00603       NP_SAVE = NP
00604       NP = ( JA_NEW+N-2 ) / NB + 1
00605 *
00606 *     Call utility routine that forms "standard-form" grid
00607 *
00608       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE, FIRST_PROC,
00609      $              INT_ONE, NP )
00610 *
00611 *     Use new context from standard grid as context.
00612 *
00613       ICTXT_SAVE = ICTXT
00614       ICTXT = ICTXT_NEW
00615       DESCA_1XP( 2 ) = ICTXT_NEW
00616 *
00617 *     Get information about new grid.
00618 *
00619       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00620 *
00621 *     Drop out processors that do not have part of the matrix.
00622 *
00623       IF( MYROW.LT.0 ) THEN
00624          GO TO 210
00625       END IF
00626 *
00627 *     ********************************
00628 *     Values reused throughout routine
00629 *
00630 *     User-input value of partition size
00631 *
00632       PART_SIZE = NB
00633 *
00634 *     Number of columns in each processor
00635 *
00636       MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
00637 *
00638 *     Offset in columns to beginning of main partition in each proc
00639 *
00640       IF( MYCOL.EQ.0 ) THEN
00641          PART_OFFSET = PART_OFFSET + MOD( JA_NEW-1, PART_SIZE )
00642          MY_NUM_COLS = MY_NUM_COLS - MOD( JA_NEW-1, PART_SIZE )
00643       END IF
00644 *
00645 *     Offset in elements
00646 *
00647       OFST = PART_OFFSET*LLDA
00648 *
00649 *     Size of main (or odd) partition in each processor
00650 *
00651       ODD_SIZE = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
00652 *
00653 *
00654 *     Zero out space for fillin
00655 *
00656       DO 10 I = 1, LAF_MIN
00657          AF( I ) = ZERO
00658    10 CONTINUE
00659 *
00660       DO 30 J = 1, ODD_SIZE
00661          DO 20 I = 1, BW
00662             A( I+( J-1 )*LLDA ) = ZERO
00663    20    CONTINUE
00664    30 CONTINUE
00665 *
00666 *     Begin main code
00667 *
00668 ********************************************************************
00669 *     PHASE 1: Local computation phase.
00670 ********************************************************************
00671 *
00672 *
00673 *     Transfer triangle B_i of local matrix to next processor
00674 *     for fillin. Overlap the send with the factorization of A_i.
00675 *
00676       IF( MYCOL.LE.NPCOL-2 ) THEN
00677 *
00678 *     The last processor does not need to send anything.
00679 *     BIPTR = location of triangle B_i in memory
00680          BIPTR = ( NB-BW )*LLDA + 2*BW + 1
00681 *
00682          CALL DTRSD2D( ICTXT, 'U', 'N',
00683      $                 MIN( BW, BWU+NUMROC( N, NB, MYCOL+1, 0,
00684      $                 NPCOL ) ), BW, A( BIPTR ), LLDA-1, 0, MYCOL+1 )
00685 *
00686       END IF
00687 *
00688 *     Factor main partition P_i A_i = L_i U_i on each processor
00689 *
00690 *     LBWL, LBWU: lower and upper bandwidth of local solver
00691 *     Note that for MYCOL > 0 one has lower triangular blocks!
00692 *     LM is the number of rows which is usually NB except for
00693 *     MYCOL = 0 where it is BWU less and MYCOL=NPCOL-1 where it
00694 *     is NR+BWU where NR is the number of columns on the last processor
00695 *     Finally APTR is the pointer to the first element of A. As LAPACK
00696 *     has a slightly different matrix format than Scalapack the pointer
00697 *     has to be adjusted on processor MYCOL=0.
00698 *
00699       IF( MYCOL.NE.0 ) THEN
00700          LBWL = BW
00701          LBWU = 0
00702          APTR = 1
00703       ELSE
00704          LBWL = BWL
00705          LBWU = BWU
00706          APTR = 1 + BWU
00707       END IF
00708 *
00709       IF( MYCOL.NE.NPCOL-1 ) THEN
00710          LM = NB - LBWU
00711          LN = NB - BW
00712       ELSE IF( MYCOL.NE.0 ) THEN
00713          LM = ODD_SIZE + BWU
00714          LN = MAX( ODD_SIZE-BW, 0 )
00715       ELSE
00716          LM = N
00717          LN = MAX( N-BW, 0 )
00718       END IF
00719 *
00720       IF( LN.GT.0 ) THEN
00721 *
00722          CALL DGBTRF( LM, LN, LBWL, LBWU, A( APTR ), LLDA, IPIV, INFO )
00723 *
00724          IF( INFO.NE.0 ) THEN
00725             INFO = INFO + NB*MYCOL
00726             GO TO 80
00727          END IF
00728 *
00729          NRHS = BW
00730          LDB = LLDA - 1
00731 *
00732 *     Update the last BW columns of A_i (code modified from DGBTRS)
00733 *
00734 *     Only the eliminations of unknowns > LN-BW have an effect on
00735 *     the last BW columns. Loop over them...
00736 *
00737          DO 40 J = MAX( LN-BW+1, 1 ), LN
00738 *
00739             LMJ = MIN( LBWL, LM-J )
00740             LNJ = MIN( BW, J+BW-LN+APTR-1 )
00741 *
00742             L = IPIV( J )
00743 *
00744             JPTR = J - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA
00745 *
00746             IF( L.NE.J ) THEN
00747 *
00748 *        Element (L,LN+1) is swapped with element (J,LN+1) etc
00749 *        Furthermore, the elements in the same row are LDB=LLDA-1 apart
00750 *        The complicated formulas are to cope with the banded
00751 *          data format:
00752 *
00753                LPTR = L - ( LN+1 ) + 2*BW + 1 - LBWL + LN*LLDA
00754 *
00755                CALL DSWAP( LNJ, A( LPTR ), LDB, A( JPTR ), LDB )
00756 *
00757             END IF
00758 *
00759 *              LPTR is the pointer to the beginning of the
00760 *                 coefficients of L
00761 *
00762             LPTR = BW + 1 + APTR + ( J-1 )*LLDA
00763 *
00764             CALL DGER( LMJ, LNJ, -ONE, A( LPTR ), 1, A( JPTR ), LDB,
00765      $                 A( JPTR+1 ), LDB )
00766    40    CONTINUE
00767 *
00768       END IF
00769 *
00770 *      Compute spike fill-in, L_i F_i = P_i B_{i-1}
00771 *
00772 *      Receive triangle B_{i-1} from previous processor
00773 *
00774       IF( MYCOL.GT.0 ) THEN
00775          CALL DTRRV2D( ICTXT, 'U', 'N', MIN( BW, LM ), BW, AF( 1 ), BW,
00776      $                 0, MYCOL-1 )
00777 *
00778 *        Transpose transmitted upper triangular (trapezoidal) matrix
00779 *
00780          DO 60 I2 = 1, MIN( BW, LM )
00781             DO 50 I1 = I2 + 1, BW
00782                AF( I1+( I2-1 )*BW ) = AF( I2+( I1-1 )*BW )
00783                AF( I2+( I1-1 )*BW ) = ZERO
00784    50       CONTINUE
00785    60    CONTINUE
00786 *
00787 *      Permutation and forward elimination (triang. solve)
00788 *
00789          DO 70 J = 1, LN
00790 *
00791             LMJ = MIN( LBWL, LM-J )
00792             L = IPIV( J )
00793 *
00794             IF( L.NE.J ) THEN
00795                CALL DSWAP( BW, AF( ( L-1 )*BW+1 ), 1,
00796      $                     AF( ( J-1 )*BW+1 ), 1 )
00797             END IF
00798 *
00799             LPTR = BW + 1 + APTR + ( J-1 )*LLDA
00800 *
00801             CALL DGER( NRHS, LMJ, -ONE, AF( ( J-1 )*BW+1 ), 1,
00802      $                 A( LPTR ), 1, AF( J*BW+1 ), BW )
00803 *
00804    70    CONTINUE
00805 *
00806       END IF
00807 *
00808    80 CONTINUE
00809 *
00810 ********************************************************************
00811 *     PHASE 2: Formation and factorization of Reduced System.
00812 ********************************************************************
00813 *
00814 *     Define the initial dimensions of the diagonal blocks
00815 *     The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
00816 *
00817       IF( MYCOL.NE.NPCOL-1 ) THEN
00818          BM = BW - LBWU
00819          BN = BW
00820       ELSE
00821          BM = MIN( BW, ODD_SIZE ) + BWU
00822          BN = MIN( BW, ODD_SIZE )
00823       END IF
00824 *
00825 *     Pointer to first element of block bidiagonal matrix in AF
00826 *     Leading dimension of block bidiagonal system
00827 *
00828       BBPTR = ( NB+BWU )*BW + 1
00829       LDBB = 2*BW + BWU
00830 *
00831 *     Copy from A and AF into block bidiagonal matrix (tail of AF)
00832 *
00833 *     DBPTR = Pointer to diagonal blocks in A
00834       DBPTR = BW + 1 + LBWU + LN*LLDA
00835 *
00836       CALL DLAMOV( 'G', BM, BN, A( DBPTR ), LLDA-1, AF( BBPTR+BW*LDBB ),
00837      $             LDBB )
00838 *
00839 *     Zero out any junk entries that were copied
00840 *
00841       DO 100 J = 1, BM
00842          DO 90 I = J + LBWL, BM - 1
00843             AF( BBPTR+BW*LDBB+( J-1 )*LDBB+I ) = ZERO
00844    90    CONTINUE
00845   100 CONTINUE
00846 *
00847       IF( MYCOL.NE.0 ) THEN
00848 *
00849 *        ODPTR = Pointer to offdiagonal blocks in A
00850 *
00851          ODPTR = ( LM-BM )*BW + 1
00852          CALL DLATCPY( 'G', BW, BM, AF( ODPTR ), BW,
00853      $                 AF( BBPTR+2*BW*LDBB ), LDBB )
00854       END IF
00855 *
00856       IF( NPCOL.EQ.1 ) THEN
00857 *
00858 *        In this case the loop over the levels will not be
00859 *        performed.
00860          CALL DGETRF( N-LN, N-LN, AF( BBPTR+BW*LDBB ), LDBB,
00861      $                IPIV( LN+1 ), INFO )
00862 *
00863       END IF
00864 *
00865 *     Loop over levels ... only occurs if npcol > 1
00866 *
00867 *     The two integers NPACT (nu. of active processors) and NPSTR
00868 *     (stride between active processors) are used to control the
00869 *     loop.
00870 *
00871       NPACT = NPCOL
00872       NPSTR = 1
00873 *
00874 *       Begin loop over levels
00875 *
00876   110 CONTINUE
00877       IF( NPACT.LE.1 )
00878      $   GO TO 190
00879 *
00880 *         Test if processor is active
00881 *
00882       IF( MOD( MYCOL, NPSTR ).EQ.0 ) THEN
00883 *
00884 *   Send/Receive blocks
00885 *
00886 *
00887          IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN
00888 *
00889 *            This node will potentially do more work later
00890 *
00891             NEICOL = MYCOL + NPSTR
00892 *
00893             IF( NEICOL / NPSTR.LT.NPACT-1 ) THEN
00894                BMN = BW
00895             ELSE IF( NEICOL / NPSTR.EQ.NPACT-1 ) THEN
00896                ODD_N = NUMROC( N, NB, NPCOL-1, 0, NPCOL )
00897                BMN = MIN( BW, ODD_N ) + BWU
00898             ELSE
00899 *
00900 *                  Last processor skips to next level
00901                GO TO 180
00902             END IF
00903 *
00904 *               BM1 = M for 1st block on proc pair, BM2 2nd block
00905 *
00906             BM1 = BM
00907             BM2 = BMN
00908 *
00909             IF( NEICOL / NPSTR.LE.NPACT-1 ) THEN
00910 *
00911                CALL DGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB,
00912      $                       0, NEICOL )
00913 *
00914                CALL DGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BM ), LDBB, 0,
00915      $                       NEICOL )
00916 *
00917                IF( NPACT.EQ.2 ) THEN
00918 *
00919 *                     Copy diagonal block to align whole system
00920 *
00921                   CALL DLAMOV( 'G', BMN, BW, AF( BBPTR+BM ), LDBB,
00922      $                         AF( BBPTR+2*BW*LDBB+BM ), LDBB )
00923                END IF
00924 *
00925             END IF
00926 *
00927          ELSE
00928 *
00929 *               This node stops work after this stage -- an extra copy
00930 *               is required to make the odd and even frontal matrices
00931 *               look identical
00932 *
00933             NEICOL = MYCOL - NPSTR
00934 *
00935             IF( NEICOL.EQ.0 ) THEN
00936                BMN = BW - BWU
00937             ELSE
00938                BMN = BW
00939             END IF
00940 *
00941             BM1 = BMN
00942             BM2 = BM
00943 *
00944             CALL DGESD2D( ICTXT, BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB, 0,
00945      $                    NEICOL )
00946 *
00947             CALL DLAMOV( 'G', BM, 2*BW, AF( BBPTR+BW*LDBB ), LDBB,
00948      $                   AF( BBPTR+BMN ), LDBB )
00949 *
00950             DO 130 J = BBPTR + 2*BW*LDBB, BBPTR + 3*BW*LDBB - 1, LDBB
00951                DO 120 I = 0, LDBB - 1
00952                   AF( I+J ) = ZERO
00953   120          CONTINUE
00954   130       CONTINUE
00955 *
00956             CALL DGERV2D( ICTXT, BMN, 2*BW, AF( BBPTR+BW*LDBB ), LDBB,
00957      $                    0, NEICOL )
00958 *
00959             IF( NPACT.EQ.2 ) THEN
00960 *
00961 *                  Copy diagonal block to align whole system
00962 *
00963                CALL DLAMOV( 'G', BM, BW, AF( BBPTR+BMN ), LDBB,
00964      $                      AF( BBPTR+2*BW*LDBB+BMN ), LDBB )
00965             END IF
00966 *
00967          END IF
00968 *
00969 *            LU factorization with partial pivoting
00970 *
00971          IF( NPACT.NE.2 ) THEN
00972 *
00973             CALL DGETRF( BM+BMN, BW, AF( BBPTR+BW*LDBB ), LDBB,
00974      $                   IPIV( LN+1 ), INFO )
00975 *
00976 *   Backsolve left side
00977 *
00978             DO 150 J = BBPTR, BBPTR + BW*LDBB - 1, LDBB
00979                DO 140 I = 0, BM1 - 1
00980                   AF( I+J ) = ZERO
00981   140          CONTINUE
00982   150       CONTINUE
00983 *
00984             CALL DLASWP( BW, AF( BBPTR ), LDBB, 1, BW, IPIV( LN+1 ), 1 )
00985 *
00986             CALL DTRSM( 'L', 'L', 'N', 'U', BW, BW, ONE,
00987      $                  AF( BBPTR+BW*LDBB ), LDBB, AF( BBPTR ), LDBB )
00988 *
00989 *               Use partial factors to update remainder
00990 *
00991             CALL DGEMM( 'N', 'N', BM+BMN-BW, BW, BW, -ONE,
00992      $                  AF( BBPTR+BW*LDBB+BW ), LDBB, AF( BBPTR ), LDBB,
00993      $                  ONE, AF( BBPTR+BW ), LDBB )
00994 *
00995 *   Backsolve right side
00996 *
00997             NRHS = BW
00998 *
00999             CALL DLASWP( NRHS, AF( BBPTR+2*BW*LDBB ), LDBB, 1, BW,
01000      $                   IPIV( LN+1 ), 1 )
01001 *
01002             CALL DTRSM( 'L', 'L', 'N', 'U', BW, NRHS, ONE,
01003      $                  AF( BBPTR+BW*LDBB ), LDBB,
01004      $                  AF( BBPTR+2*BW*LDBB ), LDBB )
01005 *
01006 *               Use partial factors to update remainder
01007 *
01008             CALL DGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW, -ONE,
01009      $                  AF( BBPTR+BW*LDBB+BW ), LDBB,
01010      $                  AF( BBPTR+2*BW*LDBB ), LDBB, ONE,
01011      $                  AF( BBPTR+2*BW*LDBB+BW ), LDBB )
01012 *
01013 *
01014 *     Test if processor is active in next round
01015 *
01016             IF( MOD( MYCOL, 2*NPSTR ).EQ.0 ) THEN
01017 *
01018 *                  Reset BM
01019 *
01020                BM = BM1 + BM2 - BW
01021 *
01022 *                  Local copying in the block bidiagonal area
01023 *
01024 *
01025                CALL DLAMOV( 'G', BM, BW, AF( BBPTR+BW ), LDBB,
01026      $                      AF( BBPTR+BW*LDBB ), LDBB )
01027                CALL DLAMOV( 'G', BM, BW, AF( BBPTR+2*BW*LDBB+BW ), LDBB,
01028      $                      AF( BBPTR+2*BW*LDBB ), LDBB )
01029 *
01030 *                  Zero out space that held original copy
01031 *
01032                DO 170 J = 0, BW - 1
01033                   DO 160 I = 0, BM - 1
01034                      AF( BBPTR+2*BW*LDBB+BW+J*LDBB+I ) = ZERO
01035   160             CONTINUE
01036   170          CONTINUE
01037 *
01038             END IF
01039 *
01040          ELSE
01041 *
01042 *               Factor the final 2 by 2 block matrix
01043 *
01044             CALL DGETRF( BM+BMN, BM+BMN, AF( BBPTR+BW*LDBB ), LDBB,
01045      $                   IPIV( LN+1 ), INFO )
01046          END IF
01047 *
01048       END IF
01049 *
01050 *        Last processor in an odd-sized NPACT skips to here
01051 *
01052   180 CONTINUE
01053 *
01054       NPACT = ( NPACT+1 ) / 2
01055       NPSTR = NPSTR*2
01056       GO TO 110
01057 *
01058   190 CONTINUE
01059 *     End loop over levels
01060 *
01061   200 CONTINUE
01062 *     If error was found in Phase 1, processors jump here.
01063 *
01064 *     Free BLACS space used to hold standard-form grid.
01065 *
01066       ICTXT = ICTXT_SAVE
01067       IF( ICTXT.NE.ICTXT_NEW ) THEN
01068          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01069       END IF
01070 *
01071   210 CONTINUE
01072 *     If this processor did not hold part of the grid it
01073 *        jumps here.
01074 *
01075 *     Restore saved input parameters
01076 *
01077       ICTXT = ICTXT_SAVE
01078       NP = NP_SAVE
01079 *
01080 *     Output worksize
01081 *
01082       WORK( 1 ) = WORK_SIZE_MIN
01083 *
01084 *         Make INFO consistent across processors
01085 *
01086       CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO, -1, 0,
01087      $              0 )
01088 *
01089       IF( MYCOL.EQ.0 ) THEN
01090          CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
01091       ELSE
01092          CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
01093       END IF
01094 *
01095 *
01096       RETURN
01097 *
01098 *     End of PDGBTRF
01099 *
01100       END