ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CptrsmB.c
Go to the documentation of this file.
00001 /* ---------------------------------------------------------------------
00002 *
00003 *  -- PBLAS auxiliary routine (version 2.0) --
00004 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
00005 *     and University of California, Berkeley.
00006 *     April 1, 1998
00007 *
00008 *  ---------------------------------------------------------------------
00009 */
00010 /*
00011 *  Include files
00012 */
00013 #include "../pblas.h"
00014 #include "../PBpblas.h"
00015 #include "../PBtools.h"
00016 #include "../PBblacs.h"
00017 #include "../PBblas.h"
00018 
00019 #ifdef __STDC__
00020 void PB_CptrsmB( PBTYP_T * TYPE, char * DIRECB, char * SIDE,
00021                  char * UPLO, char * TRANSA, char * DIAG, int M, int N,
00022                  char * ALPHA, char * A, int IA, int JA, int * DESCA,
00023                  char * B, int IB, int JB, int * DESCB )
00024 #else
00025 void PB_CptrsmB( TYPE, DIRECB, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A,
00026                  IA, JA, DESCA, B, IB, JB, DESCB )
00027 /*
00028 *  .. Scalar Arguments ..
00029 */
00030    char           * DIAG, * DIRECB, * SIDE, * TRANSA, * UPLO;
00031    int            IA, IB, JA, JB, M, N;
00032    char           * ALPHA;
00033    PBTYP_T        * TYPE;
00034 /*
00035 *  .. Array Arguments ..
00036 */
00037    int            * DESCA, * DESCB;
00038    char           * A, * B;
00039 #endif
00040 {
00041 /*
00042 *  Purpose
00043 *  =======
00044 *
00045 *  PB_CptrsmB  solves one of the matrix equations
00046 *
00047 *     op( sub( A ) )*X = alpha*sub( B ),   or
00048 *
00049 *     X*op( sub( A ) ) = alpha*sub( B ),
00050 *
00051 *  where
00052 *
00053 *     sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
00054 *                      A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R', and,
00055 *
00056 *     sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
00057 *
00058 *  Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is
00059 *  a unit, or non-unit, upper or lower  triangular submatrix and op( Y )
00060 *  is one of
00061 *
00062 *     op( Y ) = Y   or   op( Y ) = Y'   or   op( Y ) = conjg( Y' ).
00063 *
00064 *  The submatrix X is overwritten on sub( B ).
00065 *
00066 *  This is the inner-product algorithm  using  the  logical  LCM  hybrid
00067 *  and static blocking techniques. The submatrix operand  sub( A ) stays
00068 *  in place.
00069 *
00070 *  Notes
00071 *  =====
00072 *
00073 *  A description  vector  is associated with each 2D block-cyclicly dis-
00074 *  tributed matrix.  This  vector  stores  the  information  required to
00075 *  establish the  mapping  between a  matrix entry and its corresponding
00076 *  process and memory location.
00077 *
00078 *  In  the  following  comments,   the character _  should  be  read  as
00079 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
00080 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
00081 *
00082 *  NOTATION         STORED IN       EXPLANATION
00083 *  ---------------- --------------- ------------------------------------
00084 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
00085 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
00086 *                                   the NPROW x NPCOL BLACS process grid
00087 *                                   A  is  distributed over. The context
00088 *                                   itself  is  global,  but  the handle
00089 *                                   (the integer value) may vary.
00090 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
00091 *                                   ted matrix A, M_A >= 0.
00092 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
00093 *                                   buted matrix A, N_A >= 0.
00094 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
00095 *                                   block of the matrix A, IMB_A > 0.
00096 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
00097 *                                   left   block   of   the  matrix   A,
00098 *                                   INB_A > 0.
00099 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
00100 *                                   bute the last  M_A-IMB_A  rows of A,
00101 *                                   MB_A > 0.
00102 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
00103 *                                   bute the last  N_A-INB_A  columns of
00104 *                                   A, NB_A > 0.
00105 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
00106 *                                   row of the matrix  A is distributed,
00107 *                                   NPROW > RSRC_A >= 0.
00108 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
00109 *                                   first column of  A  is  distributed.
00110 *                                   NPCOL > CSRC_A >= 0.
00111 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
00112 *                                   array  storing  the  local blocks of
00113 *                                   the distributed matrix A,
00114 *                                   IF( Lc( 1, N_A ) > 0 )
00115 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
00116 *                                   ELSE
00117 *                                      LLD_A >= 1.
00118 *
00119 *  Let K be the number of  rows of a matrix A starting at the global in-
00120 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
00121 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
00122 *  receive if these K rows were distributed over NPROW processes.  If  K
00123 *  is the number of columns of a matrix  A  starting at the global index
00124 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
00125 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
00126 *  these K columns were distributed over NPCOL processes.
00127 *
00128 *  The values of Lr() and Lc() may be determined via a call to the func-
00129 *  tion PB_Cnumroc:
00130 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
00131 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
00132 *
00133 *  Arguments
00134 *  =========
00135 *
00136 *  TYPE    (local input) pointer to a PBTYP_T structure
00137 *          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
00138 *          that contains type information (See pblas.h).
00139 *
00140 *  DIRECB  (global input) pointer to CHAR
00141 *          On entry, DIRECB  specifies  the direction in which the  rows
00142 *          or columns of sub( B ) should be looped  over as follows:
00143 *             DIRECB = 'F' or 'f'   forward  or increasing,
00144 *             DIRECB = 'B' or 'b'   backward or decreasing.
00145 *
00146 *  SIDE    (global input) pointer to CHAR
00147 *          On entry,  SIDE  specifies  whether op( sub( A ) ) appears on
00148 *          the left or right of X as follows:
00149 *
00150 *             SIDE = 'L' or 'l'   op( sub( A ) )*X = alpha*sub( B ),
00151 *
00152 *             SIDE = 'R' or 'r'   X*op( sub( A ) ) = alpha*sub( B ).
00153 *
00154 *  UPLO    (global input) pointer to CHAR
00155 *          On entry,  UPLO  specifies whether the submatrix  sub( A ) is
00156 *          an upper or lower triangular submatrix as follows:
00157 *
00158 *             UPLO = 'U' or 'u'   sub( A ) is an upper triangular
00159 *                                 submatrix,
00160 *
00161 *             UPLO = 'L' or 'l'   sub( A ) is a  lower triangular
00162 *                                 submatrix.
00163 *
00164 *  TRANSA  (global input) pointer to CHAR
00165 *          On entry,  TRANSA  specifies the form of op( sub( A ) ) to be
00166 *          used in the matrix multiplication as follows:
00167 *
00168 *             TRANSA = 'N' or 'n'   op( sub( A ) ) = sub( A ),
00169 *
00170 *             TRANSA = 'T' or 't'   op( sub( A ) ) = sub( A )',
00171 *
00172 *             TRANSA = 'C' or 'c'   op( sub( A ) ) = conjg( sub( A )' ).
00173 *
00174 *  DIAG    (global input) pointer to CHAR
00175 *          On entry,  DIAG  specifies  whether or not  sub( A )  is unit
00176 *          triangular as follows:
00177 *
00178 *             DIAG = 'U' or 'u'  sub( A )  is  assumed to be unit trian-
00179 *                                gular,
00180 *
00181 *             DIAG = 'N' or 'n'  sub( A ) is not assumed to be unit tri-
00182 *                                angular.
00183 *
00184 *  M       (global input) INTEGER
00185 *          On entry,  M  specifies the number of rows of  the  submatrix
00186 *          sub( B ). M  must be at least zero.
00187 *
00188 *  N       (global input) INTEGER
00189 *          On entry, N  specifies the number of columns of the submatrix
00190 *          sub( B ). N  must be at least zero.
00191 *
00192 *  ALPHA   (global input) pointer to CHAR
00193 *          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
00194 *          supplied  as  zero  then  the  local entries of  the array  B
00195 *          corresponding to the entries of the submatrix  sub( B )  need
00196 *          not be set on input.
00197 *
00198 *  A       (local input) pointer to CHAR
00199 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
00200 *          at least Lc( 1, JA+M-1 ) when  SIDE = 'L' or 'l'   and  is at
00201 *          least Lc( 1, JA+N-1 ) otherwise.  Before  entry,  this  array
00202 *          contains the local entries of the matrix A.
00203 *          Before entry with  UPLO = 'U' or 'u', this array contains the
00204 *          local entries corresponding to  the entries of the upper tri-
00205 *          angular submatrix  sub( A ), and the local entries correspon-
00206 *          ding to the entries of the strictly lower triangular part  of
00207 *          the submatrix  sub( A )  are not referenced.
00208 *          Before entry with  UPLO = 'L' or 'l', this array contains the
00209 *          local entries corresponding to  the entries of the lower tri-
00210 *          angular submatrix  sub( A ), and the local entries correspon-
00211 *          ding to the entries of the strictly upper triangular part  of
00212 *          the submatrix  sub( A )  are not referenced.
00213 *          Note  that  when DIAG = 'U' or 'u', the local entries corres-
00214 *          ponding to the  diagonal elements  of the submatrix  sub( A )
00215 *          are not referenced either, but are assumed to be unity.
00216 *
00217 *  IA      (global input) INTEGER
00218 *          On entry, IA  specifies A's global row index, which points to
00219 *          the beginning of the submatrix sub( A ).
00220 *
00221 *  JA      (global input) INTEGER
00222 *          On entry, JA  specifies A's global column index, which points
00223 *          to the beginning of the submatrix sub( A ).
00224 *
00225 *  DESCA   (global and local input) INTEGER array
00226 *          On entry, DESCA  is an integer array of dimension DLEN_. This
00227 *          is the array descriptor for the matrix A.
00228 *
00229 *  B       (local input/local output) pointer to CHAR
00230 *          On entry, B is an array of dimension (LLD_B, Kb), where Kb is
00231 *          at least Lc( 1, JB+N-1 ).  Before  entry, this array contains
00232 *          the local entries of the matrix  B.
00233 *          On exit, the local entries of this array corresponding to the
00234 *          to  the entries of the submatrix sub( B ) are  overwritten by
00235 *          the local entries of the m by n  solution submatrix.
00236 *
00237 *  IB      (global input) INTEGER
00238 *          On entry, IB  specifies B's global row index, which points to
00239 *          the beginning of the submatrix sub( B ).
00240 *
00241 *  JB      (global input) INTEGER
00242 *          On entry, JB  specifies B's global column index, which points
00243 *          to the beginning of the submatrix sub( B ).
00244 *
00245 *  DESCB   (global and local input) INTEGER array
00246 *          On entry, DESCB  is an integer array of dimension DLEN_. This
00247 *          is the array descriptor for the matrix B.
00248 *
00249 *  -- Written on April 1, 1998 by
00250 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
00251 *
00252 *  ---------------------------------------------------------------------
00253 */
00254 /*
00255 *  .. Local Scalars ..
00256 */
00257    char           Broc, TranOp, conjg, * negone, * one, * talpha, * talph0, top,
00258                   * zero;
00259    int            Acol, Aii, Aimb1, Ainb1, Ajj, Akp, Akq, Alcmb, Ald, Amb, An,
00260                   Anb, Anp, Anp0, Anq, Anq0, Arow, Asrc, Astart, BcurrocR, Bfwd,
00261                   BiiD, BiiR, Binb1D, Binb1R, BisR, Bld, BmyprocD, BmyprocR,
00262                   BnD, BnR, BnbD, BnbR, BnpR, BnprocsD, BnprocsR, BrocD, BrocR,
00263                   BsrcR, LNorRT, WBCfr, WBCld, WBCapbX, WBCsum, WBRfr, WBRld,
00264                   WBRapbX, WBRsum, ctxt, izero=0, k, kb, kbnext, kbprev, ktmp,
00265                   lside, mycol, myrow, n, nb, nbb, notran, npcol, nprow, p=0,
00266                   size, tmp, upper;
00267    TZPAD_T        pad;
00268    GEMM_T         gemm;
00269    GSUM2D_T       gsum2d;
00270 /*
00271 *  .. Local Arrays ..
00272 */
00273    int            Ad0[DLEN_], DBUFB[DLEN_], WBCd[DLEN_], WBRd[DLEN_];
00274    char           * Aptr = NULL, * Bptr = NULL, * WBC = NULL, * WBR = NULL;
00275 /* ..
00276 *  .. Executable Statements ..
00277 *
00278 */
00279 /*
00280 *  Retrieve process grid information
00281 */
00282    Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
00283 
00284    Bfwd   = ( Mupcase( DIRECB[0] ) == CFORWARD );
00285    lside  = ( Mupcase( SIDE  [0] ) == CLEFT    );
00286    upper  = ( Mupcase( UPLO  [0] ) == CUPPER   );
00287    notran = ( ( TranOp = Mupcase( TRANSA[0] ) ) == CNOTRAN  );
00288    LNorRT = ( lside && notran ) || ( !( lside ) && !( notran ) );
00289 
00290    size   = TYPE->size;
00291    one    = TYPE->one;    zero = TYPE->zero;  negone = TYPE->negone;
00292    pad    = TYPE->Ftzpad; gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
00293    nb     = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
00294 /*
00295 *  Compute local information for sub( A ) and sub( B )
00296 */
00297    if( lside )
00298    {
00299       BnD = An = M;            BnR      = N;           Broc  = CCOLUMN;
00300       BmyprocD = myrow;        BnprocsD = nprow;
00301       BmyprocR = mycol;        BnprocsR = npcol;
00302       BnbD     = DESCB[MB_  ]; BnbR     = DESCB[NB_ ];
00303       BsrcR    = DESCB[CSRC_]; Bld      = DESCB[LLD_];
00304       PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
00305                    &BiiD, &BiiR, &BrocD, &BrocR );
00306       Binb1D   = PB_Cfirstnb( BnD, IB, DESCB[IMB_], BnbD );
00307       Binb1R   = PB_Cfirstnb( BnR, JB, DESCB[INB_], BnbR );
00308    }
00309    else
00310    {
00311       BnD = An = N;            BnR      = M;           Broc  = CROW;
00312       BmyprocD = mycol;        BnprocsD = npcol;
00313       BmyprocR = myrow;        BnprocsR = nprow;
00314       BnbR     = DESCB[MB_  ]; BnbD     = DESCB[NB_ ];
00315       BsrcR    = DESCB[RSRC_]; Bld      = DESCB[LLD_];
00316       PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
00317                    &BiiR, &BiiD, &BrocR, &BrocD );
00318       Binb1D   = PB_Cfirstnb( BnD, JB, DESCB[INB_], BnbD );
00319       Binb1R   = PB_Cfirstnb( BnR, IB, DESCB[IMB_], BnbR );
00320    }
00321 /*
00322 *  Compute descriptor Ad0 for sub( A )
00323 */
00324    PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
00325                  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
00326 /*
00327 *  Compute conjugate of alpha for the conjugate transpose cases
00328 */
00329    if( TranOp == CCOTRAN )
00330    {
00331       conjg = CCONJG; talpha = PB_Cmalloc( size );
00332       PB_Cconjg( TYPE, ALPHA, talpha );
00333    }
00334    else { conjg = CNOCONJG; talpha = ALPHA; }
00335 /*
00336 *  Retrieve BLACS combine topology, select backward ot forward substitution.
00337 */
00338    if( LNorRT )
00339    {
00340       top    = *PB_Ctop( &ctxt, COMBINE, ROW,    TOP_GET );
00341       Astart = ( upper ? An - 1 : 0 );
00342    }
00343    else
00344    {
00345       top    = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
00346       Astart = ( upper ? 0 : An - 1 );
00347    }
00348 /*
00349 *  Computational partitioning size is computed as the product of the logical
00350 *  value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
00351 */
00352    Alcmb  = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
00353                               ( Acol >= 0 ? npcol : 1 ) );
00354 /*
00355 *  When sub( B ) is not replicated and backward pass on sub( B ), find the
00356 *  virtual process p owning the last row or column of sub( B ).
00357 */
00358    if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !( Bfwd ) )
00359    {
00360       tmp = PB_Cindxg2p( BnR - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
00361       p   = MModSub( tmp, BrocR, BnprocsR );
00362    }
00363 /*
00364 *  Loop over the processes rows or columns owning the BnR rows or columns of
00365 *  sub( B ) to be processed.
00366 */
00367    n = BnR;
00368 
00369    while( n > 0 )
00370    {
00371 /*
00372 *  Find out who is the active process row or column as well as the number of
00373 *  rows or columns of sub( B ) it owns.
00374 */
00375       BcurrocR = ( BisR ? -1 : MModAdd( BrocR, p, BnprocsR ) );
00376       BnpR     = PB_Cnumroc( BnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
00377 
00378       n       -= BnpR;
00379 /*
00380 *  Re-adjust the number of rows or columns to be handled at each step, in order
00381 *  to average the message sizes and the computational granularity.
00382 */
00383       if( BnpR ) nbb = BnpR / ( ( BnpR - 1 ) / nb + 1 );
00384 
00385       while( BnpR )
00386       {
00387          nbb = MIN( nbb, BnpR );
00388 /*
00389 *  Describe the local contiguous panel of sub( B )
00390 */
00391          if( lside )
00392          {
00393             PB_Cdescset( DBUFB, BnD, nbb, Binb1D, nbb, BnbD, BnbR, BrocD,
00394                          BcurrocR, ctxt, Bld );
00395             if( BisR || ( BmyprocR == BcurrocR ) )
00396                Bptr = Mptr( B, BiiD, BiiR, Bld, size );
00397          }
00398          else
00399          {
00400             PB_Cdescset( DBUFB, nbb, BnD, nbb, Binb1D, BnbR, BnbD, BcurrocR,
00401                          BrocD, ctxt, Bld );
00402             if( BisR || ( BmyprocR == BcurrocR ) )
00403                Bptr = Mptr( B, BiiR, BiiD, Bld, size );
00404          }
00405 
00406          talph0 = talpha;
00407 
00408          if( LNorRT )
00409          {
00410 /*
00411 *  Reuse sub( B ) and/or create vector WBC in process column owning the first
00412 *  or last column of sub( A )
00413 */
00414             PB_CInOutV2( TYPE, &conjg, COLUMN, An, An, Astart, Ad0, nbb, Bptr,
00415                          0, 0, DBUFB, &Broc, &WBC, WBCd, &WBCfr, &WBCsum,
00416                          &WBCapbX );
00417 /*
00418 *  Create WBR in process rows spanned by sub( A )
00419 */
00420             PB_COutV( TYPE, ROW, INIT, An, An, Ad0, nbb, &WBR, WBRd, &WBRfr,
00421                       &WBRsum );
00422 /*
00423 *  Retrieve local quantities related to sub( A ) -> Ad0
00424 */
00425             Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
00426             Amb   = Ad0[MB_  ]; Anb   = Ad0[NB_  ];
00427             Arow  = Ad0[RSRC_]; Acol  = Ad0[CSRC_]; Ald = Ad0[LLD_];
00428 
00429             Anp   = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
00430             Anq   = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
00431             if( ( Anp > 0 ) && ( Anq > 0 ) )
00432                Aptr = Mptr( A, Aii, Ajj, Ald, size );
00433 
00434             WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
00435 
00436             if( upper )
00437             {
00438 /*
00439 *  sub( A ) is upper triangular
00440 */
00441                for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
00442                {
00443                   ktmp = An - k; kb = MIN( ktmp, Alcmb );
00444 /*
00445 *  Solve logical diagonal block, WBC contains the solution scattered in multiple
00446 *  process columns and WBR contains the solution replicated in the process rows.
00447 */
00448                   Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
00449                   Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
00450                   PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
00451                              talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
00452                              size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
00453                              WBRld );
00454 /*
00455 *  Update: only the part of sub( B ) to be solved at the next step is locally
00456 *  updated and combined, the remaining part of the matrix to be solved later
00457 *  is only locally updated.
00458 */
00459                   if( Akp > 0 )
00460                   {
00461                      Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
00462                      if( WBCsum )
00463                      {
00464                         kbprev = MIN( k, Alcmb );
00465                         ktmp   = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb,
00466                                              myrow, Arow, nprow );
00467                         Akp   -= ktmp;
00468 
00469                         if( ktmp > 0 )
00470                         {
00471                            if( Anq0 > 0 )
00472                               gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
00473                                     &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
00474                                     Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
00475                                     size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
00476                                     WBCld, size ), &WBCld );
00477                            Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol,
00478                                                npcol );
00479                            gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
00480                                    0, WBCld, size ), WBCld, myrow, Asrc );
00481                            if( mycol != Asrc )
00482                               pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
00483                                    &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
00484                                    WBCld, size ), &WBCld );
00485                         }
00486                         if( ( Akp > 0 ) && ( Anq0 > 0 ) )
00487                            gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
00488                                  &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
00489                                  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
00490                                  &WBRld, talph0, WBC, &WBCld );
00491                      }
00492                      else
00493                      {
00494                         if( Anq0 > 0 )
00495                            gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Akp,
00496                                  &nbb, &Anq0, negone, Mptr( Aptr, 0, Akq, Ald,
00497                                  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
00498                                  &WBRld, talph0, WBC, &WBCld );
00499                      }
00500                   }
00501                   talph0 = one;
00502                }
00503             }
00504             else
00505             {
00506 /*
00507 *  sub( A ) is lower triangular
00508 */
00509                for( k = 0; k < An; k += Alcmb )
00510                {
00511                   ktmp = An - k; kb = MIN( ktmp, Alcmb );
00512 /*
00513 *  Solve logical diagonal block, WBC contains the solution scattered in multiple
00514 *  process columns and WBR contains the solution replicated in the process rows.
00515 */
00516                   Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
00517                   Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
00518                   PB_Cptrsm( TYPE, WBRsum, SIDE, UPLO, TRANSA, DIAG, kb, nbb,
00519                              talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
00520                              size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
00521                              WBRld );
00522 /*
00523 *  Update: only the part of sub( B ) to be solved at the next step is locally
00524 *  updated and combined, the remaining part of the matrix to be solved later is
00525 *  only locally updated.
00526 */
00527                   Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
00528                   if( ( Anp0 = Anp - Akp ) > 0 )
00529                   {
00530                      Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
00531                      if( WBCsum )
00532                      {
00533                         kbnext = ktmp - kb;
00534                         kbnext = MIN( kbnext, Alcmb );
00535                         ktmp   = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow,
00536                                              Arow, nprow );
00537                         Anp0 -= ktmp;
00538 
00539                         if( ktmp > 0 )
00540                         {
00541                            if( Anq0 > 0 )
00542                               gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &ktmp,
00543                                     &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq,
00544                                     Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
00545                                     size ), &WBRld, talph0, Mptr( WBC, Akp, 0,
00546                                     WBCld, size ), &WBCld );
00547                            Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol,
00548                                                npcol );
00549                            gsum2d( ctxt, ROW, &top, ktmp, nbb, Mptr( WBC, Akp,
00550                                    0, WBCld, size ), WBCld, myrow, Asrc );
00551                            if( mycol != Asrc )
00552                               pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &ktmp,
00553                                    &nbb, &izero, zero, zero, Mptr( WBC, Akp, 0,
00554                                    WBCld, size ), &WBCld );
00555                         }
00556                         if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
00557                            gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
00558                                  &nbb, &Anq0, negone, Mptr( Aptr, Akp+ktmp, Akq,
00559                                  Ald, size ), &Ald, Mptr( WBR, 0, Akq, WBRld,
00560                                  size ), &WBRld, talph0, Mptr( WBC, Akp+ktmp, 0,
00561                                  WBCld, size ), &WBCld );
00562                      }
00563                      else
00564                      {
00565                         if( Anq0 > 0 )
00566                            gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRAN ), &Anp0,
00567                                  &nbb, &Anq0, negone, Mptr( Aptr, Akp, Akq, Ald,
00568                                  size ), &Ald, Mptr( WBR, 0, Akq, WBRld, size ),
00569                                  &WBRld, talph0, Mptr( WBC, Akp, 0, WBCld,
00570                                  size ), &WBCld );
00571                      }
00572                   }
00573                   talph0 = one;
00574                }
00575             }
00576 /*
00577 *  Combine the scattered resulting matrix WBC
00578 */
00579             if( WBCsum && ( Anp > 0 ) )
00580                gsum2d( ctxt, ROW, &top, Anp, nbb, WBC, WBCld, myrow,
00581                        WBCd[CSRC_] );
00582 /*
00583 *  sub( B ) := WBC (if necessary)
00584 */
00585             if( WBCapbX )
00586                PB_Cpaxpby( TYPE, &conjg, An, nbb, one, WBC, 0, 0, WBCd, COLUMN,
00587                            zero, Bptr, 0, 0, DBUFB, &Broc );
00588          }
00589          else
00590          {
00591 /*
00592 *  Reuse sub( B ) and/or create vector WBR in process row owning the first or
00593 *  last row of sub( A )
00594 */
00595             PB_CInOutV2( TYPE, &conjg, ROW, An, An, Astart, Ad0, nbb, Bptr,
00596                          0, 0, DBUFB, &Broc, &WBR, WBRd, &WBRfr, &WBRsum,
00597                          &WBRapbX );
00598 /*
00599 *  Create WBC in process columns spanned by sub( A )
00600 */
00601             PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WBC, WBCd, &WBCfr,
00602                       &WBCsum );
00603 /*
00604 *  Retrieve local quantities related to Ad0 -> sub( A )
00605 */
00606             Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
00607             Amb   = Ad0[MB_  ]; Anb   = Ad0[NB_  ];
00608             Arow  = Ad0[RSRC_]; Acol  = Ad0[CSRC_]; Ald = Ad0[LLD_];
00609 
00610             Anp   = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
00611             Anq   = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
00612             if( ( Anp > 0 ) && ( Anq > 0 ) )
00613                Aptr = Mptr( A, Aii, Ajj, Ald, size );
00614 
00615             WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
00616 
00617             if( upper )
00618             {
00619 /*
00620 *  sub( A ) is upper triangular
00621 */
00622                for( k = 0; k < An; k += Alcmb )
00623                {
00624                   ktmp = An - k; kb = MIN( ktmp, Alcmb );
00625 /*
00626 *  Solve logical diagonal block, WBR contains the solution scattered in multiple
00627 *  process rows and WBC contains the solution replicated in the process columns.
00628 */
00629                   Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
00630                   Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
00631                   PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
00632                              talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
00633                              size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
00634                              WBRld );
00635 /*
00636 *  Update: only the part of sub( B ) to be solved at the next step is locally
00637 *  updated and combined, the remaining part of the matrix to be solved later is
00638 *  only locally updated.
00639 */
00640                   Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
00641                   if( ( Anq0 = Anq - Akq ) > 0 )
00642                   {
00643                      Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
00644                      if( WBRsum )
00645                      {
00646                         kbnext = ktmp - kb;
00647                         kbnext = MIN( kbnext, Alcmb );
00648                         ktmp   = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol,
00649                                              Acol, npcol );
00650                         Anq0  -= ktmp;
00651 
00652                         if( ktmp > 0 )
00653                         {
00654                            if( Anp0 > 0 )
00655                               gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00656                                     &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
00657                                     WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
00658                                     Ald, size ), &Ald, talph0, Mptr( WBR, 0,
00659                                     Akq, WBRld, size ), &WBRld );
00660                            Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow,
00661                                                nprow );
00662                            gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
00663                                    Akq, WBRld, size ), WBRld, Asrc, mycol );
00664                            if( myrow != Asrc )
00665                               pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
00666                                    &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
00667                                    WBRld, size ), &WBRld );
00668                         }
00669                         if( ( Anp0 > 0 ) && ( Anq0 > 0 ) )
00670                            gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00671                                  &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
00672                                  size ), &WBCld, Mptr( Aptr, Akp, Akq+ktmp, Ald,
00673                                  size ), &Ald, talph0, Mptr( WBR, 0, Akq+ktmp,
00674                                  WBRld, size ), &WBRld );
00675                      }
00676                      else
00677                      {
00678                         if( Anp0 > 0 )
00679                            gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00680                                  &Anq0, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
00681                                  size ), &WBCld, Mptr( Aptr, Akp, Akq, Ald,
00682                                  size ), &Ald, talph0, Mptr( WBR, 0, Akq, WBRld,
00683                                  size ), &WBRld );
00684                      }
00685                   }
00686                   talph0 = one;
00687                }
00688             }
00689             else
00690             {
00691 /*
00692 *  sub( A ) is lower triangular
00693 */
00694                for( k = ( Astart / Alcmb ) * Alcmb; k >= 0; k -= Alcmb )
00695                {
00696                   ktmp = An - k; kb = MIN( ktmp, Alcmb );
00697 /*
00698 *  Solve logical diagonal block, WBR contains the solution scattered in multiple
00699 *  process rows and WBC contains the solution replicated in the process columns.
00700 */
00701                   Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
00702                   Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
00703                   PB_Cptrsm( TYPE, WBCsum, SIDE, UPLO, TRANSA, DIAG, nbb, kb,
00704                              talph0, Aptr, k, k, Ad0, Mptr( WBC, Akp, 0, WBCld,
00705                              size ), WBCld, Mptr( WBR, 0, Akq, WBRld, size ),
00706                              WBRld );
00707 /*
00708 *  Update: only the part of sub( B ) to be solved at the next step is locally
00709 *  updated and combined, the remaining part of the matrix to be solved later
00710 *  is only locally updated.
00711 */
00712                   if( Akq > 0 )
00713                   {
00714                      Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
00715                      if( WBRsum )
00716                      {
00717                         kbprev = MIN( k, Alcmb );
00718                         ktmp   = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb,
00719                                              mycol, Acol, npcol );
00720                         Akq   -= ktmp;
00721 
00722                         if( ktmp > 0 )
00723                         {
00724                            if( Anp0 > 0 )
00725                               gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00726                                     &ktmp, &Anp0, negone, Mptr( WBC, Akp, 0,
00727                                     WBCld, size ), &WBCld, Mptr( Aptr, Akp, Akq,
00728                                     Ald, size ), &Ald, talph0, Mptr( WBR, 0,
00729                                     Akq, WBRld, size ), &WBRld );
00730                            Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow,
00731                                                nprow );
00732                            gsum2d( ctxt, COLUMN, &top, nbb, ktmp, Mptr( WBR, 0,
00733                                    Akq, WBRld, size ), WBRld, Asrc, mycol );
00734                            if( myrow != Asrc )
00735                               pad( C2F_CHAR( ALL ), C2F_CHAR( NOCONJG ), &nbb,
00736                                    &ktmp, &izero, zero, zero, Mptr( WBR, 0, Akq,
00737                                    WBRld, size ), &WBRld );
00738                         }
00739                         if( ( Anp0 > 0 ) && ( Akq > 0 ) )
00740                            gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00741                                  &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
00742                                  size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
00743                                  size ), &Ald, talph0, WBR, &WBRld );
00744                      }
00745                      else
00746                      {
00747                         if( Anp0 > 0 )
00748                            gemm( C2F_CHAR( TRAN ), C2F_CHAR( NOTRAN ), &nbb,
00749                                  &Akq, &Anp0, negone, Mptr( WBC, Akp, 0, WBCld,
00750                                  size ), &WBCld, Mptr( Aptr, Akp, 0, Ald,
00751                                  size ), &Ald, talph0, WBR, &WBRld );
00752                      }
00753                   }
00754                   talph0 = one;
00755                }
00756             }
00757 /*
00758 *  Combine the scattered resulting matrix WBR
00759 */
00760             if( WBRsum && ( Anq > 0 ) )
00761                gsum2d( ctxt, COLUMN, &top, nbb, Anq, WBR, WBRld, WBRd[RSRC_],
00762                        mycol );
00763 /*
00764 *  sub( B ) := WBR (if necessary)
00765 */
00766             if( WBRapbX )
00767                PB_Cpaxpby( TYPE, &conjg, nbb, An, one, WBR, 0, 0, WBRd, ROW,
00768                            zero, Bptr, 0, 0, DBUFB, &Broc );
00769          }
00770 
00771          if( WBCfr ) free( WBC );
00772          if( WBRfr ) free( WBR );
00773 /*
00774 *  Go to the next contiguous panel if any residing in this process row or column
00775 */
00776          BnpR -= nbb;
00777 
00778          if( BisR || ( BmyprocR == BcurrocR ) ) BiiR += nbb;
00779       }
00780 /*
00781 *  Go to next or previous process row or column owning some of sub( B )
00782 */
00783       if( !( BisR ) )
00784          p = ( Bfwd ? MModAdd1( p, BnprocsR ) : MModSub1( p, BnprocsR ) );
00785    }
00786 
00787    if( TranOp == CCOTRAN ) free( talpha );
00788 /*
00789 *  End of PB_CptrsmB
00790 */
00791 }