ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpsymmBC.c
Go to the documentation of this file.
00001 /* ---------------------------------------------------------------------
00002 *
00003 *  -- PBLAS auxiliary routine (version 2.0) --
00004 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
00005 *     and University of California, Berkeley.
00006 *     April 1, 1998
00007 *
00008 *  ---------------------------------------------------------------------
00009 */
00010 /*
00011 *  Include files
00012 */
00013 #include "../pblas.h"
00014 #include "../PBpblas.h"
00015 #include "../PBtools.h"
00016 #include "../PBblacs.h"
00017 #include "../PBblas.h"
00018 
00019 #ifdef __STDC__
00020 void PB_CpsymmBC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
00021                   char * SIDE, char * UPLO, int M, int N, char * ALPHA,
00022                   char * A, int IA, int JA, int * DESCA, char * B,
00023                   int IB, int JB, int * DESCB, char * BETA, char * C,
00024                   int IC, int JC, int * DESCC )
00025 #else
00026 void PB_CpsymmBC( TYPE, DIRECAB, CONJUG, SIDE, UPLO, M, N, ALPHA, A, IA,
00027                   JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
00028 /*
00029 *  .. Scalar Arguments ..
00030 */
00031    char           * CONJUG, * DIRECAB, * SIDE, * UPLO;
00032    int            IA, IB, IC, JA, JB, JC, M, N;
00033    char           * ALPHA, * BETA;
00034    PBTYP_T        * TYPE;
00035 /*
00036 *  .. Array Arguments ..
00037 */
00038    int            * DESCA, * DESCB, * DESCC;
00039    char           * A, * B, * C;
00040 #endif
00041 {
00042 /*
00043 *  Purpose
00044 *  =======
00045 *
00046 *  PB_CpsymmBC  performs one of the matrix-matrix operations
00047 *
00048 *     sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
00049 *
00050 *  or
00051 *
00052 *     sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ),
00053 *
00054 *  where
00055 *
00056 *     sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),
00057 *
00058 *     sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
00059 *                      A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R', and,
00060 *
00061 *     sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
00062 *
00063 *  Alpha  and  beta  are scalars,  sub( A )  is a symmetric or Hermitian
00064 *  submatrix and sub( B ) and sub( C ) are m by n submatrices.
00065 *
00066 *  This is the inner-product algorithm  using  the  logical  LCM  hybrid
00067 *  and static blocking techniques. The submatrix operand  sub( A ) stays
00068 *  in place.
00069 *
00070 *  Notes
00071 *  =====
00072 *
00073 *  A description  vector  is associated with each 2D block-cyclicly dis-
00074 *  tributed matrix.  This  vector  stores  the  information  required to
00075 *  establish the  mapping  between a  matrix entry and its corresponding
00076 *  process and memory location.
00077 *
00078 *  In  the  following  comments,   the character _  should  be  read  as
00079 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
00080 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
00081 *
00082 *  NOTATION         STORED IN       EXPLANATION
00083 *  ---------------- --------------- ------------------------------------
00084 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
00085 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
00086 *                                   the NPROW x NPCOL BLACS process grid
00087 *                                   A  is  distributed over. The context
00088 *                                   itself  is  global,  but  the handle
00089 *                                   (the integer value) may vary.
00090 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
00091 *                                   ted matrix A, M_A >= 0.
00092 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
00093 *                                   buted matrix A, N_A >= 0.
00094 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
00095 *                                   block of the matrix A, IMB_A > 0.
00096 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
00097 *                                   left   block   of   the  matrix   A,
00098 *                                   INB_A > 0.
00099 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
00100 *                                   bute the last  M_A-IMB_A  rows of A,
00101 *                                   MB_A > 0.
00102 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
00103 *                                   bute the last  N_A-INB_A  columns of
00104 *                                   A, NB_A > 0.
00105 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
00106 *                                   row of the matrix  A is distributed,
00107 *                                   NPROW > RSRC_A >= 0.
00108 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
00109 *                                   first column of  A  is  distributed.
00110 *                                   NPCOL > CSRC_A >= 0.
00111 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
00112 *                                   array  storing  the  local blocks of
00113 *                                   the distributed matrix A,
00114 *                                   IF( Lc( 1, N_A ) > 0 )
00115 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
00116 *                                   ELSE
00117 *                                      LLD_A >= 1.
00118 *
00119 *  Let K be the number of  rows of a matrix A starting at the global in-
00120 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
00121 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
00122 *  receive if these K rows were distributed over NPROW processes.  If  K
00123 *  is the number of columns of a matrix  A  starting at the global index
00124 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
00125 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
00126 *  these K columns were distributed over NPCOL processes.
00127 *
00128 *  The values of Lr() and Lc() may be determined via a call to the func-
00129 *  tion PB_Cnumroc:
00130 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
00131 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
00132 *
00133 *  Arguments
00134 *  =========
00135 *
00136 *  TYPE    (local input) pointer to a PBTYP_T structure
00137 *          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
00138 *          that contains type information (See pblas.h).
00139 *
00140 *  DIRECAB (global input) pointer to CHAR
00141 *          On entry, DIRECAB specifies  the direction in which the  rows
00142 *          or columns of sub( B ) should be looped over as follows:
00143 *             DIRECAB = 'F' or 'f'   forward  or increasing,
00144 *             DIRECAB = 'B' or 'b'   backward or decreasing.
00145 *
00146 *  CONJUG  (global input) pointer to CHAR
00147 *          On entry, CONJUG specifies whether sub( A ) is a symmetric or
00148 *          Hermitian submatrix operand as follows:
00149 *             CONJUG = 'N' or 'n'    sub( A ) is symmetric,
00150 *             CONJUG = 'Z' or 'z'    sub( A ) is Hermitian.
00151 *
00152 *  SIDE    (global input) pointer to CHAR
00153 *          On entry, SIDE  specifies  whether the symmetric or Hermitian
00154 *          submatrix sub( A ) appears on the left or right in the opera-
00155 *          tion as follows:
00156 *
00157 *             SIDE = 'L' or 'l'
00158 *                   sub( C ) := alpha*sub( A )*sub( B ) + beta*sub( C ),
00159 *
00160 *             SIDE = 'R' or 'r'
00161 *                   sub( C ) := alpha*sub( B )*sub( A ) + beta*sub( C ).
00162 *
00163 *  UPLO    (global input) pointer to CHAR
00164 *          On  entry,   UPLO  specifies  whether  the  local  pieces  of
00165 *          the array  A  containing the  upper or lower triangular  part
00166 *          of the submatrix  sub( A )  are to be referenced as follows:
00167 *             UPLO = 'U' or 'u'   Only the local pieces corresponding to
00168 *                                 the   upper  triangular  part  of  the
00169 *                                 submatrix sub( A ) are referenced,
00170 *             UPLO = 'L' or 'l'   Only the local pieces corresponding to
00171 *                                 the   lower  triangular  part  of  the
00172 *                                 submatrix sub( A ) are referenced.
00173 *
00174 *  M       (global input) INTEGER
00175 *          On entry,  M  specifies the number of rows of  the  submatrix
00176 *          sub( C ). M  must be at least zero.
00177 *
00178 *  N       (global input) INTEGER
00179 *          On entry, N  specifies the number of columns of the submatrix
00180 *          sub( C ). N  must be at least zero.
00181 *
00182 *  ALPHA   (global input) pointer to CHAR
00183 *          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
00184 *          supplied  as zero then the local entries of the arrays  A and
00185 *          B corresponding to the entries of  the  submatrices  sub( A )
00186 *          and sub( B ) respectively need not be set on input.
00187 *
00188 *  A       (local input) pointer to CHAR
00189 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
00190 *          at least  Lc( 1, JA+M-1 )  when  SIDE = 'L' or 'l'  and is at
00191 *          at least Lc( 1, JA+N-1 ) otherwise. Before  entry, this array
00192 *          contains the local entries of the matrix A.
00193 *          Before  entry  with  SIDE = 'L' or 'l', this  array  contains
00194 *          the local entries corresponding to the entries of the  m by m
00195 *          symmetric or Hermitian submatrix  sub( A ),  such  that  when
00196 *          UPLO = 'U' or 'u', this  array contains the local entries  of
00197 *          the upper triangular part of the submatrix  sub( A ), and the
00198 *          local entries  of  the strictly lower triangular of  sub( A )
00199 *          are not referenced, and when  UPLO = 'L' or 'l',  this  array
00200 *          contains  the local entries of the  lower triangular part  of
00201 *          the symmetric or Hermitian submatrix sub( A ), and  the local
00202 *          entries of the strictly upper triangular of sub( A ) are  not
00203 *          referenced.
00204 *          Before  entry  with  SIDE = 'R' or 'r', this  array  contains
00205 *          the local entries corresponding to the entries of the  n by n
00206 *          symmetric or Hermitian submatrix  sub( A ),  such  that  when
00207 *          UPLO = 'U' or 'u', this  array contains the local entries  of
00208 *          the upper triangular part of the submatrix sub( A ), and  the
00209 *          local entries  of  the strictly lower triangular of  sub( A )
00210 *          are not referenced, and when  UPLO = 'L' or 'l',  this  array
00211 *          contains  the local entries of the  lower triangular part  of
00212 *          the  symmetric or Hermitian submatrix sub( A ), and the local
00213 *          entries of the strictly upper triangular of sub( A ) are  not
00214 *          referenced.
00215 *          Note that the  imaginary parts  of the local entries  corres-
00216 *          ponding to the  diagonal elements  of  sub( A )  need not  be
00217 *          set and assumed to be zero.
00218 *
00219 *  IA      (global input) INTEGER
00220 *          On entry, IA  specifies A's global row index, which points to
00221 *          the beginning of the submatrix sub( A ).
00222 *
00223 *  JA      (global input) INTEGER
00224 *          On entry, JA  specifies A's global column index, which points
00225 *          to the beginning of the submatrix sub( A ).
00226 *
00227 *  DESCA   (global and local input) INTEGER array
00228 *          On entry, DESCA  is an integer array of dimension DLEN_. This
00229 *          is the array descriptor for the matrix A.
00230 *
00231 *  B       (local input) pointer to CHAR
00232 *          On entry, B is an array of dimension (LLD_B, Kb), where Kb is
00233 *          at least Lc( 1, JB+N-1 ).  Before  entry, this array contains
00234 *          the local entries of the matrix B.
00235 *
00236 *  IB      (global input) INTEGER
00237 *          On entry, IB  specifies B's global row index, which points to
00238 *          the beginning of the submatrix sub( B ).
00239 *
00240 *  JB      (global input) INTEGER
00241 *          On entry, JB  specifies B's global column index, which points
00242 *          to the beginning of the submatrix sub( B ).
00243 *
00244 *  DESCB   (global and local input) INTEGER array
00245 *          On entry, DESCB  is an integer array of dimension DLEN_. This
00246 *          is the array descriptor for the matrix B.
00247 *
00248 *  BETA    (global input) pointer to CHAR
00249 *          On entry,  BETA  specifies the scalar  beta.   When  BETA  is
00250 *          supplied  as  zero  then  the  local entries of  the array  C
00251 *          corresponding to  the  entries of the submatrix sub( C ) need
00252 *          not be set on input.
00253 *
00254 *  C       (local input/local output) pointer to CHAR
00255 *          On entry, C is an array of dimension (LLD_C, Kc), where Kc is
00256 *          at least Lc( 1, JC+N-1 ).  Before  entry, this array contains
00257 *          the local entries of the matrix C.
00258 *          On exit, the entries of this array corresponding to the local
00259 *          entries  of the submatrix  sub( C )  are  overwritten by  the
00260 *          local entries of the m by n updated submatrix.
00261 *
00262 *  IC      (global input) INTEGER
00263 *          On entry, IC  specifies C's global row index, which points to
00264 *          the beginning of the submatrix sub( C ).
00265 *
00266 *  JC      (global input) INTEGER
00267 *          On entry, JC  specifies C's global column index, which points
00268 *          to the beginning of the submatrix sub( C ).
00269 *
00270 *  DESCC   (global and local input) INTEGER array
00271 *          On entry, DESCC  is an integer array of dimension DLEN_. This
00272 *          is the array descriptor for the matrix C.
00273 *
00274 *  -- Written on April 1, 1998 by
00275 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
00276 *
00277 *  ---------------------------------------------------------------------
00278 */
00279 /*
00280 *  .. Local Scalars ..
00281 */
00282    char           GemmTa, GemmTb, cctop, * one, rctop, * talphaCR, * talphaRC,
00283                   * tbeta, * zero;
00284    int            Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq,
00285                   Alq0, Amb, Amp, An, Anb, Anq, Arow, BCfwd, BCmyprocD,
00286                   BCmyprocR, BCnD, BCnR, BCnprocsD, BCnprocsR, Bbufld, BcurrocR,
00287                   Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR,
00288                   Bkk, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR,
00289                   Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, Ckk,
00290                   CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, Cbufld,
00291                   CcurrocR, CisR, Cld, WBCfr, WBCld, WBRfr, WBRld, WCCfr, WCCld,
00292                   WCCsum, WCRfr, WCRld, WCRsum, conjg, ctxt, l, lb, lcmb, lside,
00293                   ltmp, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq,
00294                   npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp, upper;
00295    TZSYM_T        tzsymm;
00296    GEMM_T         gemm;
00297    GSUM2D_T       gsum2d;
00298 /*
00299 *  .. Local Arrays ..
00300 */
00301    PB_VM_T        VM;
00302    int            Ad0 [DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBCd[DLEN_],
00303                   WBRd[DLEN_], WCCd [DLEN_], WCRd [DLEN_];
00304    char           * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WBC  = NULL,
00305                   * WBR  = NULL, * WCC  = NULL, * WCR  = NULL;
00306 /* ..
00307 *  .. Executable Statements ..
00308 *
00309 */
00310    Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
00311 
00312    BCfwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
00313    conjg = ( Mupcase( CONJUG [0] ) ==   CCONJG );
00314    lside = ( Mupcase( SIDE   [0] ) ==    CLEFT );
00315    upper = ( Mupcase( UPLO   [0] ) ==   CUPPER );
00316 
00317    size  = TYPE->size;  one    = TYPE->one;     zero  = TYPE->zero;
00318    gemm  = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
00319    nb    = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
00320 /*
00321 *  Compute local information for sub( A ), sub( B ) and sub( C )
00322 */
00323    if( lside )
00324    {
00325       BCnD = An = M;            BCnR      = N;
00326       BCmyprocD = myrow;        BCnprocsD = nprow;
00327       BCmyprocR = mycol;        BCnprocsR = npcol;
00328 
00329       BiD       = IB;           BiR       = JB;
00330       BinbD     = DESCB[IMB_ ]; BinbR     = DESCB[INB_];
00331       BnbD      = DESCB[MB_  ]; BnbR      = DESCB[NB_ ];
00332       BsrcR     = DESCB[CSRC_]; Bld       = DESCB[LLD_];
00333       PB_Cinfog2l( IB, JB, DESCB, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
00334                    &BiiD, &BiiR, &BrocD, &BrocR );
00335 
00336       CiD       = IC;           CiR       = JC;
00337       CinbD     = DESCC[IMB_ ]; CinbR     = DESCC[INB_];
00338       CnbD      = DESCC[MB_  ]; CnbR      = DESCC[NB_ ];
00339       CsrcR     = DESCC[CSRC_]; Cld       = DESCC[LLD_];
00340       PB_Cinfog2l( IC, JC, DESCC, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
00341                    &CiiD, &CiiR, &CrocD, &CrocR );
00342    }
00343    else
00344    {
00345       BCnD = An = N;            BCnR      = M;
00346       BCmyprocD = mycol;        BCnprocsD = npcol;
00347       BCmyprocR = myrow;        BCnprocsR = nprow;
00348 
00349       BiD       = JB;           BiR       = IB;
00350       BinbR     = DESCB[IMB_ ]; BinbD     = DESCB[INB_];
00351       BnbR      = DESCB[MB_  ]; BnbD      = DESCB[NB_ ];
00352       BsrcR     = DESCB[RSRC_]; Bld       = DESCB[LLD_];
00353       PB_Cinfog2l( IB, JB, DESCB, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
00354                    &BiiR, &BiiD, &BrocR, &BrocD );
00355 
00356       CiD       = JC;           CiR       = IC;
00357       CinbR     = DESCC[IMB_ ]; CinbD     = DESCC[INB_];
00358       CnbR      = DESCC[MB_  ]; CnbD      = DESCC[NB_ ];
00359       CsrcR     = DESCC[RSRC_]; Cld       = DESCC[LLD_];
00360       PB_Cinfog2l( IC, JC, DESCC, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
00361                    &CiiR, &CiiD, &CrocR, &CrocD );
00362    }
00363 
00364    Binb1D = PB_Cfirstnb( BCnD, BiD, BinbD, BnbD );
00365    BnpD   = PB_Cnumroc( BCnD, 0, Binb1D, BnbD, BCmyprocD, BrocD, BCnprocsD );
00366    Binb1R = PB_Cfirstnb( BCnR, BiR, BinbR, BnbR );
00367    BisR   = ( ( BsrcR < 0 ) || ( BCnprocsR == 1 ) );
00368 
00369    Cinb1D = PB_Cfirstnb( BCnD, CiD, CinbD, CnbD );
00370    CnpD   = PB_Cnumroc( BCnD, 0, Cinb1D, CnbD, BCmyprocD, CrocD, BCnprocsD );
00371    Cinb1R = PB_Cfirstnb( BCnR, CiR, CinbR, CnbR );
00372    CisR   = ( ( CsrcR < 0 ) || ( BCnprocsR == 1 ) );
00373 /*
00374 *  Compute descriptor Ad0 for sub( A )
00375 */
00376    PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
00377                  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
00378 
00379    Amp = PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
00380    Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
00381    if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr = Mptr( A, Aii, Ajj, Ald, size );
00382 /*
00383 *  Retrieve the BLACS combine topologies, compute conjugate of alpha for the
00384 *  Hermitian case and set the transpose parameters to be passed to the BLAS
00385 *  matrix multiply routine.
00386 */
00387    rctop = *PB_Ctop( &ctxt, COMBINE, ROW,    TOP_GET );
00388    cctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
00389 
00390    if( conjg )
00391    {
00392       tzsymm = PB_Ctzhemm;
00393       if( lside )
00394       {
00395          talphaRC = ALPHA; GemmTa = CCOTRAN; GemmTb = CTRAN;
00396          talphaCR = PB_Cmalloc( size );
00397          PB_Cconjg( TYPE, ALPHA, talphaCR );
00398       }
00399       else
00400       {
00401          talphaCR = ALPHA; GemmTa = CTRAN; GemmTb = CCOTRAN;
00402          talphaRC = PB_Cmalloc( size );
00403          PB_Cconjg( TYPE, ALPHA, talphaRC );
00404       }
00405    }
00406    else
00407    {
00408       tzsymm = PB_Ctzsymm; talphaCR = talphaRC = ALPHA;
00409       GemmTa = CTRAN; GemmTb = CTRAN;
00410    }
00411 /*
00412 *  Computational partitioning size is computed as the product of the logical
00413 *  value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
00414 */
00415    Alcmb  = 2 * nb * PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
00416                               ( Acol >= 0 ? npcol : 1 ) );
00417 /*
00418 *  When sub( B ) is not replicated and backward pass on sub( B ), find the
00419 *  virtual process q owning the last row or column of sub( B ).
00420 */
00421    if( !( BisR ) && !( BCfwd ) )
00422    {
00423       tmp = PB_Cindxg2p( BCnR - 1, Binb1R, BnbR, BrocR, BrocR, BCnprocsR );
00424       q   = MModSub( tmp, BrocR, BCnprocsR );
00425    }
00426 /*
00427 *  When sub( C ) is not replicated and backward pass on sub( C ), find the
00428 *  virtual process p owning the last row or column of sub( C ).
00429 */
00430    if( !( CisR ) && !( BCfwd ) )
00431    {
00432       tmp = PB_Cindxg2p( BCnR - 1, Cinb1R, CnbR, CrocR, CrocR, BCnprocsR );
00433       p   = MModSub( tmp, CrocR, BCnprocsR );
00434    }
00435 /*
00436 *  Loop over the virtual process grid induced by the rows or columns of
00437 *  sub( B ) and sub( C ).
00438 */
00439    lcmb   = PB_Clcm( ( maxp = ( CisR ? 1 : BCnprocsR ) ) * CnbR,
00440                      ( maxq = ( BisR ? 1 : BCnprocsR ) ) * BnbR );
00441    n      = BCnR;
00442    maxpm1 = maxp - 1;
00443 
00444    while( n > 0 )
00445    {
00446 /*
00447 *  Initialize local virtual matrix in process (p,q)
00448 */
00449       BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BCnprocsR ) );
00450       Bkk  = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BCnprocsR );
00451       BnpR = PB_Cnumroc( BCnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BCnprocsR );
00452 
00453       CcurrocR = ( CisR ? -1 : MModAdd( CrocR, p, BCnprocsR ) );
00454       Ckk  = PB_Cg2lrem( CiR, CinbR, CnbR, CcurrocR, CsrcR, BCnprocsR );
00455       CnpR = PB_Cnumroc( BCnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, BCnprocsR );
00456 
00457       PB_CVMinit( &VM, 0, CnpR, BnpR, Cinb1R, Binb1R, CnbR, BnbR, p, q,
00458                   maxp, maxq, lcmb );
00459 /*
00460 *  Find how many diagonals in this virtual process
00461 */
00462       npq = PB_CVMnpq( &VM );
00463 
00464       n -= npq;
00465 /*
00466 *  Re-adjust the number of rows or columns to be (un)packed, in order to
00467 *  average the message sizes.
00468 */
00469       if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
00470 
00471       while( npq )
00472       {
00473          nbb = MIN( nbb, npq );
00474 /*
00475 *  Find out how many rows or columns of sub( B ) and sub( C ) are contiguous
00476 */
00477          PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff );
00478 
00479          if( lside )
00480          {
00481 /*
00482 *  Compute the descriptor DBUFB for the buffer that will contained the packed
00483 *  columns of sub( B ).
00484 */
00485             if( ( Bfr = ( ncpq < nbb ) ) != 0 )
00486             {
00487 /*
00488 *  If columns of sub( B ) are not contiguous, then allocate the buffer and
00489 *  pack the kbb columns of sub( B ).
00490 */
00491                Bbufld = MAX( 1, BnpD );
00492                if( BisR || ( BCmyprocR == BcurrocR ) )
00493                {
00494                   Bbuf = PB_Cmalloc( BnpD * nbb * size );
00495                   PB_CVMpack( TYPE, &VM, COLUMN, COLUMN, PACKING, NOTRAN, nbb,
00496                               BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld,
00497                               zero, Bbuf, Bbufld );
00498                }
00499             }
00500             else
00501             {
00502 /*
00503 *  Otherwise, re-use sub( B ) directly.
00504 */
00505                Bbufld = Bld;
00506                if( BisR || ( BCmyprocR == BcurrocR ) )
00507                   Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size );
00508             }
00509             PB_Cdescset( DBUFB, BCnD, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
00510                          BcurrocR, ctxt, Bbufld );
00511 /*
00512 *  Replicate this panel of columns of sub( B ) as well as its transposed
00513 *  over sub( A ) -> WBC, WBR
00514 */
00515             PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, Bbuf, 0, 0,
00516                      DBUFB, COLUMN, &WBC, WBCd, &WBCfr );
00517             PB_CInV( TYPE, NOCONJG, ROW,    An, An, Ad0, nbb, WBC,  0, 0,
00518                      WBCd,  COLUMN, &WBR, WBRd, &WBRfr );
00519          }
00520          else
00521          {
00522 /*
00523 *  Compute the descriptor DBUFB for the buffer that will contained the packed
00524 *  rows of sub( B ).
00525 */
00526             if( ( Bfr = ( ncpq < nbb ) ) != 0 )
00527             {
00528 /*
00529 *  If rows of sub( B ) are not contiguous, then allocate the buffer and pack
00530 *  the kbb rows of sub( B ).
00531 */
00532                Bbufld = nbb;
00533                if( BisR || ( BCmyprocR == BcurrocR ) )
00534                {
00535                   Bbuf = PB_Cmalloc( BnpD * nbb * size );
00536                   PB_CVMpack( TYPE, &VM, COLUMN, ROW,    PACKING, NOTRAN, nbb,
00537                               BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld,
00538                               zero, Bbuf, Bbufld );
00539                }
00540             }
00541             else
00542             {
00543 /*
00544 *  Otherwise, re-use sub( B ) directly.
00545 */
00546                Bbufld = Bld;
00547                if( BisR || ( BCmyprocR == BcurrocR ) )
00548                   Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size );
00549             }
00550             PB_Cdescset( DBUFB, nbb, BCnD, nbb, Binb1D, nbb, BnbD, BcurrocR,
00551                          BrocD, ctxt, Bbufld );
00552 /*
00553 *  Replicate this panel of rows of sub( B ) as well as its transposed
00554 *  over sub( A ) -> WBR, WBC
00555 */
00556             PB_CInV( TYPE, NOCONJG, ROW,    An, An, Ad0, nbb, Bbuf, 0, 0,
00557                      DBUFB, ROW,    &WBR, WBRd, &WBRfr );
00558             PB_CInV( TYPE, NOCONJG, COLUMN, An, An, Ad0, nbb, WBR,  0, 0,
00559                      WBRd,  ROW,    &WBC, WBCd, &WBCfr );
00560          }
00561 /*
00562 *  Allocate space for temporary results in scope of sub( A ) -> WCC, WCR
00563 */
00564          PB_COutV( TYPE, COLUMN, INIT, An, An, Ad0, nbb, &WCC, WCCd, &WCCfr,
00565                    &WCCsum );
00566          PB_COutV( TYPE, ROW,    INIT, An, An, Ad0, nbb, &WCR, WCRd, &WCRfr,
00567                    &WCRsum );
00568 /*
00569 *  Local matrix-matrix multiply iff I own some data
00570 */
00571          WBCld = WBCd[LLD_]; WBRld = WBRd[LLD_];
00572          WCCld = WCCd[LLD_]; WCRld = WCRd[LLD_];
00573 
00574          if( ( Amp > 0 ) && ( Anq > 0 ) )
00575          {
00576             if( upper )
00577             {
00578 /*
00579 *  sub( A ) is upper triangular
00580 */
00581                for( l = 0; l < An; l += Alcmb )
00582                {
00583                   lb   = An - l; lb = MIN( lb, Alcmb );
00584                   Alp  = PB_Cnumroc( l,  0, Aimb1, Amb, myrow, Arow, nprow );
00585                   Alq  = PB_Cnumroc( l,  0, Ainb1, Anb, mycol, Acol, npcol );
00586                   Alq0 = PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
00587                   if( Alp > 0 && Alq0 > 0 )
00588                   {
00589                      gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp, &nbb,
00590                            &Alq0, talphaRC, Mptr( Aptr, 0, Alq, Ald, size ),
00591                            &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
00592                            WCC, &WCCld );
00593                      gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
00594                            &Alp, talphaCR, WBC, &WBCld, Mptr( Aptr, 0, Alq, Ald,
00595                            size ), &Ald, one, Mptr( WCR, 0, Alq, WCRld, size ),
00596                            &WCRld );
00597                   }
00598                   PB_Cpsym( TYPE, TYPE, SIDE, UPPER, lb, nbb, ALPHA, Aptr, l, l,
00599                             Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
00600                             Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
00601                             Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
00602                             WCRld, size ), WCRld, tzsymm );
00603                }
00604             }
00605             else
00606             {
00607 /*
00608 *  sub( A ) is lower triangular
00609 */
00610                for( l = 0; l < An; l += Alcmb )
00611                {
00612                   lb  = An - l; ltmp = l + ( lb = MIN( lb, Alcmb ) );
00613                   Alp = PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
00614                   Alq = PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
00615                   PB_Cpsym( TYPE, TYPE, SIDE, LOWER, lb, nbb, ALPHA, Aptr, l, l,
00616                             Ad0, Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
00617                             Mptr( WBR, 0, Alq, WBRld, size ), WBRld, Mptr( WCC,
00618                             Alp, 0, WCCld, size ), WCCld, Mptr( WCR, 0, Alq,
00619                             WCRld, size ), WCRld, tzsymm );
00620                   Alp  = PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow );
00621                   Alp0 = Amp - Alp;
00622                   Alq0 = PB_Cnumroc( lb,   l, Ainb1, Anb, mycol, Acol, npcol );
00623                   if( Alp0 > 0 && Alq0 > 0 )
00624                   {
00625                      gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &GemmTb ), &Alp0, &nbb,
00626                            &Alq0, talphaRC, Mptr( Aptr, Alp, Alq, Ald, size ),
00627                            &Ald, Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
00628                            Mptr( WCC, Alp, 0, WCCld, size ), &WCCld );
00629                      gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( NOTRAN ), &nbb, &Alq0,
00630                            &Alp0, talphaCR, Mptr( WBC, Alp, 0, WBCld, size ),
00631                            &WBCld, Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one,
00632                            Mptr( WCR, 0, Alq, WCRld, size ), &WCRld );
00633                   }
00634                }
00635             }
00636          }
00637          if( WBCfr ) free( WBC );
00638          if( WBRfr ) free( WBR );
00639 
00640          if( Bfr && ( BisR || ( BCmyprocR == BcurrocR ) ) )
00641             if( Bbuf ) free( Bbuf );
00642 
00643          if( lside )
00644          {
00645 /*
00646 *  Accumulate the intermediate results in WCC and WCR
00647 */
00648             if( WCCsum )
00649             {
00650                WCCd[CSRC_] = CcurrocR;
00651                if( Amp > 0 )
00652                   gsum2d( ctxt, ROW,    &rctop, Amp, nbb, WCC, WCCld, myrow,
00653                           WCCd[CSRC_] );
00654             }
00655 
00656             if( WCRsum )
00657             {
00658                WCRd[RSRC_] = 0;
00659                if( Anq > 0 )
00660                   gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
00661                           WCRd[RSRC_], mycol );
00662             }
00663 /*
00664 *  WCC := WCC + WCR'
00665 */
00666             PB_Cpaxpby( TYPE, CONJUG, nbb, An, one, WCR, 0, 0, WCRd, ROW, one,
00667                         WCC, 0, 0, WCCd, COLUMN );
00668             if( WCRfr ) free( WCR );
00669 /*
00670 *  Compute the descriptor DBUFC for the buffer that will contained the packed
00671 *  columns of sub( C ). Allocate it.
00672 */
00673             if( ( Cfr = ( nrpq < nbb ) ) != 0 )
00674             {
00675 /*
00676 *  If columns of sub( C ) are not contiguous, then allocate the buffer
00677 */
00678                Cbufld = MAX( 1, CnpD );  tbeta = zero;
00679                if( CisR || ( BCmyprocR == CcurrocR ) )
00680                   Cbuf = PB_Cmalloc( CnpD * nbb * size );
00681             }
00682             else
00683             {
00684 /*
00685 *  Otherwise re-use sub( C )
00686 */
00687                Cbufld = Cld;             tbeta = BETA;
00688                if( CisR || ( BCmyprocR == CcurrocR ) )
00689                   Cbuf = Mptr( C, CiiD, Ckk+Coff, Cld, size );
00690             }
00691             PB_Cdescset( DBUFC, BCnD, nbb, Cinb1D, nbb, CnbD, nbb, CrocD,
00692                          CcurrocR, ctxt, Cbufld );
00693 /*
00694 *  sub( C ) := beta * sub( C ) + WCC
00695 */
00696             PB_Cpaxpby( TYPE, NOCONJG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
00697                         tbeta, Cbuf, 0, 0, DBUFC, COLUMN );
00698             if( WCCfr ) free( WCC );
00699 /*
00700 *  Unpack the kbb columns of sub( C ) and release the buffer containing them.
00701 */
00702             if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
00703             {
00704                PB_CVMpack( TYPE, &VM, ROW,   COLUMN, UNPACKING, NOTRAN, nbb,
00705                            CnpD, BETA, Mptr( C, CiiD, Ckk, Cld, size ), Cld,
00706                            one, Cbuf, Cbufld );
00707                if( Cbuf ) free( Cbuf );
00708             }
00709          }
00710          else
00711          {
00712 /*
00713 *  Accumulate the intermediate results in WCC and WCR
00714 */
00715             if( WCCsum )
00716             {
00717                WCCd[CSRC_] = 0;
00718                if( Amp > 0 )
00719                   gsum2d( ctxt, ROW,    &rctop, Amp, nbb, WCC, WCCld, myrow,
00720                           WCCd[CSRC_] );
00721             }
00722 
00723             if( WCRsum )
00724             {
00725                WCRd[RSRC_] = CcurrocR;
00726                if( Anq > 0 )
00727                   gsum2d( ctxt, COLUMN, &cctop, nbb, Anq, WCR, WCRld,
00728                           WCRd[RSRC_], mycol );
00729             }
00730 /*
00731 *  WCR := WCR + WCC'
00732 */
00733             PB_Cpaxpby( TYPE, CONJUG, An, nbb, one, WCC, 0, 0, WCCd, COLUMN,
00734                         one, WCR, 0, 0, WCRd, ROW );
00735             if( WCCfr ) free( WCC );
00736 /*
00737 *  Compute the descriptor DBUFC for the buffer that will contained the packed
00738 *  rows of sub( C ). Allocate it.
00739 */
00740             if( ( Cfr = ( nrpq < nbb ) ) != 0 )
00741             {
00742 /*
00743 *  If rows of sub( C ) are not contiguous, then allocate receiving buffer.
00744 */
00745                Cbufld = nbb; tbeta = zero;
00746                if( CisR || ( BCmyprocR == CcurrocR ) )
00747                   Cbuf = PB_Cmalloc( CnpD * nbb * size );
00748             }
00749             else
00750             {
00751 /*
00752 *  Otherwise re-use sub( C )
00753 */
00754                Cbufld = Cld; tbeta = BETA;
00755                if( CisR || ( BCmyprocR == CcurrocR ) )
00756                   Cbuf = Mptr( C, Ckk+Coff, CiiD, Cld, size );
00757             }
00758             PB_Cdescset( DBUFC, nbb, BCnD, nbb, Cinb1D, nbb, CnbD, CcurrocR,
00759                          CrocD, ctxt, Cbufld );
00760 /*
00761 *  sub( C ) := beta * sub( C ) + WCR
00762 */
00763             PB_Cpaxpby( TYPE, NOCONJG, nbb, An, one, WCR, 0, 0, WCRd, ROW,
00764                         tbeta, Cbuf, 0, 0, DBUFC, ROW );
00765 
00766             if( WCRfr ) free( WCR );
00767 /*
00768 *  Unpack the kbb rows of sub( C ) and release the buffer containing them.
00769 */
00770             if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
00771             {
00772                PB_CVMpack( TYPE, &VM, ROW,   ROW,    UNPACKING, NOTRAN, nbb,
00773                            CnpD, BETA, Mptr( C, Ckk, CiiD, Cld, size ), Cld,
00774                            one, Cbuf, Cbufld );
00775                if( Cbuf ) free( Cbuf );
00776             }
00777          }
00778 /*
00779 *  Update the local indexes of sub( B ) and sub( C )
00780 */
00781          PB_CVMupdate( &VM, nbb, &Ckk, &Bkk );
00782 
00783          npq -= nbb;
00784       }
00785 /*
00786 *  Go to next or previous virtual process row or column
00787 */
00788       if( ( BCfwd      && ( p == maxpm1 ) ) ||
00789           ( !( BCfwd ) && ( p == 0      ) ) )
00790          q = ( BCfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
00791       p = ( BCfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
00792    }
00793 
00794    if( conjg ) free( ( lside ? talphaCR : talphaRC ) );
00795 /*
00796 *  End of PB_CpsymmBC
00797 */
00798 }