ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpsyrkA.c
Go to the documentation of this file.
00001 /* ---------------------------------------------------------------------
00002 *
00003 *  -- PBLAS auxiliary routine (version 2.0) --
00004 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
00005 *     and University of California, Berkeley.
00006 *     April 1, 1998
00007 *
00008 *  ---------------------------------------------------------------------
00009 */
00010 /*
00011 *  Include files
00012 */
00013 #include "../pblas.h"
00014 #include "../PBpblas.h"
00015 #include "../PBtools.h"
00016 #include "../PBblacs.h"
00017 #include "../PBblas.h"
00018 
00019 #ifdef __STDC__
00020 void PB_CpsyrkA( PBTYP_T * TYPE, char * DIRECA, char * CONJUG,
00021                  char * UPLO, char * TRANS, int N, int K, char * ALPHA,
00022                  char * A, int IA, int JA, int * DESCA, char * BETA,
00023                  char * C, int IC, int JC, int * DESCC )
00024 #else
00025 void PB_CpsyrkA( TYPE, DIRECA, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA,
00026                  JA, DESCA, BETA, C, IC, JC, DESCC )
00027 /*
00028 *  .. Scalar Arguments ..
00029 */
00030    char           * CONJUG, * DIRECA, * TRANS, * UPLO;
00031    int            IA, IC, JA, JC, K, N;
00032    char           * ALPHA, * BETA;
00033    PBTYP_T        * TYPE;
00034 /*
00035 *  .. Array Arguments ..
00036 */
00037    int            * DESCA, * DESCC;
00038    char           * A, * C;
00039 #endif
00040 {
00041 /*
00042 *  Purpose
00043 *  =======
00044 *
00045 *  PB_CpsyrkA  performs one of the following symmetric or Hermitian rank
00046 *  k operations
00047 *
00048 *     sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
00049 *  or
00050 *     sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ),
00051 *  or
00052 *     sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
00053 *  or
00054 *     sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ),
00055 *
00056 *  where
00057 *
00058 *     sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and,
00059 *
00060 *     sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1)  if TRANS = 'N',
00061 *                      A(IA:IA+K-1,JA:JA+N-1)  otherwise.
00062 *
00063 *  Alpha  and   beta  are  scalars,  sub( C )  is  an  n by n  symmetric
00064 *  or Hermitian submatrix and  sub( A )  is  an  n by k submatrix in the
00065 *  first case and a k by n submatrix in the second case.
00066 *
00067 *  This is the outer-product algorithm  using  the  logical  LCM  hybrid
00068 *  and static blocking techniques. The submatrix operand sub( C )  stays
00069 *  in place.
00070 *
00071 *  Notes
00072 *  =====
00073 *
00074 *  A description  vector  is associated with each 2D block-cyclicly dis-
00075 *  tributed matrix.  This  vector  stores  the  information  required to
00076 *  establish the  mapping  between a  matrix entry and its corresponding
00077 *  process and memory location.
00078 *
00079 *  In  the  following  comments,   the character _  should  be  read  as
00080 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
00081 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
00082 *
00083 *  NOTATION         STORED IN       EXPLANATION
00084 *  ---------------- --------------- ------------------------------------
00085 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
00086 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
00087 *                                   the NPROW x NPCOL BLACS process grid
00088 *                                   A  is  distributed over. The context
00089 *                                   itself  is  global,  but  the handle
00090 *                                   (the integer value) may vary.
00091 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
00092 *                                   ted matrix A, M_A >= 0.
00093 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
00094 *                                   buted matrix A, N_A >= 0.
00095 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
00096 *                                   block of the matrix A, IMB_A > 0.
00097 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
00098 *                                   left   block   of   the  matrix   A,
00099 *                                   INB_A > 0.
00100 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
00101 *                                   bute the last  M_A-IMB_A  rows of A,
00102 *                                   MB_A > 0.
00103 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
00104 *                                   bute the last  N_A-INB_A  columns of
00105 *                                   A, NB_A > 0.
00106 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
00107 *                                   row of the matrix  A is distributed,
00108 *                                   NPROW > RSRC_A >= 0.
00109 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
00110 *                                   first column of  A  is  distributed.
00111 *                                   NPCOL > CSRC_A >= 0.
00112 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
00113 *                                   array  storing  the  local blocks of
00114 *                                   the distributed matrix A,
00115 *                                   IF( Lc( 1, N_A ) > 0 )
00116 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
00117 *                                   ELSE
00118 *                                      LLD_A >= 1.
00119 *
00120 *  Let K be the number of  rows of a matrix A starting at the global in-
00121 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
00122 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
00123 *  receive if these K rows were distributed over NPROW processes.  If  K
00124 *  is the number of columns of a matrix  A  starting at the global index
00125 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
00126 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
00127 *  these K columns were distributed over NPCOL processes.
00128 *
00129 *  The values of Lr() and Lc() may be determined via a call to the func-
00130 *  tion PB_Cnumroc:
00131 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
00132 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
00133 *
00134 *  Arguments
00135 *  =========
00136 *
00137 *  TYPE    (local input) pointer to a PBTYP_T structure
00138 *          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
00139 *          that contains type information (See pblas.h).
00140 *
00141 *  DIRECA  (global input) pointer to CHAR
00142 *          On entry, DIRECA  specifies  the direction in which the  rows
00143 *          or columns of sub( A ) should be looped over as follows:
00144 *             DIRECA = 'F' or 'f'   forward  or increasing,
00145 *             DIRECA = 'B' or 'b'   backward or decreasing.
00146 *
00147 *  CONJUG  (global input) pointer to CHAR
00148 *          On entry, CONJUG specifies whether sub( C ) is a symmetric or
00149 *          Hermitian submatrix operand as follows:
00150 *             CONJUG = 'N' or 'n'    sub( C ) is symmetric,
00151 *             CONJUG = 'Z' or 'z'    sub( C ) is Hermitian.
00152 *
00153 *  UPLO    (global input) pointer to CHAR
00154 *          On  entry,   UPLO  specifies  whether  the  local  pieces  of
00155 *          the array  C  containing the  upper or lower triangular  part
00156 *          of the submatrix  sub( C )  are to be referenced as follows:
00157 *             UPLO = 'U' or 'u'   Only the local pieces corresponding to
00158 *                                 the   upper  triangular  part  of  the
00159 *                                 submatrix sub( C ) are referenced,
00160 *             UPLO = 'L' or 'l'   Only the local pieces corresponding to
00161 *                                 the   lower  triangular  part  of  the
00162 *                                 submatrix sub( C ) are referenced.
00163 *
00164 *  TRANS   (global input) pointer to CHAR
00165 *          On entry,  TRANS  specifies the  operation to be performed as
00166 *          follows:
00167 *
00168 *             TRANS = 'N' or 'n'
00169 *                  sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
00170 *             or
00171 *                  sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
00172 *             or
00173 *                  sub( C ) := alpha*sub( A )*conjg( sub( A )' ) +
00174 *                              beta*sub( C ),
00175 *
00176 *             TRANS = 'T' or 't'
00177 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
00178 *             or
00179 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
00180 *
00181 *             TRANS = 'C' or 'c'
00182 *                  sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
00183 *             or
00184 *                  sub( C ) := alpha*conjg( sub( A )' )*sub( A ) +
00185 *                              beta*sub( C ).
00186 *
00187 *  N       (global input) INTEGER
00188 *          On entry,  N specifies the order of the  submatrix  sub( C ).
00189 *          N must be at least zero.
00190 *
00191 *  K       (global input) INTEGER
00192 *          On entry, with TRANS = 'N' or 'n',  K specifies the number of
00193 *          columns  of  the submatrix  sub( A ), and with TRANS = 'T' or
00194 *          't' or 'C' or 'c', K specifies the number of rows of the sub-
00195 *          matrix sub( A ). K  must  be at least zero.
00196 *
00197 *  ALPHA   (global input) pointer to CHAR
00198 *          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
00199 *          supplied  as  zero  then  the  local entries of  the array  A
00200 *          corresponding to the entries of the submatrix  sub( A )  need
00201 *          not be set on input.
00202 *
00203 *  A       (local input) pointer to CHAR
00204 *          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
00205 *          at least  Lc( 1, JA+K-1 ) when  TRANS = 'N' or 'n', and is at
00206 *          least Lc( 1, JA+N-1 ) otherwise.  Before  entry,  this  array
00207 *          contains the local entries of the matrix A.
00208 *          Before entry with TRANS = 'N' or 'n', this array contains the
00209 *          local entries corresponding to the entries of the n by k sub-
00210 *          matrix sub( A ), otherwise the local entries corresponding to
00211 *          the entries of the k by n submatrix sub( A ).
00212 *
00213 *  IA      (global input) INTEGER
00214 *          On entry, IA  specifies A's global row index, which points to
00215 *          the beginning of the submatrix sub( A ).
00216 *
00217 *  JA      (global input) INTEGER
00218 *          On entry, JA  specifies A's global column index, which points
00219 *          to the beginning of the submatrix sub( A ).
00220 *
00221 *  DESCA   (global and local input) INTEGER array
00222 *          On entry, DESCA  is an integer array of dimension DLEN_. This
00223 *          is the array descriptor for the matrix A.
00224 *
00225 *  BETA    (global input) pointer to CHAR
00226 *          On entry,  BETA  specifies the scalar  beta.   When  BETA  is
00227 *          supplied  as  zero  then  the  local entries of  the array  C
00228 *          corresponding to the entries of the submatrix  sub( C )  need
00229 *          not be set on input.
00230 *
00231 *  C       (local input/local output) pointer to CHAR
00232 *          On entry, C is an array of dimension (LLD_C, Kc), where Kc is
00233 *          at least Lc( 1, JC+N-1 ).  Before  entry, this array contains
00234 *          the local entries of the matrix C.
00235 *          Before  entry  with  UPLO = 'U' or 'u', this  array  contains
00236 *          the local entries corresponding to the upper triangular  part
00237 *          of the  symmetric or Hermitian submatrix  sub( C ),  and  the
00238 *          local entries corresponding to the  strictly lower triangular
00239 *          of sub( C ) are not referenced. On exit, the upper triangular
00240 *          part  of sub( C ) is overwritten by the upper triangular part
00241 *          of the updated submatrix.
00242 *          Before  entry  with  UPLO = 'L' or 'l', this  array  contains
00243 *          the local entries corresponding to the lower triangular  part
00244 *          of the  symmetric or Hermitian submatrix  sub( C ),  and  the
00245 *          local entries corresponding to the  strictly upper triangular
00246 *          of sub( C ) are not referenced. On exit, the lower triangular
00247 *          part of sub( C ) is overwritten by the  lower triangular part
00248 *          of the updated submatrix.
00249 *          Note that the  imaginary parts  of the local entries  corres-
00250 *          ponding to the  diagonal elements  of  sub( C )  need not  be
00251 *          set,  they are assumed to be zero,  and on exit they are  set
00252 *          to zero.
00253 *
00254 *  IC      (global input) INTEGER
00255 *          On entry, IC  specifies C's global row index, which points to
00256 *          the beginning of the submatrix sub( C ).
00257 *
00258 *  JC      (global input) INTEGER
00259 *          On entry, JC  specifies C's global column index, which points
00260 *          to the beginning of the submatrix sub( C ).
00261 *
00262 *  DESCC   (global and local input) INTEGER array
00263 *          On entry, DESCC  is an integer array of dimension DLEN_. This
00264 *          is the array descriptor for the matrix C.
00265 *
00266 *  -- Written on April 1, 1998 by
00267 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
00268 *
00269 *  ---------------------------------------------------------------------
00270 */
00271 /*
00272 *  .. Local Scalars ..
00273 */
00274    char           * one;
00275    int            AcurrocR, Afwd, AiD, AiR, AiiD, AiiR, AinbD, AinbR, Ainb1D,
00276                   Ainb1R, AisR, Ald, AmyprocD, AmyprocR, AnbD, AnbR, AnpR,
00277                   AnprocsD, AnprocsR, ArocD, ArocR, Arocs, AsrcR, Ccol, Cii,
00278                   Cimb1, Cinb1, Cjj, Clcmb, Cld, Clp, Clq, Cnq0, Cmb, Cmp,
00279                   Cmp0, Cnb, Cnq, Crow, WACfr, WACld, WACsum, WARfr, WARld,
00280                   WARsum, Wkbb=0, ctxt, k, kb, kbb, l, lb, ltmp, maxp, mycol,
00281                   myrow, notran, npcol, nprow, p=0, size, tmp, upper;
00282    GEMM_T         gemm;
00283    TZSYR_T        tzsyrk;
00284 /*
00285 *  .. Local Arrays ..
00286 */
00287    int            Cd0[DLEN_], DBUFA[DLEN_], WACd0[DLEN_], WARd0[DLEN_];
00288    char           * Aptr = NULL, * Cptr = NULL, * WAC = NULL, * WAR  = NULL;
00289 /* ..
00290 *  .. Executable Statements ..
00291 *
00292 */
00293 /*
00294 *  sub( C ) := beta * sub( C )
00295 */
00296    PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC );
00297 /*
00298 *  Retrieve process grid information
00299 */
00300    Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
00301 
00302    size = TYPE->size; one = TYPE->one; gemm  = TYPE->Fgemm;
00303    kb   = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
00304 /*
00305 *  Compute descriptor Cd0 for sub( C )
00306 */
00307    PB_Cdescribe( N, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
00308                  &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 );
00309 
00310    Cmp  = PB_Cnumroc( N, 0, Cimb1, Cmb, myrow, Crow, nprow );
00311    Cnq  = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
00312 
00313    if( ( Cmp > 0 ) && ( Cnq > 0 ) )
00314    {
00315       Cptr   = Mptr( C, Cii, Cjj, Cld, size );
00316       tzsyrk = ( ( Mupcase( CONJUG[0] ) == CNOCONJG ) ? PB_Ctzsyrk :
00317                                                         PB_Ctzherk );
00318 /*
00319 *  Computational partitioning size is computed as the product of the logical
00320 *  value returned by pilaenv_ and 2 * lcm( nprow, npcol ).
00321 */
00322       Clcmb  = 2 * kb * PB_Clcm( ( Crow >= 0 ? nprow : 1 ),
00323                                  ( Ccol >= 0 ? npcol : 1 ) );
00324    }
00325 /*
00326 *  Retrieve local information for sub( A )
00327 */
00328    if( ( notran = ( Mupcase( TRANS[0] ) == CNOTRAN  ) ) != 0 )
00329    {
00330       AiR   = JA; AnprocsR = npcol; AinbR = DESCA[INB_]; AnbR = DESCA[NB_];
00331       AsrcR = DESCA[CSRC_];
00332    }
00333    else
00334    {
00335       AiR   = IA; AnprocsR = nprow; AinbR = DESCA[IMB_]; AnbR = DESCA[MB_];
00336       AsrcR = DESCA[RSRC_];
00337    }
00338 /*
00339 *  If sub( A ) only spans one process row or column, then there is no need to
00340 *  pack the data.
00341 */
00342    if( !( PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) )
00343    {
00344 /*
00345 *  Replicate sub( A ) in process rows and columns spanned by sub( C ): WAC, WAR
00346 */
00347       if( notran )
00348       {
00349          PB_CInV( TYPE, NOCONJG, COLUMN, N, N, Cd0, K, A, IA, JA, DESCA,
00350                   COLUMN, &WAC, WACd0, &WACfr );
00351          PB_CInV( TYPE, CONJUG,  ROW,    N, N, Cd0, K, WAC, 0, 0, WACd0,
00352                   COLUMN, &WAR, WARd0, &WARfr );
00353       }
00354       else
00355       {
00356          PB_CInV( TYPE, NOCONJG, ROW,    N, N, Cd0, K, A, IA, JA, DESCA,
00357                   ROW,    &WAR, WARd0, &WARfr );
00358          PB_CInV( TYPE, CONJUG,  COLUMN, N, N, Cd0, K, WAR, 0, 0, WARd0,
00359                   ROW,    &WAC, WACd0, &WACfr );
00360       }
00361 /*
00362 *  Perform the local update if I own some data
00363 */
00364       if( ( Cmp > 0 ) && ( Cnq > 0 ) )
00365       {
00366          WACld = WACd0[LLD_]; WARld = WARd0[LLD_];
00367 
00368          if( Mupcase( UPLO[0] ) == CUPPER )
00369          {
00370             for( l = 0; l < N; l += Clcmb )
00371             {
00372                lb   = N - l; lb = MIN( lb, Clcmb );
00373                Clp  = PB_Cnumroc( l,  0, Cimb1, Cmb, myrow, Crow, nprow );
00374                Clq  = PB_Cnumroc( l,  0, Cinb1, Cnb, mycol, Ccol, npcol );
00375                Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
00376                if( Clp > 0 && Cnq0 > 0 )
00377                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0, &K,
00378                         ALPHA, WAC, &WACld, Mptr( WAR, 0, Clq, WARld, size ),
00379                         &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ), &Cld );
00380                PB_Cpsyr( TYPE, UPPER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld,
00381                          size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld,
00382                          Cptr, l, l, Cd0, tzsyrk );
00383             }
00384          }
00385          else
00386          {
00387             for( l = 0; l < N; l += Clcmb )
00388             {
00389                lb   = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) );
00390                Clp  = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
00391                Clq  = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
00392                PB_Cpsyr( TYPE, LOWER, lb, K, ALPHA, Mptr( WAC, Clp, 0, WACld,
00393                          size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld,
00394                          Cptr, l, l, Cd0, tzsyrk );
00395                Clp  = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
00396                Cmp0 = Cmp - Clp;
00397                Cnq0 = PB_Cnumroc( lb,   l, Cinb1, Cnb, mycol, Ccol, npcol );
00398                if( Cmp0 > 0 && Cnq0 > 0 )
00399                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
00400                         &K, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld,
00401                         Mptr( WAR, 0, Clq, WARld, size ), &WARld, one,
00402                         Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
00403             }
00404          }
00405       }
00406 
00407       if( WACfr ) free( WAC );
00408       if( WARfr ) free( WAR );
00409 
00410       return;
00411    }
00412 /*
00413 *  Otherwise sub( A ) spans more than one process row or columns -> LCM hybrid
00414 */
00415    Afwd  = ( Mupcase( DIRECA[0] ) == CFORWARD );
00416    upper = ( Mupcase( UPLO  [0] ) == CUPPER   );
00417 
00418    if( notran )
00419    {
00420       AiD = IA; AinbD = DESCA[IMB_]; AnbD = DESCA[MB_]; Ald = DESCA[LLD_];
00421       AmyprocD = myrow; AmyprocR = mycol; AnprocsD = nprow;
00422       PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
00423                    &AiiD, &AiiR, &ArocD, &ArocR );
00424    }
00425    else
00426    {
00427       AiD = JA; AinbD = DESCA[INB_]; AnbD = DESCA[NB_]; Ald = DESCA[LLD_];
00428       AmyprocD = mycol; AmyprocR = myrow; AnprocsD = npcol;
00429       PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
00430                    &AiiR, &AiiD, &ArocR, &ArocD );
00431    }
00432    Ainb1D = PB_Cfirstnb( N, AiD, AinbD, AnbD );
00433    Ainb1R = PB_Cfirstnb( K, AiR, AinbR, AnbR );
00434    AisR   = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) );
00435 /*
00436 *  When sub( A ) is not replicated and backward pass on sub( A ), find the
00437 *  virtual process (p,p) owning the last row or column of sub( A ).
00438 */
00439    if( !( AisR ) && !( Afwd ) )
00440    {
00441       tmp = PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
00442       p   = MModSub( tmp, ArocR, AnprocsR );
00443    }
00444 /*
00445 *  Allocate work space in process rows and columns spanned by sub( C )
00446 */
00447    PB_COutV( TYPE, COLUMN, NOINIT, N, N, Cd0, kb, &WAC, WACd0, &WACfr,
00448              &WACsum );
00449    PB_COutV( TYPE, ROW,    NOINIT, N, N, Cd0, kb, &WAR, WARd0, &WARfr,
00450              &WARsum );
00451 /*
00452 *  Loop over the virtual process grid induced by the rows or columns of sub( A )
00453 */
00454    maxp     = ( AisR ? 1 : AnprocsR );
00455    AcurrocR = ( AisR ? -1 : MModAdd( ArocR, p, AnprocsR ) );
00456    AnpR     = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
00457 
00458    for( k = 0; k < K; k += kb )
00459    {
00460       kbb = K - k; kbb = MIN( kbb, kb );
00461 
00462       while( Wkbb != kbb )
00463       {
00464 /*
00465 *  Ensure that the current virtual process (p,p) has something to contribute to
00466 *  the replicated buffers WAC and WAR.
00467 */
00468          while( AnpR == 0 )
00469          {
00470             p        = ( Afwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
00471             AcurrocR = ( AisR ? -1 : MModAdd( ArocR, p, AnprocsR ) );
00472             AnpR     = PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR,
00473                                    AnprocsR );
00474          }
00475 /*
00476 *  Current virtual process (p,p) has something, find out how many rows or
00477 *  columns could be used: Arocs.
00478 */
00479          if( Wkbb == 0 ) { Arocs = ( AnpR < kbb ? AnpR : kbb ); }
00480          else            { Arocs = kbb - Wkbb; Arocs = MIN( Arocs, AnpR ); }
00481 /*
00482 *  The current virtual process (p,p) has Arocs rows or columns of sub( A )
00483 *  to contribute, replicate the data over sub( C ).
00484 */
00485          if( notran )
00486          {
00487             if( AisR || ( AmyprocR == AcurrocR ) )
00488             { Aptr = Mptr( A, AiiD, AiiR, Ald, size ); AiiR += Arocs; }
00489             PB_Cdescset( DBUFA, N, Arocs, Ainb1D, Arocs, AnbD, Arocs,
00490                          ArocD, AcurrocR, ctxt, Ald );
00491 /*
00492 *  Replicate Arocs columns of sub( A ) in process columns spanned by sub( C )
00493 */
00494             PB_CInV2( TYPE, NOCONJG, COLUMN, N, N, Cd0, Arocs, Aptr, 0, 0,
00495                       DBUFA, COLUMN, WAC, Wkbb, WACd0 );
00496          }
00497          else
00498          {
00499             if( AisR || ( AmyprocR == AcurrocR ) )
00500             { Aptr = Mptr( A, AiiR, AiiD, Ald, size ); AiiR += Arocs; }
00501             PB_Cdescset( DBUFA, Arocs, N, Arocs, Ainb1D, Arocs, AnbD,
00502                          AcurrocR, ArocD, ctxt, Ald );
00503 /*
00504 *  Replicate Arocs rows of sub( A ) in process rows spanned by sub( C )
00505 */
00506             PB_CInV2( TYPE, NOCONJG, ROW,    N, N, Cd0, Arocs, Aptr, 0, 0,
00507                       DBUFA, ROW,    WAR, Wkbb, WARd0 );
00508          }
00509 /*
00510 *  Arocs rows or columns of sub( A ) have been replicated over sub( C ),
00511 *  update the number of diagonals in this virtual process as well as the
00512 *  number of rows or columns of sub( A ) that are in WAR or WAC.
00513 */
00514          AnpR -= Arocs;
00515          Wkbb += Arocs;
00516       }
00517 
00518       if( notran )
00519       {
00520 /*
00521 *  WAR := WAC'
00522 */
00523          PB_CInV2( TYPE, CONJUG,  ROW,    N, N, Cd0, kbb, WAC,  0, 0, WACd0,
00524                    COLUMN, WAR, 0, WARd0 );
00525       }
00526       else
00527       {
00528 /*
00529 *  WAC := WAR'
00530 */
00531          PB_CInV2( TYPE, CONJUG,  COLUMN, N, N, Cd0, kbb, WAR,  0, 0, WARd0,
00532                    ROW,    WAC, 0, WACd0 );
00533       }
00534 /*
00535 *  Perform the local update if I own some data
00536 */
00537       if( ( Cmp > 0 ) && ( Cnq > 0 ) )
00538       {
00539          WACld = WACd0[LLD_]; WARld = WARd0[LLD_];
00540 
00541          if( upper )
00542          {
00543             for( l = 0; l < N; l += Clcmb )
00544             {
00545                lb   = N - l; lb = MIN( lb, Clcmb );
00546                Clp  = PB_Cnumroc( l,  0, Cimb1, Cmb, myrow, Crow, nprow );
00547                Clq  = PB_Cnumroc( l,  0, Cinb1, Cnb, mycol, Ccol, npcol );
00548                Cnq0 = PB_Cnumroc( lb, l, Cinb1, Cnb, mycol, Ccol, npcol );
00549                if( Clp > 0 && Cnq0 > 0 )
00550                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Clp, &Cnq0,
00551                         &kbb, ALPHA, WAC, &WACld, Mptr( WAR, 0, Clq, WARld,
00552                         size ), &WARld, one, Mptr( Cptr, 0, Clq, Cld, size ),
00553                         &Cld );
00554                PB_Cpsyr( TYPE, UPPER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld,
00555                          size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld,
00556                          Cptr, l, l, Cd0, tzsyrk );
00557             }
00558          }
00559          else
00560          {
00561             for( l = 0; l < N; l += Clcmb )
00562             {
00563                lb   = N - l; ltmp = l + ( lb = MIN( lb, Clcmb ) );
00564                Clp  = PB_Cnumroc( l, 0, Cimb1, Cmb, myrow, Crow, nprow );
00565                Clq  = PB_Cnumroc( l, 0, Cinb1, Cnb, mycol, Ccol, npcol );
00566                PB_Cpsyr( TYPE, LOWER, lb, kbb, ALPHA, Mptr( WAC, Clp, 0, WACld,
00567                          size ), WACld, Mptr( WAR, 0, Clq, WARld, size ), WARld,
00568                          Cptr, l, l, Cd0, tzsyrk );
00569                Clp  = PB_Cnumroc( ltmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
00570                Cmp0 = Cmp - Clp;
00571                Cnq0 = PB_Cnumroc( lb,   l, Cinb1, Cnb, mycol, Ccol, npcol );
00572                if( Cmp0 > 0 && Cnq0 > 0 )
00573                   gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Cmp0, &Cnq0,
00574                         &kbb, ALPHA, Mptr( WAC, Clp, 0, WACld, size ), &WACld,
00575                         Mptr( WAR, 0, Clq, WARld, size ), &WARld, one,
00576                         Mptr( Cptr, Clp, Clq, Cld, size ), &Cld );
00577             }
00578          }
00579       }
00580 
00581       Wkbb = 0;
00582    }
00583 
00584    if( WACfr ) free( WAC );
00585    if( WARfr ) free( WAR );
00586 /*
00587 *  End of PB_CpsyrkA
00588 */
00589 }