ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_Cpsyr2kAC.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_Cpsyr2kAC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21  char * UPLO, char * TRANS, int N, int K, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * B,
23  int IB, int JB, int * DESCB, char * BETA, char * C,
24  int IC, int JC, int * DESCC )
25 #else
26 void PB_Cpsyr2kAC( TYPE, DIRECAB, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA,
27  JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28 /*
29 * .. Scalar Arguments ..
30 */
31  char * CONJUG, * DIRECAB, * TRANS, * UPLO;
32  int IA, IB, IC, JA, JB, JC, K, N;
33  char * ALPHA, * BETA;
34  PBTYP_T * TYPE;
35 /*
36 * .. Array Arguments ..
37 */
38  int * DESCA, * DESCB, * DESCC;
39  char * A, * B, * C;
40 #endif
41 {
42 /*
43 * Purpose
44 * =======
45 *
46 * PB_Cpsyr2kAC performs one of the following symmetric or Hermitian
47 * rank 2k operations
48 *
49 * sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' +
50 * beta*sub( C ),
51 * or
52 * sub( C ) := alpha*sub( A )*conjg( sub( B ) )' +
53 * conjg( alpha )*sub( B )*conjg( sub( A ) )' +
54 * beta*sub( C ),
55 * or
56 * sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) +
57 * beta*sub( C ),
58 * or
59 * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
60 * conjg( alpha )*conjg( sub( B )' )*sub( A ) +
61 * beta*sub( C ),
62 *
63 * where
64 *
65 * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1),
66 *
67 * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N',
68 * A(IA:IA+K-1,JA:JA+N-1) otherwise, and,
69 *
70 * sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N',
71 * B(IB:IB+K-1,JB:JB+N-1) otherwise.
72 *
73 * Alpha and beta are scalars, sub( C ) is an n by n symmetric or
74 * Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices
75 * in the first case and k by n submatrices in the second case.
76 *
77 * This is the outer-product algorithm using the logical aggregation
78 * blocking technique.
79 *
80 * Notes
81 * =====
82 *
83 * A description vector is associated with each 2D block-cyclicly dis-
84 * tributed matrix. This vector stores the information required to
85 * establish the mapping between a matrix entry and its corresponding
86 * process and memory location.
87 *
88 * In the following comments, the character _ should be read as
89 * "of the distributed matrix". Let A be a generic term for any 2D
90 * block cyclicly distributed matrix. Its description vector is DESC_A:
91 *
92 * NOTATION STORED IN EXPLANATION
93 * ---------------- --------------- ------------------------------------
94 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
95 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
96 * the NPROW x NPCOL BLACS process grid
97 * A is distributed over. The context
98 * itself is global, but the handle
99 * (the integer value) may vary.
100 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
101 * ted matrix A, M_A >= 0.
102 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
103 * buted matrix A, N_A >= 0.
104 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
105 * block of the matrix A, IMB_A > 0.
106 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
107 * left block of the matrix A,
108 * INB_A > 0.
109 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
110 * bute the last M_A-IMB_A rows of A,
111 * MB_A > 0.
112 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
113 * bute the last N_A-INB_A columns of
114 * A, NB_A > 0.
115 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
116 * row of the matrix A is distributed,
117 * NPROW > RSRC_A >= 0.
118 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
119 * first column of A is distributed.
120 * NPCOL > CSRC_A >= 0.
121 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
122 * array storing the local blocks of
123 * the distributed matrix A,
124 * IF( Lc( 1, N_A ) > 0 )
125 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
126 * ELSE
127 * LLD_A >= 1.
128 *
129 * Let K be the number of rows of a matrix A starting at the global in-
130 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
131 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
132 * receive if these K rows were distributed over NPROW processes. If K
133 * is the number of columns of a matrix A starting at the global index
134 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
135 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
136 * these K columns were distributed over NPCOL processes.
137 *
138 * The values of Lr() and Lc() may be determined via a call to the func-
139 * tion PB_Cnumroc:
140 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
141 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
142 *
143 * Arguments
144 * =========
145 *
146 * TYPE (local input) pointer to a PBTYP_T structure
147 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
148 * that contains type information (See pblas.h).
149 *
150 * DIRECAB (global input) pointer to CHAR
151 * On entry, DIRECAB specifies the direction in which the rows
152 * or columns of sub( A ), sub( B ) and sub( C ) should be
153 * looped over as follows:
154 * DIRECAB = 'F' or 'f' forward or increasing,
155 * DIRECAB = 'B' or 'b' backward or decreasing.
156 *
157 * CONJUG (global input) pointer to CHAR
158 * On entry, CONJUG specifies whether sub( C ) is a symmetric or
159 * Hermitian submatrix operand as follows:
160 * CONJUG = 'N' or 'n' sub( C ) is symmetric,
161 * CONJUG = 'Z' or 'z' sub( C ) is Hermitian.
162 *
163 * UPLO (global input) pointer to CHAR
164 * On entry, UPLO specifies whether the local pieces of
165 * the array C containing the upper or lower triangular part
166 * of the submatrix sub( C ) are to be referenced as follows:
167 * UPLO = 'U' or 'u' Only the local pieces corresponding to
168 * the upper triangular part of the
169 * submatrix sub( C ) are referenced,
170 * UPLO = 'L' or 'l' Only the local pieces corresponding to
171 * the lower triangular part of the
172 * submatrix sub( C ) are referenced.
173 *
174 * TRANS (global input) pointer to CHAR
175 * On entry, TRANS specifies the operation to be performed as
176 * follows:
177 *
178 * TRANS = 'N' or 'n'
179 * sub( C ) := alpha*sub( A )*sub( B )' +
180 * alpha*sub( B )*sub( A )' +
181 * beta*sub( C ),
182 * or
183 * sub( C ) := alpha*sub( A )*sub( B )' +
184 * alpha*sub( B )*sub( A )' +
185 * beta*sub( C ),
186 * or
187 * sub( C ) := alpha*sub( A )*conjg( sub( B )' ) +
188 * conjg( alpha )*sub( B )*conjg( sub( A )' ) +
189 * beta*sub( C ),
190 *
191 * TRANS = 'T' or 't'
192 * sub( C ) := alpha*sub( B )'*sub( A ) +
193 * alpha*sub( A )'*sub( B ) +
194 * beta*sub( C ),
195 * or
196 * sub( C ) := alpha*sub( B )'*sub( A ) +
197 * alpha*sub( A )'*sub( B ) +
198 * beta*sub( C ),
199 *
200 * TRANS = 'C' or 'c'
201 * sub( C ) := alpha*sub( B )'*sub( A ) +
202 * alpha*sub( A )'*sub( B ) +
203 * beta*sub( C ),
204 * or
205 * sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
206 * conjg( alpha )*conjg( sub( B )' )*sub( A ) +
207 * beta*sub( C ).
208 *
209 * N (global input) INTEGER
210 * On entry, N specifies the order of the submatrix sub( C ).
211 * N must be at least zero.
212 *
213 * K (global input) INTEGER
214 * On entry with TRANS = 'N' or 'n', K specifies the number of
215 * columns of the submatrices sub( A ) and sub( B ), and on
216 * entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the
217 * number of rows of the submatrices sub( A ) and sub( B ).
218 * K must be at least zero.
219 *
220 * ALPHA (global input) pointer to CHAR
221 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
222 * supplied as zero then the local entries of the arrays A
223 * and B corresponding to the entries of the submatrices
224 * sub( A ) and sub( B ) respectively need not be set on input.
225 *
226 * A (local input) pointer to CHAR
227 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
228 * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at
229 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
230 * contains the local entries of the matrix A.
231 * Before entry with TRANS = 'N' or 'n', this array contains the
232 * local entries corresponding to the entries of the n by k sub-
233 * matrix sub( A ), otherwise the local entries corresponding to
234 * the entries of the k by n submatrix sub( A ).
235 *
236 * IA (global input) INTEGER
237 * On entry, IA specifies A's global row index, which points to
238 * the beginning of the submatrix sub( A ).
239 *
240 * JA (global input) INTEGER
241 * On entry, JA specifies A's global column index, which points
242 * to the beginning of the submatrix sub( A ).
243 *
244 * DESCA (global and local input) INTEGER array
245 * On entry, DESCA is an integer array of dimension DLEN_. This
246 * is the array descriptor for the matrix A.
247 *
248 * B (local input) pointer to CHAR
249 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
250 * at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at
251 * least Lc( 1, JB+N-1 ) otherwise. Before entry, this array
252 * contains the local entries of the matrix B.
253 * Before entry with TRANS = 'N' or 'n', this array contains the
254 * local entries corresponding to the entries of the n by k sub-
255 * matrix sub( B ), otherwise the local entries corresponding to
256 * the entries of the k by n submatrix sub( B ).
257 *
258 * IB (global input) INTEGER
259 * On entry, IB specifies B's global row index, which points to
260 * the beginning of the submatrix sub( B ).
261 *
262 * JB (global input) INTEGER
263 * On entry, JB specifies B's global column index, which points
264 * to the beginning of the submatrix sub( B ).
265 *
266 * DESCB (global and local input) INTEGER array
267 * On entry, DESCB is an integer array of dimension DLEN_. This
268 * is the array descriptor for the matrix B.
269 *
270 * BETA (global input) pointer to CHAR
271 * On entry, BETA specifies the scalar beta. When BETA is
272 * supplied as zero then the local entries of the array C
273 * corresponding to the entries of the submatrix sub( C ) need
274 * not be set on input.
275 *
276 * C (local input/local output) pointer to CHAR
277 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
278 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
279 * the local entries of the matrix C.
280 * Before entry with UPLO = 'U' or 'u', this array contains
281 * the local entries corresponding to the upper triangular part
282 * of the symmetric or Hermitian submatrix sub( C ), and the
283 * local entries corresponding to the strictly lower triangular
284 * of sub( C ) are not referenced. On exit, the upper triangular
285 * part of sub( C ) is overwritten by the upper triangular part
286 * of the updated submatrix.
287 * Before entry with UPLO = 'L' or 'l', this array contains
288 * the local entries corresponding to the lower triangular part
289 * of the symmetric or Hermitian submatrix sub( C ), and the
290 * local entries corresponding to the strictly upper triangular
291 * of sub( C ) are not referenced. On exit, the lower triangular
292 * part of sub( C ) is overwritten by the lower triangular part
293 * of the updated submatrix.
294 * Note that the imaginary parts of the local entries corres-
295 * ponding to the diagonal elements of sub( C ) need not be
296 * set, they are assumed to be zero, and on exit they are set
297 * to zero.
298 *
299 * IC (global input) INTEGER
300 * On entry, IC specifies C's global row index, which points to
301 * the beginning of the submatrix sub( C ).
302 *
303 * JC (global input) INTEGER
304 * On entry, JC specifies C's global column index, which points
305 * to the beginning of the submatrix sub( C ).
306 *
307 * DESCC (global and local input) INTEGER array
308 * On entry, DESCC is an integer array of dimension DLEN_. This
309 * is the array descriptor for the matrix C.
310 *
311 * -- Written on April 1, 1998 by
312 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
313 *
314 * ---------------------------------------------------------------------
315 */
316 /*
317 * .. Local Scalars ..
318 */
319  char GatherDir, ScatterDir, * one, top, * talpha, * tbeta, tran,
320  * zero;
321  int ABm, ABn, Acol, Acurcol, Acurrow, Acurimb1, Acurinb1, Afr,
322  Aii, Aimb, Aimb1, Ainb, Ainb1, AisD, AisR, Ajj, Ald, Amb,
323  Amp, Amp0, Anb, Anq, Anq0, Arow, Aspan, Bcol, Bcurcol,
324  Bcurrow, Bcurimb1, Bcurinb1, Bfr, Bii, Bimb, Bimb1, Binb,
325  Binb1, BisD, BisR, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0,
326  Brow, Bspan, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr, WACfr,
327  WACld, WACreuse, WACsum, WBfr, WBCfr, WBCld, WBCsum, conjg,
328  ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, mycol,
329  myrow, notran, npcol, nprow, size, upper;
330  GEMM_T gemm;
331  GSUM2D_T gsum2d;
332 /*
333 * .. Local Arrays ..
334 */
335  char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL,
336  * WA = NULL, * WB = NULL, * WAC = NULL, *WBC = NULL;
337  int Ad0[DLEN_], Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_],
338  WAd[DLEN_], WBd[DLEN_], WACd [DLEN_], WBCd [DLEN_];
339 /* ..
340 * .. Executable Statements ..
341 *
342 */
343 /*
344 * sub( C ) = beta * sub( C )
345 */
346  PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC );
347 /*
348 * Retrieve process grid information
349 */
350  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
351 
352  fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
353  conjg = ( Mupcase( CONJUG [0] ) == CCONJG );
354  upper = ( Mupcase( UPLO [0] ) == CUPPER );
355  notran = ( Mupcase( TRANS [0] ) == CNOTRAN );
356 
357  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
358  gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
359 /*
360 * Figure out the loop bounds accordingly to DIRECAB
361 */
362  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
363  if( fwd )
364  {
365  kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb;
366  GatherDir = CFORWARD; ScatterDir = CBACKWARD;
367  }
368  else
369  {
370  kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb;
371  GatherDir = CBACKWARD; ScatterDir = CFORWARD;
372  }
373 /*
374 * Compute conjg( ALPHA ) and transpose parameter for Hermitian case
375 */
376  if( conjg )
377  {
378  tran = CCOTRAN; talpha = PB_Cmalloc( size );
379  PB_Cconjg( TYPE, ALPHA, talpha );
380  }
381  else { tran = CTRAN; talpha = ALPHA; }
382 /*
383 * Compute local information for sub( A ) and sub( B )
384 */
385  if( notran ) { ABm = N; ABn = K; } else { ABm = K; ABn = N; }
386  PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
387  &Arow, &Acol );
388  Aimb = DESCA[IMB_]; Ainb = DESCA[INB_];
389  Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_];
390  Aimb1 = PB_Cfirstnb( ABm, IA, Aimb, Amb );
391  Amp0 = PB_Cnumroc( ABm, 0, Aimb1, Amb, myrow, Arow, nprow );
392  Ainb1 = PB_Cfirstnb( ABn, JA, Ainb, Anb );
393  Anq0 = PB_Cnumroc( ABn, 0, Ainb1, Anb, mycol, Acol, npcol );
394  if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size );
395 
396  PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
397  &Brow, &Bcol );
398  Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
399  Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
400  Bimb1 = PB_Cfirstnb( ABm, IB, Bimb, Bmb );
401  Bmp0 = PB_Cnumroc( ABm, 0, Bimb1, Bmb, myrow, Brow, nprow );
402  Binb1 = PB_Cfirstnb( ABn, JB, Binb, Bnb );
403  Bnq0 = PB_Cnumroc( ABn, 0, Binb1, Bnb, mycol, Bcol, npcol );
404  if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
405 
406  if( notran )
407  {
408  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
409  Cinb = DESCC[INB_]; Cnb = DESCC[NB_]; Ccsrc = DESCC[CSRC_];
410 /*
411 * Determine if one can reuse the WAC buffer for the intermediate local products
412 * sub( A ) * sub( B )' and sub( B ) * sub( A )'.
413 */
414  AisR = ( ( Acol < 0 ) || ( npcol == 1 ) );
415  BisR = ( ( Bcol < 0 ) || ( npcol == 1 ) );
416 
417  if( !( AisR ) && !( BisR ) )
418  {
419 /*
420 * When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if
421 * either sub( A ) spans more than one process column, or, neither sub( A )
422 * nor sub( B ) span more than one process column, and both operands reside
423 * in the same process column.
424 */
425  Aspan = PB_Cspan( ABn, 0, Ainb1, Anb, Acol, npcol );
426  Bspan = PB_Cspan( ABn, 0, Binb1, Bnb, Bcol, npcol );
427  WACreuse = ( Aspan ||
428  ( !( Aspan ) && !( Bspan ) && ( Acol == Bcol ) ) );
429  }
430  else
431  {
432 /*
433 * Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are
434 * replicated.
435 */
436  WACreuse = ( AisR && BisR );
437  }
438 /*
439 * Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be
440 * either both not row-distributed, or, both row-distributed and aligned.
441 */
442  AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) );
443  BisD = ( ( Brow >= 0 ) && ( nprow > 1 ) );
444  WACreuse = ( WACreuse &&
445  ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
446  ( ( Arow == Brow ) &&
447  ( ( ( Aimb1 >= ABm ) && ( Bimb1 >= ABm ) ) ||
448  ( ( Aimb1 == Bimb1 ) && ( Amb == Bmb ) ) ) ) ) ) );
449  tbeta = ( WACreuse ? one : zero );
450 
451  if( upper )
452  {
453  for( k = kstart; k != kend; k += kstep )
454  {
455  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
456 /*
457 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 )
458 */
459  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB,
460  ROW, &Bptr, DBUFB, &Bfr );
461 /*
462 * Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA:IA+k+kbb-1, JA:JA+K-1 )
463 */
464  PB_Cdescset( Ad0, ktmp, ABn, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
465  ctxt, Ald );
466  PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
467  ROW, &WB, WBd, &WBfr );
468 /*
469 * WAC := A( IA:IA+k+kbb-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )'
470 */
471  PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd,
472  &WACfr, &WACsum );
473  WACld = WACd[LLD_];
474  Amp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow );
475  if( ( Amp > 0 ) && ( Anq0 > 0 ) )
476  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
477  ALPHA, Aptr0, &Ald, WB, &WBd[LLD_], zero, WAC, &WACld );
478  if( WBfr ) free( WB );
479  if( Bfr ) free( Bptr );
480 /*
481 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
482 */
483  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA,
484  ROW, &Aptr, DBUFA, &Afr );
485 /*
486 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB:IB+k+kbb-1, JB:JB+K-1 )
487 */
488  PB_Cdescset( Bd0, ktmp, ABn, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
489  ctxt, Bld );
490  PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
491  ROW, &WA, WAd, &WAfr );
492 /*
493 * WBC := B( IB:IB+k+kbb-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
494 */
495  if( WACreuse )
496  { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
497  else
498  {
499  PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd,
500  &WBCfr, &WBCsum );
501  }
502  WBCld = WBCd[LLD_];
503  Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
504  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
505  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0,
506  talpha, Bptr0, &Bld, WA, &WAd[LLD_], tbeta, WBC, &WBCld );
507  if( WAfr ) free( WA );
508  if( Afr ) free( Aptr );
509 /*
510 * Combine the local copies of WAC when necessary
511 */
512  if( WACsum )
513  {
514  WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb,
515  Cnb, Ccsrc, Ccsrc, npcol );
516  if( Amp > 0 )
517  gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow,
518  WACd[CSRC_] );
519  }
520 /*
521 * Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 )
522 */
523  if( conjg )
524  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC,
525  k, 0, WACd );
526  else if( kbb > 1 )
527  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
528  k+1, 0, WACd );
529 /*
530 * Combine the local copies of WBC when necessary
531 */
532  if( !( WACreuse ) )
533  {
534  if( WBCsum )
535  {
536  if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; }
537  else
538  {
539  WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ),
540  Cinb, Cnb, Ccsrc, Ccsrc,
541  npcol );
542  }
543  if( Bmp > 0 )
544  gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
545  WBCd[CSRC_] );
546  }
547 /*
548 * Zero lower triangle of WBC( k:k+kbb-1, 0:kbb-1 )
549 */
550  if( conjg )
551  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero,
552  WBC, k, 0, WBCd );
553  else if( kbb > 1 )
554  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
555  WBC, k+1, 0, WBCd );
556  }
557 /*
558 * Add WAC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 )
559 */
560  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN,
561  one, C, IC, JC+k, DESCC, COLUMN );
562  if( WACfr ) free( WAC );
563 /*
564 * Add WBC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 )
565 */
566  if( !( WACreuse ) )
567  {
568  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd,
569  COLUMN, one, C, IC, JC+k, DESCC, COLUMN );
570  if( WBCfr ) free( WBC );
571  }
572  }
573  }
574  else
575  {
576  for( k = kstart; k != kend; k += kstep )
577  {
578  ktmp = N - k; kbb = MIN( ktmp, kb );
579 /*
580 * Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 )
581 */
582  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB,
583  ROW, &Bptr, DBUFB, &Bfr );
584 /*
585 * Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA+k:IA+N-1, JA:JA+K-1 )
586 */
587  Acurimb1 = PB_Cfirstnb( ktmp, IA+k, Aimb, Amb );
588  Acurrow = PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow );
589  PB_Cdescset( Ad0, ktmp, ABn, Acurimb1, Ainb1, Amb, Anb, Acurrow,
590  Acol, ctxt, Ald );
591  PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
592  ROW, &WB, WBd, &WBfr );
593 /*
594 * WAC := A( IA+k:IA+N-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )'
595 */
596  PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd,
597  &WACfr, &WACsum );
598  WACld = WACd[LLD_];
599  Amp = PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow );
600  if( ( Amp > 0 ) && ( Anq0 > 0 ) )
601  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
602  ALPHA, Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WB,
603  &WBd[LLD_], zero, WAC, &WACld );
604  if( WBfr ) free( WB );
605  if( Bfr ) free( Bptr );
606 /*
607 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
608 */
609  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA,
610  ROW, &Aptr, DBUFA, &Afr );
611 /*
612 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB+k:IB+N-1, JB:JB+K-1 )
613 */
614  Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
615  Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
616  PB_Cdescset( Bd0, ktmp, ABn, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
617  Bcol, ctxt, Bld );
618  PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
619  ROW, &WA, WAd, &WAfr );
620 /*
621 * WBC := B( IB+k:IB+N-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
622 */
623  if( WACreuse )
624  { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
625  else
626  {
627  PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd,
628  &WBCfr, &WBCsum );
629  }
630  WBCld = WBCd[LLD_];
631  Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
632  if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
633  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0,
634  talpha, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, WA,
635  &WAd[LLD_], tbeta, WBC, &WBCld );
636  if( WAfr ) free( WA );
637  if( Afr ) free( Aptr );
638 /*
639 * Combine the local copies of WAC when necessary
640 */
641  if( WACsum )
642  {
643  WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k + kbb - 1 ), Cinb,
644  Cnb, Ccsrc, Ccsrc, npcol );
645  if( Amp > 0 )
646  gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow,
647  WACd[CSRC_] );
648  }
649 /*
650 * Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 )
651 */
652  if( conjg )
653  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC,
654  0, 0, WACd );
655  else if( kbb > 1 )
656  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
657  0, 1, WACd );
658 /*
659 * Combine the local copies of WBC when necessary
660 */
661  if( !( WACreuse ) )
662  {
663  if( WBCsum )
664  {
665  if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; }
666  else
667  {
668  WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k+kbb-1 ),
669  Cinb, Cnb, Ccsrc, Ccsrc,
670  npcol );
671  }
672  if( Bmp > 0 )
673  gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
674  WBCd[CSRC_] );
675  }
676 /*
677 * Zero upper triangle of WBC( 0:kbb-1, 0:kbb-1 )
678 */
679  if( conjg )
680  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero,
681  WBC, 0, 0, WBCd );
682  else if( kbb > 1 )
683  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
684  WBC, 0, 1, WBCd );
685  }
686 /*
687 * Add WAC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 )
688 */
689  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN,
690  one, C, IC+k, JC+k, DESCC, COLUMN );
691  if( WACfr ) free( WAC );
692 /*
693 * Add WBC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 )
694 */
695  if( !( WACreuse ) )
696  {
697  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd,
698  COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN );
699  if( WBCfr ) free( WBC );
700  }
701  }
702  }
703  }
704  else
705  {
706  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
707  Cimb = DESCC[IMB_]; Cmb = DESCC[MB_]; Crsrc = DESCC[RSRC_];
708 /*
709 * Determine if one can reuse the WAC buffer for the intermediate local products
710 * sub( A )' * sub( B ) and sub( B )' * sub( A ).
711 */
712  AisR = ( ( Arow < 0 ) || ( nprow == 1 ) );
713  BisR = ( ( Brow < 0 ) || ( nprow == 1 ) );
714 /*
715 * When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if
716 * either sub( A ) spans more than one process row, or, neither sub( A ) nor
717 * sub( B ) span more than one process row, and both operands reside in the
718 * same process row.
719 */
720  if( !( AisR ) && !( BisR ) )
721  {
722  Aspan = PB_Cspan( ABm, 0, Aimb1, Amb, Arow, nprow );
723  Bspan = PB_Cspan( ABm, 0, Bimb1, Bmb, Brow, nprow );
724  WACreuse = ( Aspan ||
725  ( !( Aspan ) && !( Bspan ) && ( Arow == Brow ) ) );
726  }
727  else
728  {
729 /*
730 * Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are
731 * replicated.
732 */
733  WACreuse = ( AisR && BisR );
734  }
735 /*
736 * Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be
737 * either both not column-distributed, or, both column-distributed and aligned.
738 */
739  AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) );
740  BisD = ( ( Bcol >= 0 ) && ( npcol > 1 ) );
741  WACreuse = ( WACreuse &&
742  ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
743  ( ( Acol == Bcol ) &&
744  ( ( ( Ainb1 >= ABn ) && ( Binb1 >= ABn ) ) ||
745  ( ( Ainb1 == Binb1 ) && ( Anb == Bnb ) ) ) ) ) ) );
746  tbeta = ( WACreuse ? one : zero );
747 
748  if( upper )
749  {
750  for( k = kstart; k != kend; k += kstep )
751  {
752  ktmp = N - k; kbb = MIN( ktmp, kb );
753 /*
754 * Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 )
755 */
756  PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB,
757  COLUMN, &Bptr, DBUFB, &Bfr );
758 /*
759 * Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA+k:JA+N-1 )
760 */
761  Acurinb1 = PB_Cfirstnb( ktmp, JA+k, Ainb, Anb );
762  Acurcol = PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol );
763  PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow,
764  Acurcol, ctxt, Ald );
765  PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
766  DBUFB, COLUMN, &WB, WBd, &WBfr );
767 /*
768 * WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA+k:JA+N-1 )
769 */
770  PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
771  &WACsum );
772  WACld = WACd[LLD_];
773  Anq = PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol );
774  if( ( Anq > 0 ) && ( Amp0 > 0 ) )
775  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
776  talpha, WB, &WBd[LLD_], Mptr( Aptr0, 0, Anq0-Anq, Ald,
777  size ), &Ald, zero, WAC, &WACld );
778  if( WBfr ) free( WB );
779  if( Bfr ) free( Bptr );
780 /*
781 * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
782 */
783  PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA,
784  COLUMN, &Aptr, DBUFA, &Afr );
785 /*
786 * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB+k:JB+N-1 )
787 */
788  Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
789  Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
790  PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
791  Bcurcol, ctxt, Bld );
792  PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
793  DBUFA, COLUMN, &WA, WAd, &WAfr );
794 /*
795 * WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB+k:JB+N-1 )
796 */
797  if( WACreuse )
798  { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
799  else
800  {
801  PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd,
802  &WBCfr, &WBCsum );
803  }
804  WBCld = WBCd[LLD_];
805  Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
806  if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
807  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0,
808  ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, 0, Bnq0-Bnq, Bld,
809  size ), &Bld, tbeta, WBC, &WBCld );
810  if( WAfr ) free( WA );
811  if( Afr ) free( Aptr );
812 /*
813 * Combine the local copies of WAC when necessary
814 */
815  if( WACsum )
816  {
817  WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb,
818  Cmb, Crsrc, Crsrc, nprow );
819  if( Anq > 0 )
820  gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_],
821  mycol );
822  }
823 /*
824 * Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 )
825 */
826  if( conjg )
827  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC,
828  0, 0, WACd );
829  else if( kbb > 1 )
830  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
831  1, 0, WACd );
832 /*
833 * Combine the local copies of WBC when necessary
834 */
835  if( !( WACreuse ) )
836  {
837  if( WBCsum )
838  {
839  if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; }
840  else
841  {
842  WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ),
843  Cimb, Cmb, Crsrc, Crsrc,
844  nprow );
845  }
846  if( Bnq > 0 )
847  gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld,
848  WBCd[RSRC_], mycol );
849  }
850 /*
851 * Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 )
852 */
853  if( conjg )
854  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero,
855  WBC, 0, 0, WBCd );
856  else if( kbb > 1 )
857  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
858  WBC, 1, 0, WBCd );
859  }
860 /*
861 * Add WAC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 )
862 */
863  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW,
864  one, C, IC+k, JC+k, DESCC, ROW );
865  if( WACfr ) free( WAC );
866 /*
867 * Add WBC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 )
868 */
869  if( !( WACreuse ) )
870  {
871  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW,
872  one, C, IC+k, JC+k, DESCC, ROW );
873  if( WBCfr ) free( WBC );
874  }
875  }
876  }
877  else
878  {
879  for( k = kstart; k != kend; k += kstep )
880  {
881  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
882 /*
883 * Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 )
884 */
885  PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB,
886  COLUMN, &Bptr, DBUFB, &Bfr );
887 /*
888 * Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA:JA+k+kbb-1 )
889 */
890  PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
891  ctxt, Ald );
892  PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
893  DBUFB, COLUMN, &WB, WBd, &WBfr );
894 /*
895 * WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA:JA+k+kbb-1 )
896 */
897  PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
898  &WACsum );
899  WACld = WACd[LLD_];
900  Anq = PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol );
901  if( ( Anq > 0 ) && ( Amp0 > 0 ) )
902  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
903  talpha, WB, &WBd[LLD_], Aptr0, &Ald, zero, WAC, &WACld );
904  if( WBfr ) free( WB );
905  if( Bfr ) free( Bptr );
906 /*
907 * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
908 */
909  PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA,
910  COLUMN, &Aptr, DBUFA, &Afr );
911 /*
912 * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB:JB+k+kbb-1 )
913 */
914  PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
915  ctxt, Bld );
916  PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
917  DBUFA, COLUMN, &WA, WAd, &WAfr );
918 /*
919 * WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB:JB+k+kbb-1 )
920 */
921  if( WACreuse )
922  { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
923  else
924  {
925  PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd,
926  &WBCfr, &WBCsum );
927  }
928  WBCld = WBCd[LLD_];
929  Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
930  if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
931  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0,
932  ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, tbeta, WBC, &WBCld );
933  if( WAfr ) free( WA );
934  if( Afr ) free( Aptr );
935 /*
936 * Combine the local copies of WAC when necessary
937 */
938  if( WACsum )
939  {
940  WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb,
941  Cmb, Crsrc, Crsrc, nprow );
942  if( Anq > 0 )
943  gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_],
944  mycol );
945  }
946 /*
947 * Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 )
948 */
949  if( conjg )
950  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC,
951  0, k, WACd );
952  else if( kbb > 1 )
953  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
954  0, k+1, WACd );
955 /*
956 * Combine the local copies of WBC when necessary
957 */
958  if( !( WACreuse ) )
959  {
960  if( WBCsum )
961  {
962  if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; }
963  else
964  {
965  WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ),
966  Cimb, Cmb, Crsrc, Crsrc,
967  nprow );
968  }
969  if( Bnq > 0 )
970  gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld,
971  WBCd[RSRC_], mycol );
972  }
973 /*
974 * Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 )
975 */
976  if( conjg )
977  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero,
978  WBC, 0, k, WBCd );
979  else if( kbb > 1 )
980  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
981  WBC, 0, k+1, WBCd );
982  }
983 /*
984 * Add WAC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 )
985 */
986  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW,
987  one, C, IC+k, JC, DESCC, ROW );
988  if( WACfr ) free( WAC );
989 /*
990 * Add WBC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 )
991 */
992  if( !( WACreuse ) )
993  {
994  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW,
995  one, C, IC+k, JC, DESCC, ROW );
996  if( WBCfr ) free( WBC );
997  }
998  }
999  }
1000  }
1001  if( conjg ) free( talpha );
1002 /*
1003 * End of PB_Cpsyr2kAC
1004 */
1005 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_CScatterV
void PB_CScatterV()
PB_Cconjg
void PB_Cconjg()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
MDescCopy
#define MDescCopy(DescIn, DescOut)
Definition: PBtools.h:514
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
pilaenv_
int pilaenv_()
PB_Cplascal
void PB_Cplascal()
PB_CGatherV
void PB_CGatherV()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_Cplapad
void PB_Cplapad()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
COMBINE
#define COMBINE
Definition: PBblacs.h:49
CONJG
#define CONJG
Definition: PBblas.h:47
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
PB_Cpsyr2kAC
void PB_Cpsyr2kAC(PBTYP_T *TYPE, char *DIRECAB, char *CONJUG, char *UPLO, char *TRANS, int N, int K, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_Cpsyr2kAC.c:26
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
REUSE
#define REUSE
Definition: PBblas.h:67
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_Cspan
int PB_Cspan()
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CBACKWARD
#define CBACKWARD
Definition: PBblas.h:39
CTXT_
#define CTXT_
Definition: PBtools.h:38